
Merge pull request #59 from Anish-Saxena/hpc-multi-gpu

[HPC] Multi-GPU programming Models
The base version works. Cosmetic changes suggested to improve readability will be handled as a separate issue by other developers.
Bharatkumar Sharma 2 years ago
parent
commit
8d77183e2a
100 changed files with 35200 additions and 0 deletions
  1. 7 0
      .gitignore
  2. 3 0
      .gitmodules
  3. 122 0
      hpc/multi_gpu_nways/README.md
  4. 64 0
      hpc/multi_gpu_nways/Singularity
  5. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/cuda_streams_overview.png
  6. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png
  7. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/domain_decomposition.png
  8. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/gpu_programming_process.png
  9. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/gpudirect_p2p.png
  10. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/gpudirect_rdma.png
  11. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/halo_exchange.png
  12. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/intra_node_topology_map.png
  13. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_p2p_report.png
  14. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_report_events.png
  15. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_report_overview.png
  16. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_streams_events_p2p_report.png
  17. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/jupyter_lab_navigation.png
  18. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_gpu_util.png
  19. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_host_staging.png
  20. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_p2p_overview.png
  21. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_serialized.png
  22. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_util_selection.png
  23. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpyasync_parallel.png
  24. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_container_setup.png
  25. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_gdr_latency.png
  26. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_halo_exchange_latency.png
  27. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_p2p_metrics.png
  28. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_host_staging_throughput_latency.png
  29. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_host_staging_time.png
  30. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_halo_exchange_latency.png
  31. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_large_time.png
  32. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_nvtx_stats.png
  33. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_overview.png
  34. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_overview.png
  35. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nccl_architecture.png
  36. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nccl_dgx1_topology.png
  37. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nccl_profiler_output.png
  38. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_cli_sample_output.png
  39. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_overview.png
  40. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_single_gpu_analysis.png
  41. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvidia_smi_p2p_gpu0.png
  42. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvidia_smi_topo_output.png
  43. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_left_shift_output.png
  44. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_memory_model.png
  45. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_mpi_comparison.png
  46. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_profiler_report.png
  47. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_thread_level_comm.png
  48. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/open_terminal_session.png
  49. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png
  50. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/streams_util_selection.png
  51. 272 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb
  52. 445 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/cuda/memcpy.ipynb
  53. 401 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/cuda/streams.ipynb
  54. 0 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/.gitkeep
  55. 247 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/containers_and_mpi.ipynb
  56. 382 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/cuda_aware.ipynb
  57. 430 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/memcpy.ipynb
  58. 262 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/multi_node_intro.ipynb
  59. 0 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/.gitkeep
  60. 342 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/nccl.ipynb
  61. 0 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nvhsmem/.gitkeep
  62. 461 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nvshmem/nvshmem.ipynb
  63. 271 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb
  64. 23 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/Makefile
  65. 456 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_memcpy.cu
  66. 451 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams.cu
  67. 470 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams_events.cu
  68. 463 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_memcpy.cu
  69. 455 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_streams.cu
  70. 455 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_streams_events.cu
  71. 0 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/.gitkeep
  72. 30 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/Makefile
  73. 24 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/Makefile
  74. 358 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/jacobi_cuda_aware_mpi.cpp
  75. 97 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/jacobi_kernels.cu
  76. 27 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/hello_world.c
  77. 361 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_cuda_aware_mpi.cpp
  78. 97 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_kernels.cu
  79. 375 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_memcpy_mpi.cpp
  80. 358 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/solutions/jacobi_cuda_aware_mpi.cpp
  81. 378 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/solutions/jacobi_memcpy_mpi.cpp
  82. 0 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/.gitkeep
  83. 24 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/Makefile
  84. 488 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/jacobi.cpp
  85. 98 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/jacobi_kernels.cu
  86. 406 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/jacobi_nccl.cpp
  87. 407 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/solution/jacobi_nccl.cpp
  88. 0 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/.gitkeep
  89. 29 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/Makefile
  90. 567 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/jacobi_nvshmem.cu
  91. 55 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/left_shift.cu
  92. 555 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/solution/jacobi_nvshmem.cu
  93. 22 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut.h
  94. 115 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_ext.h
  95. 547 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_std.h
  96. 14457 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glew.h
  97. 7125 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glext.h
  98. 597 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glut.h
  99. 1121 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glxew.h
  100. 0 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glxext.h

+ 7 - 0
.gitignore

@@ -2,5 +2,12 @@
 */.ipynb_checkpoints/*
 alk.traj.dcd
 *.simg
+*.so*
+*.a
+*.la
+mgpm
+*.o
+*.out
 */.ses/*
 */.log/*
+

+ 3 - 0
.gitmodules

@@ -0,0 +1,3 @@
+[submodule "hpc/multi_gpu_nways/labs/CFD/English/C/source_code/multi-gpu-programming-models"]
+	path = hpc/multi_gpu_nways/labs/CFD/English/C/source_code/multi-gpu-programming-models
+	url = https://github.com/Anish-Saxena/multi-gpu-programming-models.git

Diff not shown because the file size is too large.
+ 122 - 0
hpc/multi_gpu_nways/README.md


+ 64 - 0
hpc/multi_gpu_nways/Singularity

@@ -0,0 +1,64 @@
+# Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
+
+Bootstrap: docker
+FROM: nvcr.io/nvidia/nvhpc:21.5-devel-cuda_multi-ubuntu20.04
+
+%environment
+    export XDG_RUNTIME_DIR=
+    export PATH="/opt/openmpi/ompi/bin/:/usr/local/bin:/opt/anaconda3/bin:/usr/bin:/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:$PATH"
+    export LD_LIBRARY_PATH="/opt/openmpi/ompi/lib:/pmi_utils/lib/:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/lib64/:$LD_LIBRARY_PATH"
+
+%post
+    build_tmp=$(mktemp -d) && cd ${build_tmp}
+
+    apt-get -y update
+    apt-get -y dist-upgrade 
+    DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends \
+	    m4 vim-nox emacs-nox nano zip\
+ 	    python3-pip python3-setuptools git-core inotify-tools \
+	    curl git-lfs \
+	    build-essential libtbb-dev
+    rm -rf /var/lib/apt/cache/* 
+
+    pip3 install --upgrade pip
+    pip3 install --no-cache-dir jupyter
+    pip3 install --no-cache-dir jupyterlab
+    pip3 install gdown
+
+    apt-get install --no-install-recommends -y build-essential 
+
+# NVIDIA nsight-systems-2020.5.1 ,nsight-compute-2
+    apt-get update -y   
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg wget
+    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80
+    echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/nsight.list 
+    apt-get update -y 
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-2020.5.1 nsight-compute-2020.2.1 
+    apt-get install --no-install-recommends -y build-essential
+
+    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 
+    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3 
+    rm Miniconda3-latest-Linux-x86_64.sh 
+
+# Install CUDA-aware OpenMPI with UCX and PMI
+    mkdir -p /opt/openmpi && cd /opt/openmpi
+    wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz
+    tar -xvzf openmpi-4.1.1.tar.gz
+    mkdir -p /opt/openmpi/ompi/
+    cd /opt/openmpi/openmpi-4.1.1/
+    ./configure --prefix=/opt/openmpi/ompi/ --with-libevent=internal --with-xpmem --with-cuda=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/ --with-slurm --with-pmix=internal --with-pmi=/pmi_utils/ --enable-mpi1-compatibility --with-verbs --with-hcoll=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/hcoll/ --with-ucx=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/comm_libs/hpcx/hpcx-2.8.1/ucx/
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/pmi_utils/lib/"
+    make all install
+    
+    cd /
+    rm -rf ${build_tmp}
+
+%files
+    labs/ /labs
+    slurm_pmi_config/ /pmi_utils
+
+%runscript
+    "$@"
+
+%labels
+    AUTHOR Anish-Saxena

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/cuda_streams_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/domain_decomposition.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/gpu_programming_process.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/gpudirect_p2p.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/gpudirect_rdma.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/halo_exchange.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/intra_node_topology_map.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_p2p_report.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_report_events.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_report_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jacobi_memcpy_streams_events_p2p_report.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/jupyter_lab_navigation.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_gpu_util.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_host_staging.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_p2p_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_serialized.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_util_selection.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpyasync_parallel.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_container_setup.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_gdr_latency.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_halo_exchange_latency.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_cuda_aware_p2p_metrics.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_host_staging_throughput_latency.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_host_staging_time.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_halo_exchange_latency.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_large_time.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_nvtx_stats.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_memcpy_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/mpi_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nccl_architecture.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nccl_dgx1_topology.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nccl_profiler_output.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_cli_sample_output.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_single_gpu_analysis.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvidia_smi_p2p_gpu0.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvidia_smi_topo_output.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_left_shift_output.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_memory_model.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_mpi_comparison.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_profiler_report.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvshmem_thread_level_comm.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/open_terminal_session.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/streams_util_selection.png


+ 272 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/advanced_concepts/single_node_topology.ipynb

@@ -0,0 +1,272 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "251d3000",
+   "metadata": {},
+   "source": [
+    "Before we begin, let's get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6fa8e78",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "790904cd",
+   "metadata": {},
+   "source": [
+    "# Learning Objectives\n",
+    "\n",
+    "In this lab, we will learn about:\n",
+    "\n",
+    "* Understanding intra-node GPU topology and interconnections like PCIe and NVLink\n",
+    "* Architecture overview of NVIDIA DGX 1 Tesla V100 system\n",
+    "* Comparison of communication links and their impact on application performance\n",
+    "* p2pBandwidthLatencyTest micro-benchmark for P2P performance analysis.\n",
+    "\n",
+    "# Intra-Node Communication Topology\n",
+    "\n",
+    "Let's dive deeper into how the underlying communication architecture of our system affects program performance. Run the command below to display your node's GPU and NIC communication topology:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bf585d6",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!nvidia-smi topo -m"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da57e0dd",
+   "metadata": {},
+   "source": [
+    "If the output is unclear, you can launch a Terminal session by clicking on `File` $\\rightarrow$ Open and following the steps as shown:\n",
+    "\n",
+    "![open_terminal_session](../../images/open_terminal_session.png)\n",
+    "\n",
+    "On our DGX-1 system, the output is as follows:\n",
+    "\n",
+    "![nvidia_smi_topo_output](../../images/nvidia_smi_topo_output.png)\n",
+    "\n",
+    "Focus one a particular row, say GPU 0. The output states that GPUs 1 through 4 are connected to it via NVLink (in addition to PCIe) and GPUs 5 through 7 are connected to it via PCIe as well as an \"SMP\" interconnect. We have a dual-socket system and the CPUs in these sockets are connected by an interconnect known as SMP interconnect.\n",
+    "\n",
+    "Thus, GPU 0 to GPU 5 communication happens via not just PCIe, but also over the inter-socket interconnect within the same node. Clearly, this is a longer path than say the one between GPU 0 and GPU 1, which are connected via NVLink directly. We will discuss the NIC to GPU connection in the inter-node section of this bootcamp.\n",
+    "\n",
+    "Even within the GPUs connected via NVLink, we see different annotations such as `NV1` and `NV2` that affect the communication bandwidth and hence the performance. In this section, we will explore the nuances associated with a diverse intra-node GPU communication topology like in the output above. Specifically, in our system, the communication topology is as follows:\n",
+    "\n",
+    "![dgx1_8x_tesla_v100_topo](../../images/dgx1_8x_tesla_v100_topo.png)\n",
+    "\n",
+    "Qualitatively, the bandwidth and latency vary with the topology as follows:\n",
+    "\n",
+    "![intra_node_topology_map](../../images/intra_node_topology_map.png)\n",
+    "\n",
+    "Host staging implies traversing through the CPU and the travel path taken is one of PHB, NODE, and SYS. In contrast, if the path taken is either NV1, NV2, or PIX, then P2P is available. PXB implies that the GPUs belong to different PCIe hubs and P2P is usually not supported in this case.\n",
+    "\n",
+    "A double NVLink connection provides twice the bandwidth compared to a single NVLink. \n",
+    "\n",
+    "For a pair of 2 GPUs, the peak bidirectional bandwidth are as follows:\n",
+    "* PCIe: Using PIX topology, 15.75GB/s for PCIe Gen 3.0 and 31.5GB/s for PCIe Gen 4.0.\n",
+    "* NVLink: Using NV# topology, 50GB/s per connection. So a double NVLink connection has 100GB/s peak bidirectional bandwidth.\n",
+    "\n",
+    "Let us understand what difference the underlying communication topology can make to the application performance in the following sub-section.\n",
+    "\n",
+    "**Note:** If your command output doesn't show any NVLink connection or if there's no difference in connection type (PIX, PXB, PHB, NODE, SYS, NV#) between any 2 pair of GPUs, then the communication bandwidth and latency will likely be the same between any pair and the following sub-sections will not display any performance difference.\n",
+    "\n",
+    "## Performance variation due to system topology\n",
+    "\n",
+    "So far, the code runs the multi-GPU version on all available GPUs in a node (8 in our case). We can supply the `-gpus` runtime flag to the binary to run our code on specific GPUs. If we want to run on only 2 GPUs, namely GPU 0 and GPU 3, we use the `-gpus 0,3` argument. \n",
+    "\n",
+    "Try to find the GPU pair with highest bandwidth available as per the table above and replace `0,3` with those GPUs, and then run the command below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93961dbc",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && ./jacobi_memcpy -p2p -gpus 0,3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f0f5d159",
+   "metadata": {},
+   "source": [
+    "The efficiency would likely be higher than before due to less inter-GPU communication (each GPU does more wok instead). Our output is as follows:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 2. Using GPU ID: 0, 3, \n",
+    "16384x16384: 1 GPU:   4.4513 s, 2 GPUs:   2.2664 s, speedup:     1.96, efficiency:    98.20  \n",
+    "```\n",
+    "\n",
+    "Now, run the binary a pair of GPUs that have the lowest available bandwidth. In our case, we use GPU 0 and GPU 7. Our output is:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 2. Using GPU ID: 0, 7, \n",
+    "16384x16384: 1 GPU:   4.4529 s, 2 GPUs:   2.3454 s, speedup:     1.90, efficiency:    94.93  \n",
+    "```\n",
+    "\n",
+    "Now remove the `-p2p` flag and run the command again for GPUs 0 and 7. We didn't get any difference in performance. As you may recall, P2P is not possible between GPUs 0 and 7, so the underlying communication path doesn't change, resulting in same performance with and without the `-p2p` flag. \n",
+    "\n",
+    "The same can be confirmed by profiling the application and looking at the operations performed in the Nsight Systems timeline. \n",
+    "\n",
+    "![p2p_2_gpu_memcpy_nsys](../../images/p2p_2_gpu_memcpy_nsys.png)\n",
+    "\n",
+    "Try a few other GPU combinations and toggle P2P so see if the performance variation correlates with the table above. Also try reducing the grid size using `-nx` and `-ny` flags (to say 8192$\\times$8192) and see the effect on efficiency. \n",
+    "\n",
+    "## Benchmarking the system topology\n",
+    "\n",
+    "Our application is not very memory intensive. As is visible from the profiler output, $\\gt95\\%$ of the time in GPU is spent on computation. Therefore, to get a quantitative measure of latency and bandwidth impact due to topology, we run a micro-benchmark.\n",
+    "\n",
+    "### p2pBandwidthLatencyTest micro-benchmark\n",
+    "\n",
+    "p2pBandwidthLatencyTest is a part of [CUDA Samples GitHub repository](https://github.com/NVIDIA/cuda-samples) available to help CUDA developers. \n",
+    "\n",
+    "As the name suggests, this test measures the bandwidth and latency impact of P2P and underlying communication topology. Let's compile the benchmark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "212a8dfc",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/p2pBandwidthLatencyTest/ && make clean && make"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "83369c1b",
+   "metadata": {},
+   "source": [
+    "Now, let's run the benchmark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59eeb793",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/p2pBandwidthLatencyTest/ && ./p2pBandwidthLatencyTest"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b584f5ef",
+   "metadata": {},
+   "source": [
+    "The first part of the benchmark gives device information and P2P access available from each GPU (similar to `nvidia-smi topo -m` command). Next, the benchmark measures the unidirectional and bidirectional bandwidth and latency with P2P disabled and enabled.\n",
+    "\n",
+    "We share partial results obtained in our DGX-1 system:\n",
+    "\n",
+    "```bash\n",
+    "Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n",
+    "   D\\D     0      1      2      3      4      5      6      7 \n",
+    "     0 783.95   9.56  14.43  14.46  14.47  14.24  14.51  14.43 \n",
+    "\n",
+    "Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n",
+    "   D\\D     0      1      2      3      4      5      6      7 \n",
+    "     0 784.87  48.49  48.49  96.85  96.90  14.25  14.54  14.49 \n",
+    "     \n",
+    "P2P=Disabled Latency Matrix (us)\n",
+    "   GPU     0      1      2      3      4      5      6      7 \n",
+    "     0   1.78  17.52  16.41  16.43  17.35  16.88  17.34  16.85 \n",
+    "     \n",
+    "P2P=Enabled Latency (P2P Writes) Matrix (us)\n",
+    "   GPU     0      1      2      3      4      5      6      7 \n",
+    "     0   1.76   1.62   1.61   2.01   2.02  18.44  19.15  19.34\n",
+    "```\n",
+    "\n",
+    "Our system is based on PCIe gen 3.0 with a peak maximum GPU-GPU PCIe banwidth of 15.75 GB/s. Let us analyze and understand these results:\n",
+    "\n",
+    "* GPU 0 and GPU 1/2: Connected by a single NVLink connection. By enabling P2P-\n",
+    "  - Bandwidth reaches close to the maximum peak of 50 GB/s.\n",
+    "  - Latency decreases by an order of magnitude.\n",
+    "* GPU 0 and GPU 3/4: Connected by a double NVLink connection. By enabling P2P-\n",
+    "  - Bandwidth reaches close to the maximum peak of 100 GB/s.\n",
+    "  - Latency decreases by an order of magnitude.\n",
+    "* GPU 0 and GPU 5/6/7: Connected by PCIe and SMP interconnect. By enabling P2P- \n",
+    "  - Bandwidth is unchanged.\n",
+    "  - Latency increases a marginally.\n",
+    "  \n",
+    "Correlate these results with the communication topology that can be displayed by usng `nvidia-smi topo -m` command and the qualtitative table in the previous section. They should be consistent with one another.\n",
+    "\n",
+    "In general, we should try to set the GPUs in an application such that a GPU can share data with its neighbours using a high-bandwidth, low-latency communication topology. Enabling P2P, when possible, usually improves the performance by eliminating host staging.\n",
+    "\n",
+    "We now have an in-depth understanding of intra-node topology and its effects on performance. Let us now analyze our P2P-enabled application again to uncover opportunities to extract more performance.\n",
+    "\n",
+    "Click on the link below to access the next lab where we discuss the need for CUDA streams and then implement them in our application.\n",
+    "\n",
+    "# [Next: CUDA Streams](../cuda/streams.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Documentation: NVIDIA DGX 1 Tesla V100 Whitepaper](https://images.nvidia.com/content/pdf/dgx1-v100-system-architecture-whitepaper.pdf)\n",
+    "* [Concepts: NVLink](https://www.nvidia.com/en-in/data-center/nvlink/)\n",
+    "* [Research: Effect of topology-awareness on communication](https://ieeexplore.ieee.org/abstract/document/7529932)\n",
+    "* [Code: p2pBandwidthLatencyTest](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/p2pBandwidthLatencyTest)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 445 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/cuda/memcpy.ipynb

@@ -0,0 +1,445 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "dd0ae66a",
+   "metadata": {},
+   "source": [
+    "Before we begin, let's get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7d483e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e4ddba18",
+   "metadata": {},
+   "source": [
+    "# Learning Objectives\n",
+    "\n",
+    "In this tutorial, the goal is to:\n",
+    "* Parallelize the single-GPU code using CUDA Memcpy and streams\n",
+    "* Understand intra-node topology and underlying technologies like GPUDirect P2P and their implication on program performance\n",
+    "\n",
+    "# Multi-GPU Programming\n",
+    "\n",
+    "In this section we first cover the principle behind decomposing data among the GPUs, known as domain decomposition. Then, we understand and implement the baseline multi-GPU code using `cudaSetDevice` and `cudaMemcpy` functions. \n",
+    "\n",
+    "### Domain Decomposition\n",
+    "\n",
+    "Before we begin, we define two important terms:\n",
+    "\n",
+    "* **Latency:** The amount of time it takes to take a unit of data from point A to point B. For example, if 4B of data can be transferred from point A to B in 4 $\\mu$s, that is the latency of transfer.\n",
+    "* **Bandwidth:** The amount of data that can be transferred from point A to point B in a unit of time. For example, if the width of the bus is 64KiB and latency of transfer between point A and B is 4 $\\mu$s, the bandwidth is 64KiB * (1/4$\\mu$s) = 1.6 GiB/s.\n",
+    "\n",
+    "To parallelize our application to multi-GPUs, we first review the different methods of domain decomposition available to us for splitting the data among the GPUs, thereby distributing the work. Broadly, we can divide data into either stripes or tiles.\n",
+    "\n",
+    "* **Stripes**: They minimize the number of neighbours, require communication among less neighbours, and are optimal for latency bound communication.\n",
+    "\n",
+    "* **Tiles**: They minimize surface area/ volume ratio of the grid, require communicating less data, and are optimal for bandwidth bound communication.\n",
+    "\n",
+    "![domain_decomposition](../../images/domain_decomposition.png)\n",
+    "\n",
+    "When we divide the global grid between GPUs, only the boundaries of each GPU-local grid need to be communicated with the neighboring GPUs, as they need the updated grid-point values for the next iteration. Therefore, we use horizontal stripes (as C/ C++ are row-major) in our tutorials for domain decomposition, enabling data parallelism.\n",
+    "\n",
+    "### Halo Exchange\n",
+    "\n",
+    "We term the exchange of top and bottom rows after each iterations the \"halo exchange\". Review the image below and notice that we update the topmost and bottomost rows of the grid to implement the periodic boundary condition. Recall that the left and right columns of the grid constitute Dirichlet boundary conditions (that is, constant value).\n",
+    "\n",
+    "![halo_exchange](../../images/halo_exchange.png)\n",
+    "\n",
+    "## CUDA concepts: Part 1\n",
+    "\n",
+    "### Setting the GPU\n",
+    "\n",
+    "To verify that our system has multiple GPUs in each node, run the command below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c49697bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "62d045bd",
+   "metadata": {},
+   "source": [
+    "The command should output more than one GPU. Inside a program, the number of GPU in the node can be obtained using the `cudaGetDeviceCount(int *count)` function and to perform any task, like running a CUDA kernel, copy operation, etc. on a particular GPU, we use the `cudaSetDevice(int device)` function.\n",
+    "\n",
+    "### Copying between GPUs\n",
+    "\n",
+    "The `cudaMemcpy` function supports GPU to GPU copy using the `cudaMemcpyDeviceToDevice` flag and the source and destination memory addresses should reside in GPU devices. \n",
+    "\n",
+    "For example, if we want to copy 1000 floats from the array `arr_gpu_0` allocated on GPU 0 to the array `arr_gpu_1`, the function call is:\n",
+    "\n",
+    "```c\n",
+    "cudaMemcpy(arr_gpu_1, arr_gpu_0, 1000 * sizeof(float), cudaMemcpyDeviceToDevice);\n",
+    "```\n",
+    "\n",
+    "Recall that CUDA kernel calls made from the host are non-blocking (asynchronous) by default. That is, the control may return back to the host thread before the device kernel finishes execution. To perform the halo exchange, we need to perform copy operations between each GPU and its neighbours. However, for large copy sizes, `cudaMemcpy` is blocking with respect to the host. \n",
+    "\n",
+    "Thus, we cannot use the following code snippet:\n",
+    "\n",
+    "```c\n",
+    "for (int i = 0; i < 2; i++) {\n",
+    "    // Set current device\n",
+    "    cudaSetDevice(i);\n",
+    "    // Define row number of top and bottom neighbours, etc.\n",
+    "    TopNeighbour = ...; BotNeighbour = ...; // and so-on\n",
+    "    // Launch device kernel on GPU i\n",
+    "    jacobi_kernel<<<dim_grid, dim_block>>>(...);\n",
+    "    // Halo exchange\n",
+    "    cudaMemcpy(grid_rows[TopNeighbour], grid_rows[myTop], size, cudaMemcpyDeviceToDevice);\n",
+    "    cudaMemcpy(grid_rows[BotNeighbour], grid_rows[myBot], size, cudaMemcpyDeviceToDevice);\n",
+    "    // Norm check, swapping current and previous grid arrays, etc.\n",
+    "} // Serializes operations with respect to the host\n",
+    "```\n",
+    "\n",
+    "As this code results in serialized execution:\n",
+    "\n",
+    "![memcpy_serialized](../../images/memcpy_serialized.png)\n",
+    "\n",
+    "### Asynchronous operations\n",
+    "\n",
+    "Instead of `cudaMemcpy`, we can use the `cudaMemcpyAsync` function which is asynchronous with respect to the host. This allows the host to launch device kernels and copy operations concurrently, enabling parallel execution across GPUs. \n",
+    "\n",
+    "The correct code snippet is as follows:\n",
+    "\n",
+    "```c\n",
+    "for (int i = 0; i < 2; i++) {\n",
+    "    // Set current device\n",
+    "    cudaSetDevice(i);\n",
+    "    // Launch device kernel on GPU i\n",
+    "    jacobi_kernel<<<dim_grid, dim_block>>>(...);\n",
+    "}\n",
+    "for (int i = 0; i < 2; i++) {\n",
+    "    // Define row number of top and bottom neighbours, etc.\n",
+    "    TopNeighbour = ...; BotNeighbour = ...; // and so-on\n",
+    "    // Halo exchange, notice the use of Async function\n",
+    "    cudaMemcpyAsync(grid_rows[TopNeighbour], grid_rows[myTop], size, cudaMemcpyDeviceToDevice);\n",
+    "    cudaMemcpyAsync(grid_rows[BotNeighbour], grid_rows[myBot], size, cudaMemcpyDeviceToDevice);\n",
+    "    // Norm check, swapping current and previous grid arrays, etc.\n",
+    "} // Parallel execution across multiple GPUs\n",
+    "```\n",
+    "\n",
+    "And the execution time of the application is reduced:\n",
+    "\n",
+    "![memcpyasync_parallel](../../images/memcpyasync_parallel.png)\n",
+    "\n",
+    "## Implementation exercise: Part 1\n",
+    "\n",
+    "Now, let's parallelize our code across multiple GPUs by using `cudaSetDevice` and `cudaMemcpyAsync` operations. Open the [jacobi_memcpy.cu](../../source_code/cuda/jacobi_memcpy.cu) file.\n",
+    "\n",
+    "Alternatively, you can navigate to `CFD/English/C/source_code/cuda/` directory in Jupyter's file browser in the left pane. Then, click to open the `jacobi_memcpy.cu` file. \n",
+    "\n",
+    "Understand the flow of the program from within the `main` function. Review the following pre-Jacobi-computation steps:\n",
+    "\n",
+    "1. Computation of the memory chunk size to be allocated on each GPU stored in the `chunk_size` integer array.\n",
+    "2. Allocation of memory on each GPU: Notice the use of array pointers like `a_new`, `l2_norm_d`, `iy_start`, etc. that point to device arrays allocated on GPU pointed to by `dev_id` variable.\n",
+    "3. Initialization of Dirichlet boundary conditions on left and right boundaries.\n",
+    "4. Share of initial top and bottom local grid-point values between neighbours.\n",
+    "\n",
+    "\n",
+    "Now, within the iterative Jacobi loop (the `while` loop), implement the following marked as `TODO: Part 1-`:\n",
+    "\n",
+    "1. Set current GPU and call device kernel with correct device arrays in function arguments.\n",
+    "2. Asynchronously copy GPU-local L2 norm back to CPU and implement top and bottom halo exchanges.\n",
+    "3. Synchronize the devices at the end of each iteration using `cudaDeviceSynchronize` function.\n",
+    "\n",
+    "Review the topic above on Asynchronous Operations if in doubt. Recall the utility of using separate `for` loops for launching device kernels and initiating copy operations.\n",
+    "\n",
+    "After implementing these, let's compile the code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce6dc6ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && make clean && make jacobi_memcpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "698ab130",
+   "metadata": {},
+   "source": [
+    "Ensure there are no compiler warnings or errors. Validate the implementation by running the binary:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50debc4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && ./jacobi_memcpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e276f70",
+   "metadata": {},
+   "source": [
+    "The last couple of lines of the output will give the number and IDs of GPUs used, execution timings, speedup, and efficiency metrics. Review Metrics of Interest section in [single GPU overview](../single_gpu/single_gpu_overview.ipynb) tutorial for more information). We tested the code on a DGX-1 system with 8 Tesla V100 16GB GPUs, and we got the following output:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8. Using GPU ID: 0, 1, 2, 3, 4, 5, 6, 7, \n",
+    "16384x16384: 1 GPU:   4.4485 s, 8 GPUs:   1.0951 s, speedup:     4.06, efficiency:    50.78 \n",
+    "```\n",
+    "\n",
+    "Notice that we got a speed-up of $4.06\\times$ using 8 GPUs and a corresponding efficiency of $50.78\\%$. The numbers will vary depending on number of available GPUs in your system, the communication topology, GPU type, etc.\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "Now, profile the execution with `nsys`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3187cdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_memcpy_report --force-overwrite true ./jacobi_memcpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c4ac727d",
+   "metadata": {},
+   "source": [
+    "In the profiler timeline, the first few seconds denote the single-GPU code running on one of the GPUs. This version is executed so we can compare the multi-GPU version with it and we have already analyzed it. Let's analyze the multi-GPU timeline.\n",
+    "\n",
+    "![jacobi_memcpy_report_overview](../../images/jacobi_memcpy_report_overview.png)\n",
+    "\n",
+    "The next iteration of the device kernel is not run till all inter-GPU copy operations are complete because we need to synchronize all GPUs at the end of each iteration. The total time taken by the Jacobi Solver loop (`jacobi_solve` NVTX annotatation) is visible and is 1.278 seconds. Also, notice the we have labelled halo exchanges as Device-to-Host (DtoH) and Host-to-Device) copies. Now, right click on `CUDA HW` tab and select `Show in Events View` option. \n",
+    "\n",
+    "![jacobi_memcpy_report_events](../../images/jacobi_memcpy_report_events.png)\n",
+    "\n",
+    "The \"Source Memory Kind\" and \"Destination Memory Kind\" of the selected DtoH operation are both \"Device\". However the copy operation is marked as \"Memcpy DtoH\". By default, the device-to-device copy operation uses a temporary CPU buffer internally. Let us understand more about this CPU buffer and how we can eliminate it to improve performance.\n",
+    "\n",
+    "## CUDA concepts: Part 2\n",
+    "\n",
+    "### Host Staging of Copy Operations\n",
+    "\n",
+    "Using `cudaMemcpyAsync` instead of `cudaMemcpy` allows us to issue copy and compute operations on multiple GPUs concurrently. The path taken by the data in both the cases is denoted by the red arrow as follows:\n",
+    "\n",
+    "![memcpy_host_staging](../../images/memcpy_host_staging.png)\n",
+    "\n",
+    "That is, in the GPU-to-GPU memory copy, the data traverses from GPU 0 the PCIe bus to the CPU, where it is staged in a buffer before being copied to GPU 1. This is called \"host staging\" and it decreases the bandwidth while increasing the latency of the operation. If we eliminate host staging, we can usually improve the performance of our application.\n",
+    "\n",
+    "### Peer-to-Peer Memory Access\n",
+    "\n",
+    "P2P allows devices to address each other's memory from within device kernels and eliminates host staging by transferring data either through the PCIe switch or through NVLink as denoted by the red arrow below. \n",
+    "\n",
+    "![memcpy_p2p_overview](../../images/memcpy_p2p_overview.png)\n",
+    "\n",
+    "Peer-to-Peer (P2P) memory access requires GPUs to share a Unified Virtual Address Space (UVA). UVA means that a single address space is used for the host and all modern NVIDIA GPU devices (specifically, those with compute capibility of 2.0 or higher).\n",
+    "\n",
+    "This P2P memory access feature is supported between two devices if `cudaDeviceCanAccessPeer()` returns true for these two devices. P2P must be enabled between two devices by calling `cudaDeviceEnablePeerAccess()` as illustrated in the following code sample:\n",
+    "\n",
+    "```c\n",
+    "cudaSetDevice(currDevice);\n",
+    "int canAccessPeer = 0;\n",
+    "cudaDeviceCanAccessPeer(&canAccessPeer, currDevice, PeerDevice);\n",
+    "if (canAccessPeer) {\n",
+    "    cudaDeviceEnablePeerAccess(PeerDevice, 0);\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "Note that this enables a unidirectional P2P access where `currDevice` can perform memory access to `PeerDevice`. If we want `PeerDevice` to be able to access `currDevice` via P2P, then we need to use the code accordingly.\n",
+    "\n",
+    "First, let's check if P2P is supported between the GPUs:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f757d16c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi topo -p2p r"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7afbc209",
+   "metadata": {},
+   "source": [
+    "The `topo` sub-command requests information on the GPU communication topology, `-p2p` flag requests P2P status, and `r` asks whether P2P reads are supported. Change `r` to `w` to check whether writes are supported. We share our output on a DGX-1 system with 8 Tesla V100s, focusing on the capabilities of GPU 0:\n",
+    "\n",
+    "![nvidia_smi_p2p_gpu0](../../images/nvidia_smi_p2p_gpu0.png)\n",
+    "\n",
+    "This means GPU 0 can communicate via P2P with GPUs 1 through 4. For GPUs 5 through 7, it must use host staging.\n",
+    "\n",
+    "To check whether P2P via NVLink is supported, run the command below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1250c02c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi topo -p2p n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9d84934b",
+   "metadata": {},
+   "source": [
+    "In our DGX-1 system, the result is similar as before. Even if P2P via NVLink is not supported on your system, as long as `-p2p r` and `-p2p w` are supported between GPUs, P2P capability is available.\n",
+    "\n",
+    "## Implementation Exercise: Part 2\n",
+    "\n",
+    "Now, let us improve our program performance by enabling P2P access between GPUs, wherever possible. The `jacobi_memcpy.cu` code accepts a runtime argument `-p2p` which should enable P2P access between GPUs. \n",
+    "\n",
+    "Modify the code by searching for `TODO: Part 2` and enabling GPU `devices[dev_id]` to access peer GPUs `devices[top]` and `devices[bottom]`, whenever possible. \n",
+    "\n",
+    "Notice that the code snippet is within a `for` loop which sets and iterates over each GPU, which is why bidirectional P2P will be enabled. Take help from the code sample in the previous section.\n",
+    "\n",
+    "Now, let's compile the code again:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90e8da79",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && make clean && make jacobi_memcpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd648c93",
+   "metadata": {},
+   "source": [
+    "Ensure there are no compiler warnings or errors. Validate the implementation by running the binary with P2P enabled:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed251978",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && ./jacobi_memcpy -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1acc2cc0",
+   "metadata": {},
+   "source": [
+    "The output we got on our DGX-1 system is:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8. Using GPU ID: 0, 1, 2, 3, 4, 5, 6, 7, \n",
+    "16384x16384: 1 GPU:   4.4487 s, 8 GPUs:   0.8798 s, speedup:     5.06, efficiency:    63.21 \n",
+    "```\n",
+    "\n",
+    "Notice that the efficiency increased by about $8\\%$ to $63.21\\%$ compared to our baseline implementation. You can run the baseline again by removing the `-p2p` flag. Note that if P2P is not supported on your system, you will likely not experience any performance gain.\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "Let us profile the execution with `nsys`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "adf3e8fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_memcpy_p2p_report --force-overwrite true ./jacobi_memcpy -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4b801eb0",
+   "metadata": {},
+   "source": [
+    "The output we obtain is shared below:\n",
+    "\n",
+    "![jacobi_memcpy_p2p_report](../../images/jacobi_memcpy_p2p_report.png)\n",
+    "\n",
+    "For GPU 0, P2P is only possible with GPU 1 and the profiler output indeed shows only one set of P2P operations. Host-staging is used between GPU 0 and GPU 7. In contrast, GPU 2 can use P2P with both its neighbours, GPU 1 and GPU 3 and the profiler output verifies that. The events view of GPU 1 is shown. The selected operation's description shows a P2P copy operation from GPU 0 to GPU 1. Also, the total time taken for the solver loop has decreased to 1.052 seconds.\n",
+    "\n",
+    "**Solution:** The solution for this exercise is present in `source_code/memcpy/solutions` directory: [jacobi_memcpy.cu](../../source_code/cuda/solutions/jacobi_memcpy.cu)\n",
+    "\n",
+    "Let us dive deeper into the communication architecture to better understand the impact of P2P memory access. Click on the link below to access the next lab.\n",
+    "\n",
+    "# [Next: Intra-node topology](../advanced_concepts/single_node_topology.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Programming: Optimized data transfers in CUDA](https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/)\n",
+    "* [Documentation: CUDA Memory Management APIs](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html)\n",
+    "* [Documentation: nvidia-smi Command](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf)\n",
+    "* [Programming Concepts: Peer-to-Peer and Unified Virtual Addressing (UVA)](https://developer.download.nvidia.com/CUDA/training/cuda_webinars_GPUDirect_uva.pdf)\n",
+    "* [Programming Concepts: CUDA Peer-to-Peer Memory Access](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#peer-to-peer-memory-access)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 401 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/cuda/streams.ipynb

@@ -0,0 +1,401 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "18638d64",
+   "metadata": {},
+   "source": [
+    "Before we begin, let's get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ddeeccc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a7c63ff6",
+   "metadata": {},
+   "source": [
+    "# Learning Objectives\n",
+    "\n",
+    "We will learn about the following in this lab:\n",
+    "\n",
+    "* Concept of overlapping computation withEventEvent communication\n",
+    "* CUDA Streams overview and implementation\n",
+    "* CUDA Events overview and implementation\n",
+    "* Synchronization primitives in CUDA for the whole device, stream, event, etc.\n",
+    "\n",
+    "# Improving Application Performance\n",
+    "\n",
+    "### Analysis\n",
+    "\n",
+    "The $(i+1)^{th}$ Jacobi iteration on any GPU cannot begin until all memory operations between all GPUs at the end of $i^{th}$ iteration are complete. The GPU is idle after its memory and compute operations are completed, as is visible in the profiler output below. The white space between the blue device kernel and the orange/ green/ pink memory operations is when the GPU is idle.\n",
+    "\n",
+    "![memcpy_gpu_util](../../images/memcpy_gpu_util.png)\n",
+    "\n",
+    "Let us quantify the time loss from the profiler output. \n",
+    "\n",
+    "![memcpy_util_selection](../../images/memcpy_util_selection.png)\n",
+    "\n",
+    "On average, one iteration of `jacobi_kernel` takes about 600$\\mu$s. The copy operations take about 50$\\mu$s. The total time between Jacobi iterations is about 450$\\mu$s. So the idle time is about $450-50=400\\mu$s. \n",
+    "\n",
+    "We cannot recover all of the idle time as we are currently only considering the device timeline. Launching device kernels and copy operations has host-side overhead as well. Still, there is a significant opportunity to improve performance by minimizing the idle time.\n",
+    "\n",
+    "### Optimization\n",
+    "\n",
+    "Notice that the copy operations take place serially after the Jacobi iteration. The kernel computation must be complete before copying the updated halos from the GPU of interest (source) to its neighbours (destination).\n",
+    "\n",
+    "However, we can perform the copy operation from the neighbouring GPUs (source) to the GPU of interest (destination) concurrently with the kernel computation as it will only be required in the next iteration.\n",
+    "\n",
+    "An important optimization is to overlap computation and communication so that these operations can take place concurrently, whenever possible. We also need to keep track of dependencies so that the $(i+1)^{th}$ iteration on a GPU cannot begin until it sends and receives halos to and from its neighbours at the end of $i^{th}$ iteration.\n",
+    "\n",
+    "\n",
+    "## CUDA Concepts: Part 3\n",
+    "\n",
+    "A CUDA device has multiple \"engines\" that can concurrently manage kernel execution(s) and data transfer(s). That is, we can overlap computation and communication in our application by utilizing these engines. This requires the use of CUDA Streams.\n",
+    "\n",
+    "### Streams\n",
+    "\n",
+    "A stream in CUDA is a sequence of operations that execute on the device in the order in which they are issued by the host code. While operations within a stream are guaranteed to execute in the prescribed order, operations in different streams can be interleaved and, when possible, they can even run concurrently.\n",
+    "\n",
+    "#### The default stream\n",
+    "\n",
+    "All device operations (kernels and data transfers) in CUDA run in a stream. When no stream is specified, the default stream (also called the “null stream”) is used. All of our codes till now have implicitly used the default stream. \n",
+    "\n",
+    "The default stream is different from other streams because it is a synchronizing stream with respect to operations on the device: no operation in the default stream will begin until all previously issued operations in any stream on the device have completed, and an operation in the default stream must complete before any other operation (in any stream on the device) will begin.\n",
+    "\n",
+    "We need to use non-default streams to achieve concurrency as showcased in the image below.\n",
+    "\n",
+    "![cuda_streams_overview](../../images/cuda_streams_overview.png)\n",
+    "\n",
+    "#### Non-default streams\n",
+    "\n",
+    "Let us first learn to create and destroy non-default CUDA streams:\n",
+    "\n",
+    "```c\n",
+    "cudaStream_t stream1;\n",
+    "cudaError_t result;\n",
+    "result = cudaStreamCreate(&stream1);\n",
+    "result = cudaStreamDestroy(stream1);\n",
+    "```\n",
+    "\n",
+    "To issue a data transfer to a non-default stream we use the `cudaMemcpyAsync()` function, which takes a stream identifier as an optional fifth argument.\n",
+    "\n",
+    "```c\n",
+    "result = cudaMemcpyAsync(TopNeighbour, myTopRow, size, cudaMemcpyDeviceToDevice, stream1);\n",
+    "```\n",
+    "\n",
+    "To issue a kernel to a non-default stream, we specify the stream identifier as the fourth configuration parameter. The third configuration parameter is the dynamically allocated shared memory size per block; we use 0 here as our kernel does not need any.\n",
+    "\n",
+    "```c\n",
+    "jacobi_kernel<<<dim_grid, dim_block, 0, stream1>>>(...);\n",
+    "```\n",
+    "\n",
+    "#### Synchronization\n",
+    "\n",
+    "We have already encountered the `cudaDeviceSynchronize()` function, which blocks the host code until all previously issued operations on the device have completed. There are more fine-grained ways to synchronize code that uses streams.\n",
+    "\n",
+    "The function `cudaStreamSynchronize(stream)` can instead be used to block the host until all previously issued operations in the specified stream have completed.\n",
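+    "\n",
+    "As a quick illustration, the hedged sketch below (not taken from the lab code; the kernel name `my_kernel`, the launch configuration, and the buffers are hypothetical) issues a kernel and an asynchronous copy on two different non-default streams so that they may overlap, and then blocks the host only on the stream whose result is needed next:\n",
+    "\n",
+    "```c\n",
+    "cudaStream_t compute, copy;\n",
+    "cudaStreamCreate(&compute);\n",
+    "cudaStreamCreate(&copy);\n",
+    "\n",
+    "// Issued to different streams, so the kernel and the copy may overlap\n",
+    "my_kernel<<<grid, block, 0, compute>>>(d_out, d_in);\n",
+    "cudaMemcpyAsync(d_halo_dst, d_halo_src, halo_bytes, cudaMemcpyDeviceToDevice, copy);\n",
+    "\n",
+    "// Block the host only on the stream whose result we need right away\n",
+    "cudaStreamSynchronize(copy);\n",
+    "\n",
+    "cudaStreamDestroy(compute);\n",
+    "cudaStreamDestroy(copy);\n",
+    "```\n",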
+    "\n",
+    "## Implementation exercise: Part 3\n",
+    "\n",
+    "Now, let's implement CUDA streams in our application. Open the [jacobi_streams.cu](../../source_code/cuda/jacobi_streams.cu) file.\n",
+    "\n",
+    "Alternatively, you can navigate to `CFD/English/C/source_code/cuda/` directory in Jupyter's file browser in the left pane. Then, click to open the `jacobi_streams.cu` file.\n",
+    "\n",
+    "Note that we create 3 streams per GPU: `compute_stream`, `push_top_stream`, and `push_bottom_stream`. We will compute the Jacobi iteration and perform the GPU-local L2 norm copy operation on the `compute_stream`. Each GPU will copy its top and bottom halos to its neighbours using the `push_top_stream` and `push_bottom_stream` streams, respectively.\n",
+    "\n",
+    "Now, within the iterative Jacobi loop (the `while` loop), implement the following marked as `TODO: Part 3-`:\n",
+    "\n",
+    "1. Synchronize `push_top_stream` and `push_bottom_stream` streams to ensure \"top\" and \"bottom\" neighbours have shared updated halos from the previous iteration.\n",
+    "2. Call device kernel on `compute_stream` stream with correct device arrays in function arguments.\n",
+    "3. Asynchronously copy GPU-local L2 norm back to CPU on `compute_stream` stream.\n",
+    "4. Ensure the computation is complete by synchronizing the `compute_stream` stream before copying the updated halos to the neighbours.\n",
+    "5. Implement top and bottom halo exchanges on the correct stream.\n",
+    "\n",
+    "Review the topic above on Non-default streams if in doubt. Recall the utility of using separate `for` loops for launching device kernels and initiating copy operations.\n",
+    "\n",
+    "After implementing these, let's compile the code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "003cf80a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && make clean && make jacobi_streams"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1cc33f19",
+   "metadata": {},
+   "source": [
+    "Validate the implementation by running the binary, first without P2P:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b71d5a98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && ./jacobi_streams"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "442c832a",
+   "metadata": {},
+   "source": [
+    "We tested the code on a DGX-1 system with 8 Tesla V100 16GB GPUs, and we got the following output:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8.\n",
+    "16384x16384: 1 GPU:   4.4481 s, 8 GPUs:   0.7401 s, speedup:     6.01, efficiency:    75.13 \n",
+    "```\n",
+    "\n",
+    "Recall that the P2P-enabled application using only `cudaMemcpy` functions achieved an efficiency of about $63\\%$ on our system. We get a significant increase of efficiency to about $75\\%$ by achieving compute-communication concurrency.\n",
+    "\n",
+    "Now, enable P2P on our current program by using the `-p2p` runtime flag. On our system, the efficiency increased to $82\\%$. Your efficiency numbers and improvement in performance may differ depending on the system topology, GPU type, etc.\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "Now, profile the P2P-enabled version of the program with `nsys`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d0fd7b2",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_streams_p2p_report --force-overwrite true ./jacobi_streams -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a6f5bf5e",
+   "metadata": {},
+   "source": [
+    "Open the report in the GUI and measure the total time between two Jacobi iterations as shown below.\n",
+    "\n",
+    "![streams_util_selection](../../images/streams_util_selection.png)\n",
+    "\n",
+    "The copy operations take the same time as before, about 50$\\mu$s. Thus, the idle time is $200-50=150\\mu$s. Compare this idle time with the idle time for the non-streams version of the application, which in our case is about 400$\\mu$s. Concurrency improves GPU utilization and consequently speedup and efficiency.\n",
+    "\n",
+    "**Solution:** The solution for this exercise is present in the `source_code/cuda/solutions` directory: [jacobi_streams.cu](../../source_code/cuda/solutions/jacobi_streams.cu)\n",
+    "\n",
+    "#### Analysis\n",
+    "\n",
+    "Can we improve our program further? Yes! Can you think of any bottleneck that we have mentioned implicitly but haven't addressed yet? \n",
+    "\n",
+    "Recall that the `cudaStreamSynchronize` function blocks the host until all previously issued operations in the specified stream have completed. Do we need to block the host?\n",
+    "\n",
+    "The utility of this function in our application is that it ensures the dependencies between iterations and between computation and communication are respected. We don't need to block the host for this purpose. \n",
+    "\n",
+    "## CUDA Concepts: Part 4\n",
+    "\n",
+    "### CUDA Events\n",
+    "\n",
+    "CUDA Events are synchronization markers that provide a mechanism to signal when operations have occurred in a stream. They allow fine-grained synchronization within a stream as well as inter-stream synchronization, e.g. letting one stream wait for an event recorded in another stream.\n",
+    "\n",
+    "Let us first learn to create and destroy CUDA events:\n",
+    "\n",
+    "```c\n",
+    "cudaEvent_t event1;\n",
+    "cudaError_t result;\n",
+    "result = cudaEventCreate(&event1);\n",
+    "result = cudaEventDestroy(event1);\n",
+    "```\n",
+    "\n",
+    "#### Recording Events\n",
+    "\n",
+    "Events have a boolean state: Occurred or Not Occurred. The default state is Occurred. We record an event as follows:\n",
+    "\n",
+    "```c\n",
+    "cudaEventRecord(event1, stream1);\n",
+    "```\n",
+    "\n",
+    "This function sets the state of `event1` to Not Occurred and enqueues it into `stream1`'s work queue; the state is set to Occurred when the event reaches the front of the queue, that is, when all work issued to `stream1` before the record call has completed.\n",
+    "\n",
+    "#### Synchronizing Stream with Events\n",
+    "\n",
+    "`cudaEventSynchronize` acts similarly to `cudaStreamSynchronize` and blocks the host until the recorded event has Occurred. But we do not wish to block the host thread. Thus, we use `cudaStreamWaitEvent`:\n",
+    "\n",
+    "```c\n",
+    "cudaStreamWaitEvent(stream1, event1, 0);\n",
+    "```\n",
+    "\n",
+    "This function blocks `stream1` until `event1` has Occurred, without blocking the host. It works even if the event is recorded in a different stream or on a different device.\n",
+    "\n",
+    "Thus, fine-grained synchronization that doesn't block the host is achieved by first using `cudaEventRecord` on the independent operation, for example, halo copy from GPU 0 to GPU 1 at the end of $i^{th}$ iteration. Then, before issuing the dependent operation, for example, Jacobi computation for $(i+1)^{th}$ iteration on GPU 1, we block the stream using `cudaStreamWaitEvent`.  \n",
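+    "\n",
+    "As a minimal hedged sketch of this record-then-wait pattern (the stream, event, and buffer names here are hypothetical, not the lab code):\n",
+    "\n",
+    "```c\n",
+    "cudaEvent_t halo_done;\n",
+    "cudaEventCreate(&halo_done);\n",
+    "\n",
+    "// Producer: push the halo on the copy stream and record completion\n",
+    "cudaMemcpyAsync(neighbour_halo, my_boundary_row, halo_bytes,\n",
+    "                cudaMemcpyDeviceToDevice, push_stream);\n",
+    "cudaEventRecord(halo_done, push_stream);\n",
+    "\n",
+    "// Consumer: the compute stream waits on the event; the host is not blocked\n",
+    "cudaStreamWaitEvent(compute_stream, halo_done, 0);\n",
+    "jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream>>>(...);\n",
+    "\n",
+    "cudaEventDestroy(halo_done);\n",
+    "```\n",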
+    "\n",
+    "## Implementation Exercise: Part 4\n",
+    "\n",
+    "Let's implement CUDA Events with Streams in our application. Open the [jacobi_streams_events.cu](../../source_code/cuda/jacobi_streams_events.cu) file.\n",
+    "\n",
+    "Alternatively, you can navigate to `CFD/English/C/source_code/cuda/` directory in Jupyter's file browser in the left pane. Then, click to open the `jacobi_streams_events.cu` file.\n",
+    "\n",
+    "Note that we create 5 events for each device, `compute_done`, `push_top_done[0]`, `push_top_done[1]`, `push_bottom_done[0]`, and `push_bottom_done[1]`. We need 2 events for each halo on every device:\n",
+    "\n",
+    "1. To synchronize \"top\" and \"bottom\" neighbour's `push_bottom_stream` and `push_top_stream` copy operations of $(i-1)^{th}$ iteration, respectively, before computing $i^{th}$ Jacobi iteration in `compute_stream`.\n",
+    "2. To record current device's `push_top_stream` and `push_bottom_stream` copy operations at the end of $i^{th}$ iteration.\n",
+    "\n",
+    "Now, within the iterative Jacobi loop (the `while` loop), implement the following marked as `TODO: Part 4-`:\n",
+    "\n",
+    "* Block the `compute_stream` as long as the top and bottom halos from the neighbours have not been copied to `dev_id`. The `push_top_done` and `push_bottom_done` events are to be monitored for the `bottom` and `top` neighbours, respectively, for the previous iteration, denoted by `iter % 2`. Note that there should be 2 distinct `cudaStreamWaitEvent` function calls.\n",
+    "* Record that Jacobi computation on `compute_stream` is done by using `cudaEventRecord` for `compute_done` event for `dev_id`.\n",
+    "* Wait for the Jacobi computation of `dev_id` to complete by using the `compute_done` event on `push_top_stream` so that the top halo isn't copied to the neighbour before computation is done.\n",
+    "* Record completion of the top halo copy from `dev_id` to its neighbour, to be used in the next iteration. Record the `push_top_done` event of `dev_id` for the next iteration, which is `(iter+1) % 2`.\n",
+    "* Repeat the same procedure as described in previous two points for bottom halo copy with `push_bottom_stream` and `push_bottom_done` event.\n",
+    "\n",
+    "After implementing these, compile the code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b299e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && make clean && make jacobi_streams_events"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9b942c7",
+   "metadata": {},
+   "source": [
+    "Validate the implementation by running the binary with and without P2P:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35e57643",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/cuda && ./jacobi_streams_events -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c70e07a9",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "We share the partial output from our DGX-1 system with 8 Tesla V100 GPUs for the binary without using P2P:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8.\n",
+    "16384x16384: 1 GPU:   4.4485 s, 8 GPUs:   0.6640 s, speedup:     6.70, efficiency:    83.75 \n",
+    "```\n",
+    "\n",
+    "With P2P enabled, the efficiency increases marginally:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8.\n",
+    "16384x16384: 1 GPU:   4.4486 s, 8 GPUs:   0.6528 s, speedup:     6.81, efficiency:    85.18 \n",
+    "```\n",
+    "\n",
+    "Let us profile the code to verify that using events indeed overlaps computation with communication within each GPU.\n",
+    "\n",
+    "## Profiling\n",
+    "\n",
+    "Profile the binary with P2P enabled using `nsys`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c910f6f-f58c-4d3b-ab37-49dbc4112751",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/cuda/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_streams_events_p2p_report --force-overwrite true ./jacobi_streams_events -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0e330889-77e3-4fe3-9782-b4a13425c9bb",
+   "metadata": {},
+   "source": [
+    "Download the `.qdrep` report file and open it in the Nsight Systems GUI application:\n",
+    "\n",
+    "![jacobi_memcpy_streams_events_p2p_report](../../images/jacobi_memcpy_streams_events_p2p_report.png)\n",
+    "\n",
+    "Observe that the computation is now overlapped with communication within each GPU. Moreover, we have decreased the total time between two Jacobi iterations to about $175\\mu$s. Therefore, the GPU idle time is $175-50=125\\mu$s, which is less than the $150\\mu$s idle time achieved using just streams.\n",
+    "\n",
+    "**Solution:** The solution for this exercise is present in the `source_code/cuda/solutions` directory: [jacobi_streams_events.cu](../../source_code/cuda/solutions/jacobi_streams_events.cu)\n",
+    "\n",
+    "We have now covered implementing computation and communication overlap using CUDA Streams and then fine-tuning it using CUDA Events. Note that all of our codes currently are confined to a single node. We would like to scale our codes across nodes.\n",
+    "\n",
+    "Therefore, let us learn about multi-node multi-GPU programming with MPI. Click below to access the next lab:\n",
+    "\n",
+    "# [Next: Multi-Node programming with MPI](../mpi/multi_node_intro.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Programming Concepts: CUDA Streams and Concurrency](https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf)\n",
+    "* [Programming Concepts: CUDA Events and Performance Monitoring](https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/)\n",
+    "* [Programming: CUDA Streams Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams-cdp)\n",
+    "* [Concepts: Overlapping Computation and Communication](https://developer.nvidia.com/blog/how-overlap-data-transfers-cuda-cc/)\n",
+    "* [Documentation: CUDA Stream Management API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html)\n",
+    "* [Documentation: CUDA Events Management API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 0 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/.gitkeep


Changes are not shown because the file is too large.
+ 247 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/containers_and_mpi.ipynb


Changes are not shown because the file is too large.
+ 382 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/cuda_aware.ipynb


+ 430 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/memcpy.ipynb

@@ -0,0 +1,430 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4ecc207b-52c7-463a-8731-19203d384a30",
+   "metadata": {},
+   "source": [
+    "Before we begin, let's get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d6d1387-f525-40d4-bf3a-f7403bdce2b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed9d6f0d-cfa6-4ffd-b970-bee700bf1a90",
+   "metadata": {},
+   "source": [
+    "**Note:** Execution results can vary significantly based on the MPI installation, supporting libraries, workload manager, and underlying CPU and GPU hardware configuration and topology. The codes in this lab have been tested on DGX-1 8 Tesla V100 16 GB nodes connected by Mellanox InfiniBand NICs running OpenMPI v4.1.1 with HPCX 2.8.1 and CUDA v11.3.0.0.\n",
+    "\n",
+    "# Learning Objectives\n",
+    "\n",
+    "We will learn about the following in this lab:\n",
+    "\n",
+    "* Point-to-point and collective MPI communication routines.\n",
+    "* Managing the two-level hierarchy created by global and local rank of a process and how it accesses GPU(s).\n",
+    "* OpenMPI process mappings and their effect on application performance.\n",
+    "\n",
+    "## MPI Inter-Process Communication\n",
+    "\n",
+    "Let us learn more about how MPI communicates between processes.\n",
+    "\n",
+    "### Point-to-Point communication\n",
+    "\n",
+    "Two MPI processes can communicate directly (point-to-point) by sending and receiving data packets to and from each other. Both the sender and receiver processes must acknowledge the transaction using the `MPI_Send` and `MPI_Recv` functions. MPI allows tagging messages to differentiate between the various messages that processes may send to each other.\n",
+    "\n",
+    "The function syntax for `MPI_Send` is:\n",
+    "\n",
+    "```c\n",
+    "int MPI_Send(void* data, int count, MPI_Datatype datatype, int destination, \n",
+    "         int tag, MPI_Comm communicator);\n",
+    "```\n",
+    "\n",
+    "Similarly, the syntax for `MPI_Recv` is:\n",
+    "\n",
+    "```c\n",
+    "int MPI_Recv(void* data, int count, MPI_Datatype datatype, int source, int tag,\n",
+    "         MPI_Comm communicator, MPI_Status* status);\n",
+    "```\n",
+    "   \n",
+    "A simple 2-process send-receive code is as follows:\n",
+    "\n",
+    "```c\n",
+    "int data;\n",
+    "if (rank == 0) {\n",
+    "    data = -1;\n",
+    "    MPI_Send(&data, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);\n",
+    "} else if (rank == 1) {\n",
+    "    MPI_Recv(&data, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "There are several other functions to send and receive data synchronously and asynchronously. In particular, we will make use of the `MPI_Sendrecv` function, which sends and receives a message and whose syntax is as follows:\n",
+    "\n",
+    "```c\n",
+    "int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,\n",
+    "                int dest, int sendtag,\n",
+    "                void *recvbuf, int recvcount, MPI_Datatype recvtype,\n",
+    "                int source, int recvtag,\n",
+    "                MPI_Comm comm, MPI_Status *status);\n",
+    "```\n",
+    "\n",
+    "### Collective communication\n",
+    "\n",
+    "Collective communication involves the participation of all processes in a communicator. It implies an implicit synchronization point among processes. Depending on the requirement, we can perform broadcast, scatter, gather, reduce, and other operations between the participating processes.\n",
+    "\n",
+    "In our application, we would like to reduce all the rank-local norms to a single global norm using the sum operation. We use the `MPI_Allreduce` function for this; it combines and reduces values from all processes and distributes the result back to all processes. Its syntax is as follows:\n",
+    "\n",
+    "```c\n",
+    "int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count,\n",
+    "                  MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);\n",
+    "```\n",
+    "\n",
+    "The `op` in our case will be `MPI_SUM`.\n",
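+    "\n",
+    "For instance, a hedged sketch of reducing a rank-local norm (the `local_l2_norm` value and how it is computed are assumed here, not taken from the lab code):\n",
+    "\n",
+    "```c\n",
+    "float local_l2_norm = 0.f;  // assume this holds the per-rank sum of squared residuals\n",
+    "float global_l2_norm = 0.f;\n",
+    "\n",
+    "// Sum the partial norms from all ranks; every rank receives the result\n",
+    "MPI_Allreduce(&local_l2_norm, &global_l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);\n",
+    "\n",
+    "global_l2_norm = sqrtf(global_l2_norm);  // identical on every rank\n",
+    "```\n",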
+    "\n",
+    "## Communication Models\n",
+    "\n",
+    "We will use multiple ranks within our program as we will use multiple nodes. There are three major approaches to handle GPUs within a node:\n",
+    "\n",
+    "1. Single GPU per rank\n",
+    "  * One process controls one GPU.\n",
+    "  * Easier to program and understand.\n",
+    "  * We can re-use our domain decomposition approach.\n",
+    "\n",
+    "\n",
+    "2. Multiple GPUs per rank\n",
+    "  * Usually, all GPUs within a node are handled by one process.\n",
+    "  * Coordinating between GPUs is quite tricky as CUDA-based communication is intertwined with MPI communication.\n",
+    "  * Requires a new decomposition for the two-tier communication hierarchy (MPI and CUDA).\n",
+    "\n",
+    "\n",
+    "3. Single GPU per multiple ranks\n",
+    "  * Multiple processes use the same GPU, and the number of processes in a node is usually equal to the number of CPU cores.\n",
+    "  * Intended for heterogeneous codes where both CPU and GPU accelerate the application.\n",
+    "  * CUDA Multi-Process-Service (MPS) is required to allow multiple CUDA processes to share a single GPU context.\n",
+    "  \n",
+    "We will take the first approach due to its simplicity (which eliminates approach #2) and because our application doesn't utilize CPU for compute (which eliminates approach #3). Thus our rank (core) to GPU mapping is one-to-one, as follows:\n",
+    "\n",
+    "![mpi_overview](../../images/mpi_overview.png)\n",
+    "\n",
+    "### Node-Level Local Rank\n",
+    "\n",
+    "As we will run on multiple nodes, for example 2, the number of processes launched, 16, will not map one-to-one to the GPU device IDs, which run from 0 to 7 on each node. Thus, we need to create a local rank at the node level.\n",
+    "\n",
+    "To achieve this, we split the `MPI_COMM_WORLD` communicator between the nodes and store it in a `local_comm` communicator. Then, we get the local rank by calling the familiar `MPI_Comm_rank` function. Finally, we free the `local_comm` communicator as we don't require it anymore. \n",
+    "\n",
+    "The code snippet to obtain the `local_rank` at each node level is as follows:\n",
+    "\n",
+    "```c\n",
+    "int local_rank = -1;\n",
+    "MPI_Comm local_comm;\n",
+    "MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &local_comm);\n",
+    "MPI_Comm_rank(local_comm, &local_rank);\n",
+    "MPI_Comm_free(&local_comm);\n",
+    "```\n",
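+    "\n",
+    "Once the local rank is known, the usual next step (and what the exercise code does, as noted in the exercise description below) is to bind each rank to one GPU. A minimal sketch, assuming one process per GPU:\n",
+    "\n",
+    "```c\n",
+    "int num_devices = 0;\n",
+    "cudaGetDeviceCount(&num_devices);\n",
+    "// One rank per GPU: local rank n on a node uses device n\n",
+    "cudaSetDevice(local_rank % num_devices);\n",
+    "```\n",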
+    "\n",
+    "## Implementation Exercise: Part 1\n",
+    "\n",
+    "### Code Structure\n",
+    "\n",
+    "Open the [jacobi_memcpy_mpi.cpp](../../source_code/mpi/jacobi_memcpy_mpi.cpp) file and the [jacobi_kernels.cu](../../source_code/mpi/jacobi_kernels.cu) files. Alternatively, you can navigate to `CFD/English/C/source_code/mpi/` directory in Jupyter's file browser in the left pane. Then, click to open the `jacobi_memcpy_mpi.cpp` and `jacobi_kernels.cu` files.\n",
+    "\n",
+    "We separate the device kernels from the other CUDA and MPI functions because the `nvc++` compiler required to compile CUDA C++ may not be installed on some platforms. Note that NVIDIA's HPC SDK includes the `nvc++` compiler.\n",
+    "\n",
+    "Review the [Makefile](../../source_code/mpi/Makefile) to see that we compile the CUDA kernels using `nvcc` and link the object file with `jacobi_memcpy_mpi.cpp` using the `mpicxx` compiler as follows:\n",
+    "\n",
+    "```bash\n",
+    "# Compiling jacobi_kernels.cu\n",
+    "nvcc -gencode arch=compute_80,code=sm_80 -std=c++14 jacobi_kernels.cu -c\n",
+    "# Compiling and linking with jacobi_memcpy_mpi.cpp\n",
+    "mpicxx -I${CUDA_HOME}/include -fopenmp -std=c++14 jacobi_memcpy_mpi.cpp jacobi_kernels.o \\\n",
+    "        -L${CUDA_HOME}/lib64 -lcudart -lnvToolsExt -o jacobi_memcpy_mpi\n",
+    "```\n",
+    "\n",
+    "The device kernels are the same as in the previous labs. Open the `jacobi_memcpy_mpi.cpp` file and understand the flow of the program. In particular, observe the following:\n",
+    "\n",
+    "1. `local_rank` is used to set the current GPU device.\n",
+    "2. Device kernel calls have been replaced with function wrappers for ease of compilation.\n",
+    "3. Rank 0 is used to calculate efficiency and other metrics, even though all ranks compute `single_gpu` function to verify multi-GPU implementation's correctness.\n",
+    "4. In the first set of halo exchanges, `top_halo_buf` stores the top halo copied from the device to the host, which is then sent to the top neighbour, whereas `bot_halo_buf` stores the updated bottom halo received from the bottom neighbour, which is then copied from the host to the device.\n",
+    "5. In the second set of halo exchanges, `top_halo_buf` stores the updated top halo received from the top neighbour, which is then copied from the host to the device, whereas `bot_halo_buf` stores the bottom halo copied from the device to the host, which is then sent to the bottom neighbour.\n",
+    "6. Each halo exchange is wrapped in NVTX \"Halo exchange Memcpy+MPI\" for ease of viewing in profiler.\n",
+    "\n",
+    "### To-Do\n",
+    "\n",
+    "Now, implement the following marked as `TODO: Part 1-`:\n",
+    "\n",
+    "* Obtain the node-level local rank by splitting the global communicator.\n",
+    "* Implement the MPI portion of the first set of halo exchanges using `MPI_Sendrecv` as explained above.\n",
+    "* Implement the Memcpy operations and MPI calls for the second set of halo exchanges. Recall why `cudaMemcpyAsync` is not the correct way of implementing this MPI program.\n",
+    "* Reduce the rank-local L2 Norm to a global L2 norm using `MPI_Allreduce` function.\n",
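+    "\n",
+    "To make the data path concrete, here is a hedged sketch of one host-staged halo exchange (hypothetical buffer and neighbour-rank names; the second exchange mirrors it with the roles of top and bottom swapped):\n",
+    "\n",
+    "```c\n",
+    "// Stage this rank's top boundary row on the host, then exchange with the neighbours\n",
+    "cudaMemcpy(top_halo_buf, d_top_boundary_row, nx * sizeof(float), cudaMemcpyDeviceToHost);\n",
+    "MPI_Sendrecv(top_halo_buf, nx, MPI_FLOAT, top_rank,    0,\n",
+    "             bot_halo_buf, nx, MPI_FLOAT, bottom_rank, 0,\n",
+    "             MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n",
+    "// The received row becomes this rank's bottom halo on the device\n",
+    "cudaMemcpy(d_bottom_halo_row, bot_halo_buf, nx * sizeof(float), cudaMemcpyHostToDevice);\n",
+    "```\n",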
+    "\n",
+    "After implementing these, compile the program:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57db8407-a720-4f19-9666-d0b1b37c6a1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && make clean && make jacobi_memcpy_mpi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9bd0ae58-b7bf-432d-8ced-367daaefbc7d",
+   "metadata": {},
+   "source": [
+    "Ensure there are no compilation errors. Now, let us validate the program. \n",
+    "\n",
+    "The grid-size of 16384$\\times$16384 has been selected such that all 8 GPUs are fully utilized. To test with 16 GPUs, we increase the grid size to 16384$\\times$32768 to maintain the invariant that GPUs are not under-utilized. Observe that the halo exchange copy size remains the same as before (16K elements * size of float (4B) = 64KB).\n",
+    "\n",
+    "Run the program with 16 processes across 2 nodes as follows:\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ddb18d3-868f-4dc8-b3c6-7225cc367135",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 -npersocket 4 ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9bc18c9-a836-4503-a874-327293fd7d0b",
+   "metadata": {},
+   "source": [
+    "We share the partial output from 2 DGX-1 nodes with 8 Tesla V100-16GB each connected by InfiniBand (IB) NICs:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 16.\n",
+    "16384x32768: 1 GPU:   8.9057 s, 16 GPUs:   0.7695 s, speedup:    11.57, efficiency:    72.34 \n",
+    "```\n",
+    "For reference, we also share the output from 4 DGX-1 nodes with 16K$\\times$64K grid size ($4\\times$ the single-node's grid size):\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 32.\n",
+    "16384x65536: 1 GPU:  17.6316 s, 32 GPUs:   0.8526 s, speedup:    20.68, efficiency:    64.62\n",
+    "```\n",
+    "\n",
+    "As the communication overhead increases due to more inter-node communication, the speed-up obtained, and thus the efficiency of the application, decreases. Nonetheless, our program can scale across multiple nodes.\n",
+    "\n",
+    "### OpenMPI Process Mappings\n",
+    "\n",
+    "As we mentioned in previous labs, there are multiple ways to specify the number of processes to be run on each socket, node, etc. One such way is to use the `--map-by` option. Mapping assigns a default location to each process. To specify that we want each socket to run 4 processes, we use the `--map-by ppr:4:socket` flag. Here, `ppr` stands for processes-per-resource, where the specified resource is `socket` and the specified number of processes is `4`.\n",
+    "\n",
+    "It is similar to using the `-npersocket 4` option. Run the following command and validate that the results obtained are the same:\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f874e966-53ad-4251-8059-76697ef6862e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 --map-by ppr:4:socket ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0765be26-807b-4ee5-ae75-9e9a6a14c293",
+   "metadata": {},
+   "source": [
+    "We can also use the `--map-by ppr:8:node:4:socket` flag. Here, in addition to specifying the number of processes per socket, we also specify the number of processes per node. This should result in the same execution and results. So, run the following command:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec11f8bf-1948-48d5-8eac-f6a655e1d369",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 --map-by ppr:8:node:4:socket ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "07f62d8e-3584-414e-9203-e4d24961c2bc",
+   "metadata": {},
+   "source": [
+    "Notice that our efficiency has decreased. We share our partial results:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 16.\n",
+    "16384x32768: 1 GPU:   8.9050 s, 16 GPUs:   0.8150 s, speedup:    10.93, efficiency:    68.2\n",
+    "```\n",
+    "\n",
+    "Compare it with the previous result and notice the increase in multi-node execution time and corresponding decrease in efficiency. Let us check what cores or sockets or nodes each process (or MPI rank) is bound to. Binding constrains each process to run on specific processors. We use the `--report-bindings` option to check this:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b79fbfcf-ca8a-4e86-9ef2-9c6f3387ddea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 --map-by ppr:8:node:4:socket --report-bindings ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "37547338-af62-4329-b65c-9b2c0b45130f",
+   "metadata": {},
+   "source": [
+    "The output may seem cluttered, so let us focus on partial output from ranks 0 and 1:\n",
+    "\n",
+    "```bash\n",
+    "[<node_0_name>:<proc_id>] MCW rank 0 bound to socket 0 ... [BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB][../../../../../../../../../../../../../../../../../../../..]\n",
+    "[<node_0_name>:<proc_id>] MCW rank 1 bound to socket 1 ... [../../../../../../../../../../../../../../../../../../../..][BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB/BB]\n",
+    "```\n",
+    "\n",
+    "Rank 0 is bound to all cores on socket 0 on node 0 while rank 1 is bound to all cores on socket 1 on node 0. Clearly, this is not an optimal arrangement, as halo exchanges between neighbouring processes have to cross socket boundaries. Now, check the process bindings in the previous case:\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24c54f2b-28a4-482d-9d53-4cea6aca1f00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && mpirun -np 16 --map-by ppr:4:socket --report-bindings ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c5791f2-169a-4394-a640-226b365b0ff8",
+   "metadata": {},
+   "source": [
+    "Now, ranks 0 and 1 are bound to the same socket in the same node. Moreover, ranks 3 and 4 are bound to different sockets (as `<procs_per_socket>` is 4) but bound to the same node, as desired.\n",
+    "\n",
+    "It is quite easy to end up with a sub-optimal process mapping by using simple OpenMPI flags and options. Thus, it is always advisable to double-check the process-to-core and process-to-socket bindings.\n",
+    "\n",
+    "Moving forward, we will use the `--map-by ppr:4:socket` option as it evidently results in the desired process-to-core, socket, and node mapping.\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "We can profile an MPI program in two ways. To profile everything, putting the data in one file:\n",
+    "\n",
+    "```bash\n",
+    "nsys [nsys options] mpirun [mpi options] <program>\n",
+    "```\n",
+    "\n",
+    "To profile everything putting the data from each rank into a separate file:\n",
+    "\n",
+    "```bash\n",
+    "mpirun [mpi options] nsys profile [nsys options] <program>\n",
+    "```\n",
+    "\n",
+    "We will use the former approach as it produces a single report and is more convenient to view. The host compute nodes need a working installation of Nsight Systems.\n",
+    "\n",
+    "Let's profile the application using `nsys`: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49c461fa-777d-47ab-94d9-e1ac418b9711",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! cd ../../source_code/mpi && nsys profile --trace=mpi,cuda,nvtx --stats=true --force-overwrite true -o jacobi_memcpy_mpi_report \\\n",
+    "                                 mpirun -np 16 --map-by ppr:4:socket ./jacobi_memcpy_mpi -ny 32768"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c89cc9bd-aed4-4ae4-bd5c-ae4698d44d92",
+   "metadata": {},
+   "source": [
+    "Download the report and view it via the GUI. \n",
+    "\n",
+    "You may notice that only 8 MPI processes are visible even though we launched 16 MPI processes. Nsight Systems displays the output from a single node, but the inter-node transactions (copy operations) are still visible. This is for ease of viewing and doesn't impede our analysis.\n",
+    "\n",
+    "We share the partial output below:\n",
+    "\n",
+    "![mpi_memcpy_overview](../../images/mpi_memcpy_overview.png)\n",
+    "\n",
+    "Observe the following in the Timeline snapshot:\n",
+    "\n",
+    "* Two sets of halo exchanges take place, each consisting of DtoH and HtoD CUDA Memcpy with an `MPI_Sendrecv` call in between for inter-process communication followed by an `MPI_Allreduce` call. \n",
+    "* Each halo exchange takes about $45\\mu$s in hardware and about $60\\mu$s overall including the software overhead.\n",
+    "* The time between two Jacobi kernel iterations is about $200\\mu$s.\n",
+    "\n",
+    "However, if you scroll back in time, you might notice that not all halo exchanges take $60\\mu$s. For example, here's a snapshot from near the beginning of the multi-GPU Jacobi iteration loop:\n",
+    "\n",
+    "![mpi_memcpy_large_time](../../images/mpi_memcpy_large_time.png)\n",
+    "\n",
+    "Here, the halo exchange takes about $1100\\mu$s. MPI uses a lot of heuristics to fine-tune its call stack and communication protocol to enhance performance. Therefore, we observe the behavior shown above, where the initial MPI calls take significant time but performance improves in subsequent iterations.\n",
+    "\n",
+    "**Solution:** The solution for this exercise is present in `source_code/mpi/solutions` directory: [jacobi_memcpy_mpi.cpp](../../source_code/mpi/solutions/jacobi_memcpy_mpi.cpp).\n",
+    "\n",
+    "Note that our current implementation uses explicit host-staging for every halo copy operation. From our previous labs, we know that within a node, GPU-to-GPU communication can bypass host-staging and we implemented it using DtoD CUDA Memcpy with P2P enabled. Certainly, eliminating host-staging should improve performance. There are also inter-node communication optimizations that we can employ. \n",
+    "\n",
+    "We will learn more about both intra-node and inter-node GPU-centric MPI communication optimizations in the next lab where we will work with CUDA-aware MPI. Click below to move to the next lab:\n",
+    "\n",
+    "# [Next: CUDA-aware MPI](../mpi/cuda_aware.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Programming Concepts: MPI Point-to-Point Communication](https://cvw.cac.cornell.edu/mpip2p/p2pdef)\n",
+    "* [Programming Concepts: MPI Collective Communication](https://wgropp.cs.illinois.edu/courses/cs598-s15/lectures/lecture29.pdf)\n",
+    "* [Programming Concepts: NVIDIA Multi-Process Service](https://docs.nvidia.com/deploy/pdf/CUDA_Multi_Process_Service_Overview.pdf)\n",
+    "* [Documentation: MPI Processing Mapping, Ranking, and Binding](https://www.open-mpi.org/doc/current/man1/mpirun.1.php#sect12)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Changes are not shown because the file is too large.
+ 262 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/mpi/multi_node_intro.ipynb


+ 0 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/.gitkeep


Changes are not shown because the file is too large.
+ 342 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nccl/nccl.ipynb


+ 0 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nvhsmem/.gitkeep


Changes are not shown because the file is too large.
+ 461 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/nvshmem/nvshmem.ipynb


+ 271 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb

@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "39ad569e",
+   "metadata": {},
+   "source": [
+    "# Learning Objectives\n",
+    "\n",
+    "The goal of this lab is to:\n",
+    "\n",
+    "* Review the scientific problem for which the Jacobi solver application has been developed.\n",
+    "* Understand and run the single-GPU code of the application.\n",
+    "* Learn about NVIDIA Nsight Systems profiler and how to use it to analyze our application.\n",
+    "\n",
+    "# The Application\n",
+    "\n",
+    "This section provides an overview of the scientific problem we focus on and the solver we employ. Then, we execute the single GPU version of the application program.\n",
+    "\n",
+    "### Laplace Equation\n",
+    "\n",
+    "The Laplace equation is a well-studied linear partial differential equation that governs steady state heat conduction, irrotational fluid flow, and many other phenomena.\n",
+    "\n",
+    "In this lab, we will consider the 2D Laplace equation on a rectangle with Dirichlet boundary conditions on the left and right boundaries and periodic boundary conditions on the top and bottom boundaries. We wish to solve the following equation:\n",
+    "\n",
+    "$\\Delta u(x,y) = 0\\;\\forall\\;(x,y)\\in\\Omega,\\delta\\Omega$\n",
+    "\n",
+    "### Jacobi Method\n",
+    "\n",
+    "The Jacobi method is an iterative algorithm to solve a linear system of strictly diagonally dominant equations. The governing Laplace equation is discretized and converted to a matrix amenable to Jacobi-method based solver.\n",
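+    "\n",
+    "Concretely, after discretizing on a uniform grid, each Jacobi sweep replaces every interior point with the average of its four neighbours; this is the update the solver's device kernel applies:\n",
+    "\n",
+    "$u^{(k+1)}_{i,j} = \frac{1}{4}\left(u^{(k)}_{i+1,j} + u^{(k)}_{i-1,j} + u^{(k)}_{i,j+1} + u^{(k)}_{i,j-1}\right)$\n",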
+    "\n",
+    "### The Code\n",
+    "\n",
+    "The GPU processing flow follows 3 key steps:\n",
+    "\n",
+    "1. Copy data from CPU to GPU\n",
+    "2. Launch GPU Kernel\n",
+    "3. Copy processed data back to CPU from GPU\n",
+    "\n",
+    "![gpu_programming_process](../../images/gpu_programming_process.png)\n",
+    "\n",
+    "Let's understand the single-GPU code first. \n",
+    "\n",
+    "The source code file, [jacobi.cu](../../source_code/single_gpu/jacobi.cu) (click to open), is present in `CFD/English/C/source_code/single_gpu/` directory. \n",
+    "\n",
+    "Alternatively, you can navigate to `CFD/English/C/source_code/single_gpu/` directory in Jupyter's file browser in the left pane. Then, click to open the `jacobi.cu` file as shown below:\n",
+    "\n",
+    "![jupyter_lab_navigation](../../images/jupyter_lab_navigation.png)\n",
+    "\n",
+    "Similarly, have look at the [Makefile](../../source_code/single_gpu/Makefile). \n",
+    "\n",
+    "Refer to the `single_gpu(...)` function. The important steps at each iteration of the Jacobi Solver (that is, the `while` loop) are:\n",
+    "1. The norm is set to 0 using `cudaMemset`.\n",
+    "2. The device kernel `jacobi_kernel` is called to update the interior points.\n",
+    "3. The norm is copied back to the host using `cudaMemcpy` (DtoH), and\n",
+    "4. The periodic boundary conditions are re-applied for the next iteration using `cudaMemcpy` (DtoD).\n",
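+    "\n",
+    "A condensed, approximate sketch of that loop body (variable names here may differ slightly from the actual file):\n",
+    "\n",
+    "```c\n",
+    "while (l2_norm > tol && iter < iter_max) {\n",
+    "    // 1. Reset the device-side norm accumulator\n",
+    "    cudaMemset(l2_norm_d, 0, sizeof(float));\n",
+    "    // 2. Update all interior points and accumulate the squared residuals\n",
+    "    jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);\n",
+    "    // 3. Bring the norm back to the host (DtoH)\n",
+    "    cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost);\n",
+    "    // 4. Re-apply the periodic boundary conditions by copying rows on the device (DtoD)\n",
+    "    cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float), cudaMemcpyDeviceToDevice);\n",
+    "    cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float), cudaMemcpyDeviceToDevice);\n",
+    "\n",
+    "    l2_norm = std::sqrt(*l2_norm_h);\n",
+    "    std::swap(a_new, a);\n",
+    "    iter++;\n",
+    "}\n",
+    "```\n",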
+    "\n",
+    "Note that we run the Jacobi solver for 1000 iterations over the grid.\n",
+    "\n",
+    "### Compilation and Execution\n",
+    "\n",
+    "Let's first get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "abb46488",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f97f825b",
+   "metadata": {},
+   "source": [
+    "We will now compile the code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eac2daf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/single_gpu && make clean && make"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33345661",
+   "metadata": {},
+   "source": [
+    "Now, let us execute the program: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e234f430",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/single_gpu && ./jacobi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14bb863e",
+   "metadata": {},
+   "source": [
+    "The output reports the norm value every 100 iterations and the total execution time of the Jacobi Solver. The expected output is:\n",
+    "\n",
+    "```\n",
+    "Single GPU jacobi relaxation: 1000 iterations on 16384 x 16384 mesh\n",
+    "    0, 31.999022\n",
+    "  100, 0.897983\n",
+    "  200, 0.535684\n",
+    "  300, 0.395651\n",
+    "  400, 0.319039\n",
+    "  500, 0.269961\n",
+    "  600, 0.235509\n",
+    "  700, 0.209829\n",
+    "  800, 0.189854\n",
+    "  900, 0.173818\n",
+    "16384x16384: 1 GPU:   4.4512 s\n",
+    "```\n",
+    "\n",
+    "The execution time may differ depending on the GPU, but the norm value after every 100 iterations should be the same. The program accepts `-nx` and `-ny` flags to change the grid size (preferably a power of 2) and `-niter` flag to change the number of iterations.\n",
+    "\n",
+    "\n",
+    "# Profiling\n",
+    "\n",
+    "While the program in our labs gives the execution time in its output, it may not always be convenient to time the execution from within the program. Moreover, just timing the execution does not reveal the bottlenecks directly. For that purpose, we profile the program with the NVIDIA Nsight Systems profiler's command-line interface (CLI), `nsys`.\n",
+    "\n",
+    "### NVIDIA Nsight Systems\n",
+    "\n",
+    "The Nsight Systems profiler offers system-wide performance analysis to visualize an application's execution timeline and help identify optimization opportunities on a system with multiple CPUs and GPUs.\n",
+    "\n",
+    "#### Timeline\n",
+    "\n",
+    "![Nsight Systems timeline](../../images/nsys_overview.png)\n",
+    "\n",
+    "The highlighted portions are identified as follows:\n",
+    "* <span style=\"color:red\">Red</span>: The CPU tab provides thread-level core utilization data. \n",
+    "* <span style=\"color:blue\">Blue</span>: The CUDA HW tab displays GPU kernel and memory transfer activities and API calls.\n",
+    "* <span style=\"color:orange\">Orange</span>: The Threads tab gives a detailed view of each CPU thread's activity including from OS runtime libraries, MPI, NVTX, etc.\n",
+    "\n",
+    "#### `nsys` CLI\n",
+    "\n",
+    "We will profile the application using `nsys` CLI. Here's a typical `nsys` command to profile a program:\n",
+    "\n",
+    "`nsys profile --trace=cuda,nvtx --stats=true -o jacobi_report --force-overwrite true ./jacobi`\n",
+    "\n",
+    "The `--trace` flag specifies that we want to trace CUDA and NVTX APIs (in addition to baseline tracing), `--stats` specifies that we want to generate a statistics summary after profiling, and `-o` allows us to name the report file (which will include the `.qdrep` extension). The `--force-overwrite` flag allows us to overwrite an existing report (of the same name).\n",
+    "\n",
+    "Note that we can always use the `nsys --help` to know more about these and other available options.\n",
+    "\n",
+    "### Viewing the Report\n",
+    "\n",
+    "One can view the profiling report by using Nsight Systems GUI. Note that CUDA toolkit and the GUI application of the same version as CLI are required. Follow these steps:\n",
+    "* Open Nsight Systems GUI application.\n",
+    "* Click on _file $\\rightarrow$ open_.\n",
+    "* Browse and select the `.qdrep` file.\n",
+    "\n",
+    "Alternatively, we can enable the `--stats` flag to display profiling data on the terminal (refer to the image below).\n",
+    "\n",
+    "![nsys cli sample output](../../images/nsys_cli_sample_output.png)\n",
+    "\n",
+    "### NVIDIA Tools Extension (NVTX)\n",
+    "\n",
+    "NVTX is a C-based API for annotating events in applications. It is useful for profiling both specific events and large code blocks. We will routinely make use of NVTX APIs to instrument our application for the `nsys` profiler. It helps `nsys` collect relevant information and improves the application timeline's readability.\n",
+    "\n",
+    "To use NVTX, follow these steps:\n",
+    "* `#include <nvToolsExt.h>` in the code file\n",
+    "* Insert `nvtxRangePush(\"myCodeBlock\");` just before the code block begins and `nvtxRangePop();` just after it ends.\n",
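+    "\n",
+    "For example, a minimal sketch of annotating a region (the range name is arbitrary):\n",
+    "\n",
+    "```c\n",
+    "#include <nvToolsExt.h>\n",
+    "\n",
+    "nvtxRangePush(\"Jacobi solve\");\n",
+    "// ... iterative solver loop ...\n",
+    "nvtxRangePop();\n",
+    "```\n",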
+    "\n",
+    "Now, go back to the [jacobi.cu](../../source_code/single_gpu/jacobi.cu) source code file and correlate the \"Jacobi solve\" annotated event visible on both the `nsys` CLI statistics and the GUI-based timeline to its use in the source code.\n",
+    "\n",
+    "### Improving performance\n",
+    "\n",
+    "Any code snippet can be taken up for optimizations. However, it is important to realize that our current code is limited to a single GPU. Usually a very powerful first optimization is to parallelize the code, which in our case means running it on multiple GPUs. Thus, we generally follow the cyclical process:\n",
+    "\n",
+    "* **Analyze** the code using profilers to identify bottlenecks and hotspots.\n",
+    "* **Parallelize** the routines where most of the time in the code is spent.\n",
+    "* **Optimize** the parallel code by analyzing first for opportunities, applying optimizations, verifying our gains, and repeating the process.\n",
+    "\n",
+    "### Metrics of Interest\n",
+    "\n",
+    "To quantify the performance gain, we denote the single-GPU execution time as $T_s$ and multi-GPU execution time for $P$ GPUs as $T_p$. Using this, we obtain the figures-of-merit:\n",
+    "* Speedup $S = T_s/T_p$ (optimal is $P$), and \n",
+    "* Efficiency $E = S/P$ (optimal is $1$). \n",
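+    "\n",
+    "As a worked example, if the single-GPU run takes $T_s \approx 4.45$ s and a hypothetical 8-GPU run takes $T_p \approx 0.74$ s, then $S = 4.45/0.74 \approx 6.0$ and $E = 6.0/8 \approx 0.75$, i.e. about 75% efficiency.\n",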
+    "\n",
+    "### Analyzing the code\n",
+    "\n",
+    "Let's profile the single-GPU code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a9a8109",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/single_gpu/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_report --force-overwrite true ./jacobi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6db3c3c7",
+   "metadata": {},
+   "source": [
+    "Now, download the report and view it via the GUI. This is the analysis step. Right click on the NVTX tab and select the Events View.\n",
+    "\n",
+    "![nsys single_gpu_analysis](../../images/nsys_single_gpu_analysis.png)\n",
+    "\n",
+    "Clearly, we need to parallelize the \"Jacobi Solve\" routine, which is essentially the iterative Jacobi solver loop. Click on the link to continue to the next lab where we parallelize the code using cudaMemcpy and understand concepts like Peer-to-Peer Memory Access.\n",
+    "\n",
+    "# [Next: CUDA Memcpy and Peer-to-Peer Memory Access](../cuda/memcpy.ipynb)\n",
+    "\n",
+    "Here's a link to the home notebook through which all other notebooks are accessible:\n",
+    "\n",
+    "# [HOME](../../../introduction.ipynb)\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Links and Resources\n",
+    "\n",
+    "* [Science: Laplace Equation](https://mathworld.wolfram.com/LaplacesEquation.html)\n",
+    "* [Science: Jacobi Method](https://en.wikipedia.org/wiki/Jacobi_method)\n",
+    "* [Programming: CUDA C/C++ Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html)\n",
+    "* [Programming: NVTX Documentation](https://docs.nvidia.com/nsight-visual-studio-edition/2020.1/nvtx/index.html)\n",
+    "* [Tools: NVIDIA NSight Systems profiler](https://developer.nvidia.com/nsight-systems)\n",
+    "* [Code: Multi-GPU Programming Models](https://github.com/NVIDIA/multi-gpu-programming-models)\n",
+    "* [Code: GPU Bootcamp](https://github.com/gpuhackathons-org/gpubootcamp/)\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 23 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/Makefile

@@ -0,0 +1,23 @@
+# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+NVCC=nvcc
+CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
+NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14
+
+jacobi_memcpy: jacobi_memcpy.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_memcpy.cu -o jacobi_memcpy
+
+jacobi_streams: jacobi_streams.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_streams.cu -o jacobi_streams
+
+jacobi_streams_events: jacobi_streams_events.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_streams_events.cu -o jacobi_streams_events
+
+all: jacobi_memcpy jacobi_streams jacobi_streams_events
+
+.PHONY: clean
+clean:
+	rm -f jacobi_memcpy jacobi_streams jacobi_streams_events *.qdrep *.sqlite
+

+ 456 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_memcpy.cu

@@ -0,0 +1,456 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
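+// Check a CUDA runtime call and, on failure, print the call, file, line, and error string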
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
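+    // This is a strided tree reduction in shared memory: after log2(BLOCK_DIM_X * BLOCK_DIM_Y)
+    // doubling steps, thread 0 holds the sum of the squared residues of all threads in the block.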
+    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if (thread_index % (2 * stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+int get_parsed_vals(char** begin, char** end, int* devices,
+                    const std::string& arg, const int default_val) {
+    int numGPUs = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        numGPUs = 0;
+        std::string dev_ids(*itr);
+        size_t currpos = 0, nextpos = 0;
+        do {
+            nextpos = dev_ids.find_first_of(",", currpos);
+            // Parse the comma-separated token starting at currpos
+            devices[numGPUs] = std::stoi(dev_ids.substr(currpos, nextpos - currpos));
+            numGPUs++;
+            currpos = nextpos + 1;
+        } while (nextpos != std::string::npos);
+    } else {
+        for (int i = 0; i < numGPUs; i++) {
+            devices[i] = i;
+        }
+    }
+    return numGPUs;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
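+
+// Command-line interface (usage sketch; the binary name assumes the provided Makefile):
+//   ./jacobi_memcpy -nx 16384 -ny 16384 -niter 1000 -gpus 0,1 -p2p
+// -nx/-ny set the mesh size, -niter the iteration cap, -gpus a comma-separated list of
+// GPU IDs (defaults to all visible GPUs), and -p2p enables peer-to-peer copies.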
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+    
+    // Get GPU mapping from runtime arguments
+    int available_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&available_devices));
+    int devices[MAX_NUM_DEVICES];
+    int num_devices = get_parsed_vals(argv, argv + argc, devices, "-gpus", available_devices);
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    // Compute chunk size and allocate memory on GPUs
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            // Allocate memory on host and record single-GPU timings
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
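+        // Worked example (illustrative): ny - 2 = 10 rows on num_devices = 4 GPUs gives
+        // chunk_size_low = 2 and num_ranks_low = 4*2 + 4 - 10 = 2, so GPUs 0-1 take 2 rows
+        // each and GPUs 2-3 take 3 rows each: 2*2 + 2*3 = 10 rows in total.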
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        // Allocate memory on GPU
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on the left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            // TODO: Part 2- Check whether GPU "devices[dev_id]" can access peer "devices[top]"
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, /*Fill me*/, /*Fill me*/));
+            if (canAccessPeer) {
+            // TODO: Part 2- Enable peer access from GPU "devices[dev_id]" to "devices[top]"
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(/*Fill me*/, 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                // TODO: Part 2- Check and enable peer access from GPU "devices[dev_id]" to
+                // "devices[bottom]", whenever possible
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, /*Fill me*/, /*Fill me*/));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(/*Fill me*/, 0));
+                }
+            }
+        }
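+        // With peer access enabled, the device-to-device halo copies below can move data
+        // directly between GPU memories (e.g. over NVLink or PCIe) instead of being staged
+        // through host memory.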
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    // Share initial top and bottom local grid-point values between neighbours
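+    // (Each GPU writes its first interior row into the bottom halo row of its "top" neighbour
+    // and its last interior row into the top halo row, row 0, of its "bottom" neighbour,
+    // so the devices form a ring in the y-direction.)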
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        // Launch device kernel on each GPU
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            // TODO: Part 1- Set current GPU to be "devices[dev_id]"
+            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
+
+            CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float)));
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+	    
+            // TODO: Part 1- Call Jacobi kernel with "dim_grid" blocks in the grid and "dim_block"
+            // threads per block. The "dev_id" variable indexes the memory allocated
+            // for the current GPU.
+            jacobi_kernel<<</*Fill me*/, /*Fill me*/>>>(/*Fill me*/);
+
+            // TODO: Part 1- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h"
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, sizeof(float), /*Fill me*/));
+        }
+        // Launch async memory copy operations for halo exchange and
+        // for copying local-grid L2 norm from each GPU to host
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            
+            // TODO: Part 1- Set current GPU
+            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
+
+            // TODO: Part 1- Implement halo exchange with top neighbour "top"
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float), /*Fill me*/));
+	    
+            // TODO: Part 1- Implement halo exchange with bottom neighbour "bottom"
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float), /*Fill me*/));
+        }
+        l2_norm = 0.0;
+        // Synchronize devices and compute global L2 norm
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            // TODO: Part 1- Set current GPU and call cudaDeviceSynchronize()
+            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
+            CUDA_RT_CALL(/*Fill me*/);
+
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    // Copy computed grid back to host from each GPU
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    // Compare against single GPU execution for correctness
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d. Using GPU ID: ", num_devices);
+	for (int i = 0; i < num_devices; i++) {
+            printf("%d, ", devices[i]);
+	}
+        printf(
+	    "\n%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+       // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
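+        // (Row 0 receives the last interior row, iy_end - 1, and row iy_end receives the
+        // first interior row, iy_start, making the domain periodic in y.)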
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 451 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams.cu

@@ -0,0 +1,451 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if (thread_index % (2 * stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    // Declare compute and halo exchange streams
+    cudaStream_t compute_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_top_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_bottom_stream[MAX_NUM_DEVICES];
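+    // One compute stream plus two copy ("push") streams per GPU allows the top and bottom
+    // halo copies to overlap with each other once the compute stream's work has completed.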
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+
+    // Compute chunk size and allocate memory on GPUs
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            // Allocate memory on host and record single-GPU timings
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        // Allocate memory on GPU
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on the left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        // Create streams
+        CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
+            if (canAccessPeer) {
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    // Share initial top and bottom local grid-point values between neighbours
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        // Launch device kernel on each GPU
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            CUDA_RT_CALL(
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
+
+            // TODO: Part 3- Ensure "top" and "bottom" neighbours have shared updated halos
+            // from the previous iteration by synchronizing "push_top_stream" and
+            // "push_bottom_stream" streams. Be careful with which neighbour's top stream and
+            // which neighbour's bottom stream need to be synchronized.
+            CUDA_RT_CALL(cudaStreamSynchronize(/*Fill me*/));
+            CUDA_RT_CALL(cudaStreamSynchronize(/*Fill me*/));
+
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+
+            // TODO: Part 3- Launch Jacobi kernel on "compute_stream[dev_id]" and all other
+            // functional arguments
+            jacobi_kernel<<</*Fill me*/, /*Fill me*/, 0, /*Fill me*/>>>(/*Fill me*/);
+
+            // TODO: Part 3- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h" on
+            // "compute_stream[dev_id]"
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, sizeof(float),
+                                            /*Fill me*/, /*Fill me*/));
+        }    
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            // TODO: Part 3- Before copying the updated halos to neighbours, ensure the 
+            // computation is complete by synchronizing "compute_stream[dev_id]" stream
+            CUDA_RT_CALL(cudaStreamSynchronize(/*Fill me*/));
+
+            // Apply periodic boundary conditions
+            // TODO: Part 3- Implement halo exchange with top neighbour on current device's 
+            // "push_top_stream"
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float),
+                                         /*Fill me*/, /*Fill me*/));
+
+            // TODO: Part 3- Implement halo exchange with "bottom" neighbour on current device's 
+            // "push_bottom_stream"
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float),
+                                         /*Fill me*/, /*Fill me*/));
+        }
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    // Copy computed grid back to host from each GPU
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    // Compare against single GPU execution for correctness
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d.\n", num_devices);
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_top_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(compute_stream[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 470 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/jacobi_streams_events.cu

@@ -0,0 +1,470 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if (thread_index % (2 * stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    cudaStream_t compute_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_top_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_bottom_stream[MAX_NUM_DEVICES];
+    cudaEvent_t compute_done[MAX_NUM_DEVICES];
+    cudaEvent_t push_top_done[2][MAX_NUM_DEVICES];
+    cudaEvent_t push_bottom_done[2][MAX_NUM_DEVICES];
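+    // Two sets of push_top_done/push_bottom_done events per GPU, indexed by iteration parity
+    // (iter % 2), let a GPU wait on the previous iteration's halo copies without racing
+    // against the events recorded for the current iteration.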
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on the left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(compute_done + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[0] + dev_id));
+        CUDA_RT_CALL(
+            cudaEventCreate(push_bottom_done[0] + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[1] + dev_id));
+        CUDA_RT_CALL(
+            cudaEventCreate(push_bottom_done[1] + dev_id));
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
+            if (canAccessPeer) {
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            CUDA_RT_CALL(
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
+
+            // TODO: Part 4- Block "compute_stream" until the top and bottom halos from the
+            // neighbours have been copied to "dev_id". The "push_top_done" and "push_bottom_done"
+            // events are to be monitored for the "bottom" and "top" neighbours, respectively,
+            // for the previous iteration, denoted by "iter % 2".
+            CUDA_RT_CALL(cudaStreamWaitEvent(/*Fill me*/, /*Fill me*/, 0));
+            CUDA_RT_CALL(cudaStreamWaitEvent(/*Fill me*/, /*Fill me*/, 0));
+
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+
+            jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+
+            // TODO: Part 4- Record that Jacobi computation on "compute_stream" is done by using
+            // cudaEventRecord for "compute_done" event for "dev_id"
+            CUDA_RT_CALL(cudaEventRecord(/*Fill me*/, /*Fill me*/));
+
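+            // Copy the GPU-local L2 norm "l2_norm_d" back to the CPU "l2_norm_h" asynchronously
+            // on "compute_stream[dev_id]"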
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, sizeof(float),
+                                            /*Fill me*/, /*Fill me*/));
+        }    
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            // Apply periodic boundary conditions
+            // TODO: Part 4- Wait for the Jacobi computation of "dev_id" to complete by using the
+            // "compute_done" event on "push_top_stream" so that the top halo isn't copied to the
+            // neighbour before computation is done
+            CUDA_RT_CALL(cudaStreamWaitEvent(/*Fill me*/, /*Fill me*/, 0));
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float),
+                                         /*Fill me*/, /*Fill me*/));
+
+            // TODO: Part 4- Record completion of the top halo copy from "dev_id" to its neighbour,
+            // to be used in the next iteration: record the "push_top_done" event of "dev_id"
+            // for the next iteration, which is "(iter+1) % 2"
+            CUDA_RT_CALL(cudaEventRecord(/*Fill me*/, /*Fill me*/));
+
+            // TODO: Part 4- Wait for the Jacobi computation of "dev_id" to complete by using the
+            // "compute_done" event on "push_bottom_stream" so that the bottom halo isn't copied to
+            // the neighbour before computation is done
+            CUDA_RT_CALL(cudaStreamWaitEvent(/*Fill me*/, /*Fill me*/, 0));
+            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float),
+                                         /*Fill me*/, /*Fill me*/));
+                                         
+            // TODO: Part 4- Record completion of the bottom halo copy from "dev_id" to its neighbour,
+            // to be used in the next iteration: record the "push_bottom_done" event of "dev_id"
+            // for the next iteration, which is "(iter+1) % 2"
+            CUDA_RT_CALL(cudaEventRecord(/*Fill me*/, /*Fill me*/));
+        }
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d.\n", num_devices);
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[1][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_top_done[1][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[0][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_top_done[0][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(compute_done[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_top_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(compute_stream[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 463 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_memcpy.cu

@@ -0,0 +1,463 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+    // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+    float residue = new_val - a[iy * nx + ix];
+    // Set block-level L2 norm value for this grid point
+    block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+    block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
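+    // (Illustrative) With a 4-thread block: stride 1 adds slots 1 and 3 into slots 0 and 2;
+    // stride 2 adds slot 2 into slot 0, leaving the block-wide sum in block_l2_sum[0].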
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+    __syncthreads();
+    if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+    }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+    atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
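+// Parses the comma-separated device list passed via "-gpus".
+// Illustrative example: "-gpus 0,2,3" yields devices = {0, 2, 3} and returns 3;
+// without the flag, the identity mapping 0..default_val-1 is used.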
+int get_parsed_vals(char** begin, char** end, int* devices,
+                    const std::string& arg, const int default_val) {
+    int numGPUs = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        numGPUs = 0;
+        std::string dev_ids(*itr);
+        std::string::size_type currpos = 0, nextpos = 0;
+        do {
+            nextpos = dev_ids.find_first_of(",", currpos);
+            devices[numGPUs] = std::stoi(dev_ids.substr(currpos, nextpos - currpos));
+            numGPUs++;
+            currpos = nextpos + 1;
+        } while (nextpos != std::string::npos);
+    } else {
+        for (int i = 0; i < numGPUs; i++) {
+            devices[i] = i;
+        }
+    }
+    return numGPUs;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+    
+    // Get GPU mapping from runtime arguments
+    int available_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&available_devices));
+    int devices[MAX_NUM_DEVICES];
+    int num_devices = get_parsed_vals(argv, argv + argc, devices, "-gpus", available_devices);
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    // Compute chunk size and allocate memory on GPUs
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            // Allocate memory on host and record single-GPU timings
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
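+        // Worked example (illustrative): with ny - 2 = 10 rows across 4 devices,
+        // chunk_size_low = 2 and num_ranks_low = 4 * 2 + 4 - 10 = 2, so devices 0-1
+        // compute 2 rows each and devices 2-3 compute 3 rows each (2*2 + 2*3 = 10).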
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        // Allocate memory on GPU
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
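+        // With peer access enabled, the device-to-device halo copies between
+        // neighbouring GPUs go directly over NVLink/PCIe instead of being staged
+        // through host memory.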
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            // TODO: Part 2- Check whether GPU "devices[dev_id]" can access peer "devices[top]"
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, devices[dev_id], devices[top]));
+            if (canAccessPeer) {
+                // TODO: Part 2- Enable peer access from GPU "devices[dev_id]" to "devices[top]"
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devices[top], 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                // TODO: Part 2- Check and enable peer access from GPU "devices[dev_id]" to
+                // "devices[bottom]", whenever possible
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, devices[dev_id],
+                                                     devices[bottom]));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devices[bottom], 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    // Share initial top and bottom local grid-point values between neighbours
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+	    // Launch device kernel on each GPU
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            // TODO: Part 1- Set current GPU to be "devices[dev_id]"
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+            CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float)));
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+	    
+            // TODO: Part 1- Call the Jacobi kernel with "dim_grid" blocks in the grid and
+            // "dim_block" threads per block. The "dev_id" variable indexes the memory
+            // allocated for the current GPU.
+            jacobi_kernel<<<dim_grid, dim_block>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+
+            // TODO: Part 1- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h"
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                     cudaMemcpyDeviceToHost));
+        }
+        // Launch async memory copy operations for the halo exchanges between
+        // neighbouring GPUs (the local L2 norm copies were already queued above)
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            
+            // TODO: Part 1- Set current GPU
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+            // TODO: Part 1- Implement halo exchange with top neighbour "top"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                         cudaMemcpyDeviceToDevice));
+	    
+            // TODO: Part 1- Implement halo exchange with bottom neighbour "bottom"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice));
+        }
+        l2_norm = 0.0;
+        // Synchronize devices and compute global L2 norm
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            // TODO: Part 1- Set current GPU and call cudaDeviceSynchronize()
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+            CUDA_RT_CALL(cudaDeviceSynchronize());
+
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    // Copy computed grid back to host from each GPU
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    // Compare against single GPU execution for correctness
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d. Using GPU ID: ", num_devices);
+        for (int i = 0; i < num_devices; i++) {
+            printf("%d, ", devices[i]);
+        }
+        printf(
+            "\n%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 455 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_streams.cu

@@ -0,0 +1,455 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+    // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+    float residue = new_val - a[iy * nx + ix];
+    // Set block-level L2 norm value for this grid point
+    block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+    block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+    __syncthreads();
+    if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+    }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+    atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    // Declare compute and halo exchange streams
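+    // Each GPU gets one compute stream and two copy streams so that the top and
+    // bottom halo pushes can be issued and synchronized independently of compute.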
+    cudaStream_t compute_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_top_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_bottom_stream[MAX_NUM_DEVICES];
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+
+    // Compute chunk size and allocate memory on GPUs
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            // Allocate memory on host and record single-GPU timings
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        // Allocate memory on GPU
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        // Create streams
+        CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
+            if (canAccessPeer) {
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    // Share initial top and bottom local grid-point values between neighbours
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        // Launch device kernel on each GPU
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            CUDA_RT_CALL(
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
+
+            // TODO: Part 3- Ensure "top" and "bottom" neighbours have shared updated halos
+            // from the previous iteration by synchronizing "push_top_stream" and
+            // "push_bottom_stream" streams. Be careful with which neighbour's top stream and
+            // which neighbour's bottom stream need to be synchronized.
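+            // Note: this GPU's bottom halo (row iy_end) is written by the bottom
+            // neighbour's push_top_stream and its top halo (row 0) by the top
+            // neighbour's push_bottom_stream, hence the two waits below.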
+            CUDA_RT_CALL(cudaStreamSynchronize(push_top_stream[bottom]));
+            CUDA_RT_CALL(cudaStreamSynchronize(push_bottom_stream[top]));
+
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+
+            // TODO: Part 3- Launch the Jacobi kernel on "compute_stream[dev_id]" with the
+            // same functional arguments as before
+            jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+
+            // TODO: Part 3- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h" on
+            // "compute_stream[dev_id]"
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                     cudaMemcpyDeviceToHost, compute_stream[dev_id]));
+        }    
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            // TODO: Part 3- Before copying the updated halos to neighbours, ensure the 
+            // computation is complete by synchronizing "compute_stream[dev_id]" stream
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
+
+            // Apply periodic boundary conditions
+            // TODO: Part 3- Implement halo exchange with top neighbour on current device's 
+            // "push_top_stream"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                         cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
+
+            // TODO: Part 3- Implement halo exchange with "bottom" neighbour on current device's 
+            // "push_bottom_stream"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice,
+                                         push_bottom_stream[dev_id]));
+        }
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    // Copy computed grid back to host from each GPU
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    // Compare against single GPU execution for correctness
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d.\n", num_devices);
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_top_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(compute_stream[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 455 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/cuda/solutions/jacobi_streams_events.cu

@@ -0,0 +1,455 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+    // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+    float residue = new_val - a[iy * nx + ix];
+    // Set block-level L2 norm value for this grid point
+    block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+    block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+    __syncthreads();
+    if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+    }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+    atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    cudaStream_t compute_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_top_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_bottom_stream[MAX_NUM_DEVICES];
+    cudaEvent_t compute_done[MAX_NUM_DEVICES];
+    cudaEvent_t push_top_done[2][MAX_NUM_DEVICES];
+    cudaEvent_t push_bottom_done[2][MAX_NUM_DEVICES];
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // ny - 2 rows are distributed amongst `size` ranks in such a way
+        // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+        // This optimizes load balancing when (ny - 2) % size != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // To calculate the number of ranks that need to compute an extra row,
+        // the following formula is derived from this equation:
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);  
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(compute_done + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[0] + dev_id));
+        CUDA_RT_CALL(
+            cudaEventCreate(push_bottom_done[0] + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[1] + dev_id));
+        CUDA_RT_CALL(
+            cudaEventCreate(push_bottom_done[1] + dev_id));
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
+            if (canAccessPeer) {
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            CUDA_RT_CALL(
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
+
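+            // Events are double-buffered by iteration parity: the waits below consume
+            // the halo-push events recorded by the neighbours in the previous iteration
+            // (slot iter % 2), while this iteration's pushes record into slot (iter + 1) % 2.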
+            CUDA_RT_CALL(
+                cudaStreamWaitEvent(compute_stream[dev_id], push_top_done[(iter % 2)][bottom], 0));
+            CUDA_RT_CALL(
+                cudaStreamWaitEvent(compute_stream[dev_id], push_bottom_done[(iter % 2)][top], 0));
+
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+
+            jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+
+            CUDA_RT_CALL(cudaEventRecord(compute_done[dev_id], compute_stream[dev_id]));
+
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                     cudaMemcpyDeviceToHost, compute_stream[dev_id]));
+        }    
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            // Apply periodic boundary conditions
+            CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream[dev_id], compute_done[dev_id], 0));
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                         cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
+            CUDA_RT_CALL(
+                cudaEventRecord(push_top_done[((iter + 1) % 2)][dev_id], push_top_stream[dev_id]));
+
+            CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream[dev_id], compute_done[dev_id], 0));
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice,
+                                         push_bottom_stream[dev_id]));
+            CUDA_RT_CALL(cudaEventRecord(push_bottom_done[((iter + 1) % 2)][dev_id],
+                                         push_bottom_stream[dev_id]));
+        }
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d.\n", num_devices);
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[1][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_top_done[1][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[0][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_top_done[0][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(compute_done[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_top_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(compute_stream[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
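+    // Ceiling division so the 2D grid fully covers the nx x ny mesh even when the mesh
+    // dimensions are not multiples of BLOCK_DIM_X and BLOCK_DIM_Y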
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
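+        // (copy the last interior row into the top boundary row and the first interior
+        // row into the bottom boundary row)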
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 0 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/.gitkeep


+ 30 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/Makefile

@@ -0,0 +1,30 @@
+# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+NVCC=nvcc
+MPICXX=mpicxx
+#CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
+
+NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
+MPICXX_FLAGS = -g -I$(CUDA_HOME)/include  -fopenmp -std=c++14
+LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
+
+hello_world: Makefile hello_world.c
+	$(MPICXX) $(MPICXX_FLAGS) hello_world.c $(LD_FLAGS) -o hello_world
+
+jacobi_memcpy_mpi: Makefile jacobi_memcpy_mpi.cpp jacobi_kernels.o
+	$(MPICXX) $(MPICXX_FLAGS) jacobi_memcpy_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_memcpy_mpi
+
+jacobi_cuda_aware_mpi: Makefile jacobi_cuda_aware_mpi.cpp jacobi_kernels.o
+	$(MPICXX) $(MPICXX_FLAGS) jacobi_cuda_aware_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_cuda_aware_mpi
+
+jacobi_kernels.o: Makefile jacobi_kernels.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
+
+all: hello_world jacobi_memcpy_mpi jacobi_cuda_aware_mpi
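+
+# Example usage (illustrative): "make all" builds every target; a binary can then be run
+# with, e.g., "mpirun -np 2 ./jacobi_cuda_aware_mpi -niter 1000".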
+
+.PHONY: clean
+clean:
+	rm -rf hello_world jacobi_memcpy_mpi jacobi_cuda_aware_mpi *.o *.qdrep *.sqlite
+

+ 24 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/Makefile

@@ -0,0 +1,24 @@
+# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+NVCC=nvcc
+MPICXX=mpicxx
+#CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
+
+NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
+MPICXX_FLAGS = -g -I$(CUDA_HOME)/include  -fopenmp -std=c++14
+LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
+
+jacobi_cuda_aware_mpi: Makefile jacobi_cuda_aware_mpi.cpp jacobi_kernels.o
+	$(MPICXX) $(MPICXX_FLAGS) jacobi_cuda_aware_mpi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_cuda_aware_mpi
+
+jacobi_kernels.o: Makefile jacobi_kernels.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
+
+all: jacobi_cuda_aware_mpi
+
+.PHONY: clean
+clean:
+	rm -rf jacobi_cuda_aware_mpi *.o *.qdrep *.sqlite
+

+ 358 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/jacobi_cuda_aware_mpi.cpp

@@ -0,0 +1,358 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
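+    // Bind this rank to one GPU on its node (local_rank wraps around if there are more
+    // ranks than GPUs) and call cudaFree(0) so the CUDA context is created here rather
+    // than inside the timed region.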
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
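+    // Illustrative example: with ny = 11 and size = 4 there are ny - 2 = 9 interior rows,
+    // chunk_size_low = 2, and num_ranks_low = 4 * 2 + 4 - 9 = 3, so ranks 0-2 compute 2 rows
+    // each and rank 3 computes 3 rows (3 * 2 + 1 * 3 = 9).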
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // Top halo exchange with CUDA-aware MPI: device pointers are passed directly to
+        // MPI_Sendrecv, sending the first interior row to `top` and receiving the bottom
+        // halo row from `bottom`
+        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_FLOAT, top, 0,
+                              a_new + (iy_end * nx), nx, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // Bottom halo exchange with CUDA-aware MPI, again using device buffers directly
+        MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_FLOAT, bottom, 0, a_new, nx,
+                              MPI_FLOAT, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        // Reduce the rank-local L2 norm to a global L2 norm across all ranks
+        MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 97 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/containerization/jacobi_kernels.cu

@@ -0,0 +1,97 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cstdio>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
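+    // Interleaved tree reduction: each pass halves the number of active threads and assumes
+    // BLOCK_DIM_X * BLOCK_DIM_Y (32 * 32 = 1024 here) is a power of two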
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                                    const int nx, const int my_ny, const int ny){
+    initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
+}
+
+void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                  ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm, iy_start, iy_end, nx);
+}

+ 27 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/hello_world.c

@@ -0,0 +1,27 @@
+#include <mpi.h>
+#include <stdio.h>
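+
+// Illustrative usage: build with "make hello_world" and launch with, e.g.,
+// "mpirun -np 4 ./hello_world"; the exact MPI launcher may differ on your system.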
+
+int main(int argc, char** argv) {
+    // Initialize the MPI environment
+    MPI_Init(NULL, NULL);
+
+    // Get the number of processes
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // Get the rank of the process
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    // Get the name of the processor
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+    int name_len;
+    MPI_Get_processor_name(processor_name, &name_len);
+
+    // Print a hello world message
+    printf("Hello world from processor %s, rank %d out of %d processors\n",
+           processor_name, rank, size);
+
+    // Finalize the MPI environment.
+    MPI_Finalize();
+
+    return 0;
+}

+ 361 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_cuda_aware_mpi.cpp

@@ -0,0 +1,361 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- Implement top halo exchange. Use only GPU buffers in the MPI call's 
+        // function arguments.
+        MPI_CALL(MPI_Sendrecv(/*Fill me*/, nx, MPI_FLOAT, /*Fill me*/, 0,
+                              /*Fill me*/, nx, MPI_FLOAT, /*Fill me*/, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- Implement bottom halo exchange. Use only GPU buffers in the MPI call's 
+        // function arguments.
+        MPI_CALL(MPI_Sendrecv(/*Fill me*/, nx, MPI_FLOAT, /*Fill me*/, 0, 
+                                /*Fill me*/, nx, MPI_FLOAT, /*Fill me*/, 0, MPI_COMM_WORLD, 
+                                MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        // TODO: Part 2- Reduce the rank-local L2 Norm to a global L2 norm
+        MPI_CALL(MPI_Allreduce(/*Fill me*/, /*Fill me*/, 1, MPI_FLOAT, /*Fill me*/, MPI_COMM_WORLD));
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 97 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_kernels.cu

@@ -0,0 +1,97 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cstdio>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                                    const int nx, const int my_ny, const int ny){
+    initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
+}
+
+void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                  ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm, iy_start, iy_end, nx);
+}

+ 375 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/jacobi_memcpy_mpi.cpp

@@ -0,0 +1,375 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    // TODO: Part 1- Obtain the node-level local rank by splitting the global communicator
+    // Free the local communicator after its use
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(/*Fill me*/));
+
+    MPI_CALL(MPI_Comm_rank(/*Fill me*/));
+
+    MPI_CALL(MPI_Comm_free(/*Fill me*/));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    float* top_halo_buf;
+    CUDA_RT_CALL(cudaMallocHost(&top_halo_buf, nx * sizeof(float)));
+    float* bot_halo_buf;
+    CUDA_RT_CALL(cudaMallocHost(&bot_halo_buf, nx * sizeof(float)));
+
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+
+        nvtxRangePush("Halo exchange Memcpy+MPI");
+        // First set of halo exchanges
+        CUDA_RT_CALL(cudaMemcpy(top_halo_buf, a_new + (iy_start * nx), nx * sizeof(float), 
+                                cudaMemcpyDeviceToHost));
+        // TODO: Part 1- Implement the first set of halo exchanges using MPI_Sendrecv as explained
+        // in the Jupyter Notebook. Observe the Memcpy operations above and below this comment.
+        MPI_CALL(MPI_Sendrecv(/*Fill me*/, nx, MPI_FLOAT, /*Fill me*/, 0,
+                              /*Fill me*/, nx, MPI_FLOAT, /*Fill me*/, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        CUDA_RT_CALL(cudaMemcpy(a_new + (iy_end * nx), bot_halo_buf, nx * sizeof(float), 
+                                cudaMemcpyHostToDevice));
+        nvtxRangePop();                        
+
+        nvtxRangePush("Halo exchange Memcpy+MPI");
+        // Second set of halo exchanges
+        // TODO: Part 1- Implement the Memcpy operations and MPI calls for the second set of
+        // halo exchanges
+        CUDA_RT_CALL(cudaMemcpy(/*Fill me*/, /*Fill me*/, nx * sizeof(float), /*Fill me*/));
+        MPI_CALL(MPI_Sendrecv(/*Fill me*/, nx, MPI_FLOAT, /*Fill me*/, 0, 
+                                /*Fill me*/, nx, MPI_FLOAT, /*Fill me*/, 0, MPI_COMM_WORLD, 
+                                MPI_STATUS_IGNORE));
+        CUDA_RT_CALL(cudaMemcpy(/*Fill me*/, /*Fill me*/, nx * sizeof(float), /*Fill me*/));
+        nvtxRangePop();                        
+
+        // TODO: Part 1- Reduce the rank-local L2 Norm to a global L2 norm using MPI_Allreduce
+        MPI_CALL(MPI_Allreduce(/*Fill me*/, /*Fill me*/, 1, MPI_FLOAT, /*Fill me*/, MPI_COMM_WORLD));
+        
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right border
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 358 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/solutions/jacobi_cuda_aware_mpi.cpp

@@ -0,0 +1,358 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // num_ranks_low ranks get the smaller chunk; the remaining (size - num_ranks_low)
+    // ranks compute one extra row each. The value follows from solving:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right border
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
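+        // With CUDA-aware MPI the device pointers into a_new are passed to MPI_Sendrecv
+        // directly; no staging copies through pinned host buffers are needed.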
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- First set of halo exchanges: pass the device pointers directly to MPI_Sendrecv
+        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_FLOAT, top, 0,
+                              a_new + (iy_end * nx), nx, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        nvtxRangePush("Halo exchange CUDA-aware MPI");
+        // TODO: Part 2- Second set of halo exchanges, again operating directly on device memory
+        MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_FLOAT, bottom, 0, a_new, nx,
+                              MPI_FLOAT, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+        nvtxRangePop(); 
+
+        // TODO: Part 2- Reduce the rank-local L2 norm to a global L2 norm using MPI_Allreduce
+        MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+        l2_norm = std::sqrt(l2_norm);
+        
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right border
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 378 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/mpi/solutions/jacobi_memcpy_mpi.cpp

@@ -0,0 +1,378 @@
+/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                  const float pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                          float* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+    
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    // TODO: Part 1- Obtain the node-level local rank by splitting the global communicator
+    // Make sure to free the local communicator after its use
+    MPI_Comm local_comm;
+    MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                    &local_comm));
+
+    MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+    MPI_CALL(MPI_Comm_free(&local_comm));
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank % num_devices));
+    CUDA_RT_CALL(cudaFree(0));
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    float* top_halo_buf;
+    CUDA_RT_CALL(cudaMallocHost(&top_halo_buf, nx * sizeof(float)));
+    float* bot_halo_buf;
+    CUDA_RT_CALL(cudaMallocHost(&bot_halo_buf, nx * sizeof(float)));
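+    // Pinned host buffers that stage the top and bottom halo rows between
+    // cudaMemcpy and MPI_Sendrecv in the Memcpy+MPI halo exchange below.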
+
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // num_ranks_low ranks get the smaller chunk; the remaining (size - num_ranks_low)
+    // ranks compute one extra row each. The value follows from solving:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right border
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+
+        nvtxRangePush("Halo exchange Memcpy+MPI");
+        // First set of halo exchanges
+        CUDA_RT_CALL(cudaMemcpy(top_halo_buf, a_new + (iy_start * nx), nx * sizeof(float), 
+                                cudaMemcpyDeviceToHost));
+        // TODO: Part 1- Implement the first set of halo exchanges using MPI_Sendrecv as explained
+        // in the Jupyter Notebook. Observe the Memcpy operations above and below this comment
+        MPI_CALL(MPI_Sendrecv(top_halo_buf, nx, MPI_FLOAT, top, 0,
+                              bot_halo_buf, nx, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD,
+                              MPI_STATUS_IGNORE));
+        CUDA_RT_CALL(cudaMemcpy(a_new + (iy_end * nx), bot_halo_buf, nx * sizeof(float), 
+                                cudaMemcpyHostToDevice));
+        nvtxRangePop();                        
+
+        nvtxRangePush("Halo exchange Memcpy+MPI");
+        // Second set of halo exchanges
+        // TODO: Part 1- Implement the Memcpy operations and MPI calls for the second set of
+        // halo exchanges
+        CUDA_RT_CALL(cudaMemcpy(bot_halo_buf, a_new + (iy_end - 1) * nx, nx * sizeof(float), 
+                                cudaMemcpyDeviceToHost));
+        MPI_CALL(MPI_Sendrecv(bot_halo_buf, nx, MPI_FLOAT, bottom, 0, 
+                                top_halo_buf, nx, MPI_FLOAT, top, 0, MPI_COMM_WORLD, 
+                                MPI_STATUS_IGNORE));
+        CUDA_RT_CALL(cudaMemcpy(a_new, top_halo_buf, nx * sizeof(float), 
+                                cudaMemcpyHostToDevice));
+        nvtxRangePop();                        
+
+        // TODO: Part 1- Reduce the rank-local L2 Norm to a global L2 norm using MPI_Allreduce
+        MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+        
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right border
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 0 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/.gitkeep


+ 24 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/Makefile

@@ -0,0 +1,24 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+NVCC=nvcc
+MPICXX=mpicxx
+MPIRUN ?= mpirun
+#CUDA_HOME ?= /usr/local/cuda
+#NCCL_HOME ?= /usr/nccl/
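+# Example usage (one MPI rank per GPU, values illustrative):
+#   make && mpirun -np <num_gpus> ./jacobi_nccl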
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
+
+NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14
+MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -fopenmp -std=c++14
+LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt -lnccl
+
+jacobi_nccl: Makefile jacobi_nccl.cpp jacobi_kernels.o
+	$(MPICXX) $(MPICXX_FLAGS) jacobi_nccl.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi_nccl
+
+jacobi_kernels.o: Makefile jacobi_kernels.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c
+
+.PHONY: clean
+clean:
+	rm -rf jacobi_nccl jacobi_kernels.o *.qdrep *.sqlite
+

+ 488 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/jacobi.cpp

@@ -0,0 +1,488 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <mpi.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#include <cuda_runtime.h>
+
+#ifdef USE_NVTX
+#include <nvToolsExt.h>
+
+const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff,
+                           0x0000ffff, 0x00ff0000, 0x00ffffff};
+const int num_colors = sizeof(colors) / sizeof(uint32_t);
+
+#define PUSH_RANGE(name, cid)                              \
+    {                                                      \
+        int color_id = cid;                                \
+        color_id = color_id % num_colors;                  \
+        nvtxEventAttributes_t eventAttrib = {0};           \
+        eventAttrib.version = NVTX_VERSION;                \
+        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;  \
+        eventAttrib.colorType = NVTX_COLOR_ARGB;           \
+        eventAttrib.color = colors[color_id];              \
+        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+        eventAttrib.message.ascii = name;                  \
+        nvtxRangePushEx(&eventAttrib);                     \
+    }
+#define POP_RANGE nvtxRangePop();
+#else
+#define PUSH_RANGE(name, cid)
+#define POP_RANGE
+#endif
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+#include <nccl.h>
+
+#define NCCL_CALL(call)                                                                     \
+    {                                                                                       \
+        ncclResult_t  ncclStatus = call;                                                    \
+        if (ncclSuccess != ncclStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: NCCL call \"%s\" in line %d of file %s failed "                 \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, ncclGetErrorString(ncclStatus), ncclStatus); \
+    }
+
+#ifdef USE_DOUBLE
+typedef double real;
+#define MPI_REAL_TYPE MPI_DOUBLE
+#define NCCL_REAL_TYPE ncclDouble
+#else
+typedef float real;
+#define MPI_REAL_TYPE MPI_FLOAT
+#define NCCL_REAL_TYPE ncclFloat
+#endif
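+// `real` together with MPI_REAL_TYPE and NCCL_REAL_TYPE keeps the element type consistent
+// across the CUDA kernels, the MPI all-reduce, and the NCCL send/recv calls.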
+
+constexpr real tol = 1.0e-8;
+
+const real PI = 2.0 * std::asin(1.0);
+
+void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
+                                  const real pi, const int offset, const int nx, const int my_ny,
+                                  const int ny);
+
+void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
+                          real* __restrict__ const l2_norm, const int iy_start, const int iy_end,
+                          const int nx, const bool calculate_norm, cudaStream_t stream);
+
+double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
+                  const int nccheck, const bool print);
+
+template <typename T>
+T get_argval(char** begin, char** end, const std::string& arg, const T default_val) {
+    T argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+
+    ncclUniqueId nccl_uid;
+    if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid));
+    MPI_CALL(MPI_Bcast(&nccl_uid, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD));
+
+    const int iter_max = get_argval<int>(argv, argv + argc, "-niter", 1000);
+    const int nccheck = get_argval<int>(argv, argv + argc, "-nccheck", 1);
+    const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
+    const bool csv = get_arg(argv, argv + argc, "-csv");
+
+    int local_rank = -1;
+    {
+        MPI_Comm local_comm;
+        MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                     &local_comm));
+
+        MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+        MPI_CALL(MPI_Comm_free(&local_comm));
+    }
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    CUDA_RT_CALL(cudaFree(0));
+
+    ncclComm_t nccl_comm;
+    NCCL_CALL(ncclCommInitRank(&nccl_comm, size, nccl_uid, rank));
+    int nccl_version = 0;
+    NCCL_CALL(ncclGetVersion(&nccl_version));
+    if ( nccl_version < 2800 ) {
+        fprintf(stderr,"ERROR NCCL 2.8 or newer is required.\n");
+        NCCL_CALL(ncclCommDestroy(nccl_comm));
+        MPI_CALL(MPI_Finalize());
+        return 1;
+    }
+
+    real* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
+    real* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real)));
+    double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank));
+
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // num_ranks_low ranks get the smaller chunk; the remaining (size - num_ranks_low)
+    // ranks compute one extra row each. The value follows from solving:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    real* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
+    real* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on the left and right border
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    cudaStream_t compute_stream;
+    CUDA_RT_CALL(cudaStreamCreate(&compute_stream));
+    cudaEvent_t compute_done;
+    CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming));
+
+    real* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real)));
+    real* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real)));
+
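+    // Warm-up halo exchanges so NCCL connection setup happens outside the timed solver loop.
+    // ncclGroupStart/ncclGroupEnd group the send/recv pairs so they are issued together on
+    // compute_stream and matching peers cannot deadlock waiting on each other.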
+    PUSH_RANGE("NCCL_Warmup", 5)
+    for (int i = 0; i < 10; ++i) {
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+        NCCL_CALL(ncclGroupStart());
+        NCCL_CALL(ncclRecv(a_new,                     nx, NCCL_REAL_TYPE, top,    nccl_comm, compute_stream));
+        NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
+        NCCL_CALL(ncclRecv(a_new + (iy_end * nx),     nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
+        NCCL_CALL(ncclSend(a_new + iy_start * nx,     nx, NCCL_REAL_TYPE, top,    nccl_comm, compute_stream));
+        NCCL_CALL(ncclGroupEnd());
+        CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
+    }
+    POP_RANGE
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (!csv && 0 == rank) {
+        printf(
+            "Jacobi relaxation: %d iterations on %d x %d mesh with norm check "
+            "every %d iterations\n",
+            iter_max, nx, ny, nccheck);
+    }
+
+    int iter = 0;
+    real l2_norm = 1.0;
+    bool calculate_norm;  // boolean to store whether l2 norm will be calculated in
+                          //   an iteration or not
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    PUSH_RANGE("Jacobi solve", 0)
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream));
+
+        calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm,
+                             compute_stream);
+        CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream));
+
+        if (calculate_norm) {
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
+                                         compute_stream));
+        }
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // Apply periodic boundary conditions
+        PUSH_RANGE("NCCL_LAUNCH", 5)
+        NCCL_CALL(ncclGroupStart());
+        NCCL_CALL(ncclRecv(a_new,                     nx, NCCL_REAL_TYPE, top,    nccl_comm, compute_stream));
+        NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
+        NCCL_CALL(ncclRecv(a_new + (iy_end * nx),     nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
+        NCCL_CALL(ncclSend(a_new + iy_start * nx,     nx, NCCL_REAL_TYPE, top,    nccl_comm, compute_stream));
+        NCCL_CALL(ncclGroupEnd());
+        POP_RANGE
+
+        if (calculate_norm) {
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
+            MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD));
+            l2_norm = std::sqrt(l2_norm);
+
+            if (!csv && 0 == rank && (iter % 100) == 0) {
+                printf("%5d, %0.6f\n", iter, l2_norm);
+            }
+        }
+
+        std::swap(a_new, a);
+        iter++;
+    }
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    double stop = MPI_Wtime();
+    POP_RANGE
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+
+    int global_result_correct = 1;
+    MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                           MPI_COMM_WORLD));
+    result_correct = global_result_correct;
+
+    if (rank == 0 && result_correct) {
+        if (csv) {
+            printf("nccl, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+                   (stop - start), runtime_serial);
+        } else {
+            printf("Num GPUs: %d.\n", size);
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+    }
+    CUDA_RT_CALL(cudaEventDestroy(compute_done));
+    CUDA_RT_CALL(cudaStreamDestroy(compute_stream));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    NCCL_CALL(ncclCommDestroy(nccl_comm));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
+                  const int nccheck, const bool print) {
+    real* a;
+    real* a_new;
+
+    cudaStream_t compute_stream;
+    cudaStream_t push_top_stream;
+    cudaStream_t push_bottom_stream;
+    cudaEvent_t compute_done;
+    cudaEvent_t push_top_done;
+    cudaEvent_t push_bottom_done;
+
+    real* l2_norm_d;
+    real* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    CUDA_RT_CALL(cudaStreamCreate(&compute_stream));
+    CUDA_RT_CALL(cudaStreamCreate(&push_top_stream));
+    CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream));
+    CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming));
+    CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming));
+    CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming));
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print)
+        printf(
+            "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with "
+            "norm "
+            "check every %d iterations\n",
+            iter_max, nx, ny, nccheck);
+
+    int iter = 0;
+    real l2_norm = 1.0;
+    bool calculate_norm;
+
+    double start = MPI_Wtime();
+    PUSH_RANGE("Jacobi solve", 0)
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream));
+
+        CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0));
+        CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0));
+
+        calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0;
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm,
+                             compute_stream);
+        CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream));
+
+        if (calculate_norm) {
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
+                                         compute_stream));
+        }
+
+        // Apply periodic boundary conditions
+
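+        // The top and bottom halo copies are issued on dedicated streams that wait on
+        // compute_done, so they can overlap with the norm transfer on compute_stream;
+        // the next iteration's kernel in turn waits on push_top_done/push_bottom_done.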
+        CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0));
+        CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real),
+                                     cudaMemcpyDeviceToDevice, push_top_stream));
+        CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream));
+
+        CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0));
+        CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real),
+                                     cudaMemcpyDeviceToDevice, push_bottom_stream));
+        CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream));
+
+        if (calculate_norm) {
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
+            l2_norm = *l2_norm_h;
+            l2_norm = std::sqrt(l2_norm);
+            if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+        iter++;
+    }
+    POP_RANGE
+    double stop = MPI_Wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaEventDestroy(push_bottom_done));
+    CUDA_RT_CALL(cudaEventDestroy(push_top_done));
+    CUDA_RT_CALL(cudaEventDestroy(compute_done));
+    CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream));
+    CUDA_RT_CALL(cudaStreamDestroy(push_top_stream));
+    CUDA_RT_CALL(cudaStreamDestroy(compute_stream));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 98 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/jacobi_kernels.cu

@@ -0,0 +1,98 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cstdio>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
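+    // This is a tree reduction with interleaved addressing: on each pass, threads whose
+    // index is a multiple of 2*stride add in their partner's partial sum, leaving the
+    // block total in block_l2_sum[0] after log2(BLOCK_DIM_X*BLOCK_DIM_Y) passes.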
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                                    const int nx, const int my_ny, const int ny) {
+    initialize_boundaries<<<my_ny / 128 + 1, 128>>>(a_new, a, pi, offset, nx, my_ny, ny);
+}
+
+void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx, cudaStream_t stream) {
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                  ((iy_end - iy_start) + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    jacobi_kernel<<<dim_grid, dim_block, 0, stream>>>(a_new, a, l2_norm, iy_start, iy_end, nx);
+}
+

+ 406 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/jacobi_nccl.cpp

@@ -0,0 +1,406 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+#include <nccl.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+#define NCCL_CALL(call)                                                                     \
+    {                                                                                       \
+        ncclResult_t  ncclStatus = call;                                                    \
+        if (ncclSuccess != ncclStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: NCCL call \"%s\" in line %d of file %s failed "                 \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, ncclGetErrorString(ncclStatus), ncclStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx, cudaStream_t stream);
+
+void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                                    const int nx, const int my_ny, const int ny);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+
+    ncclUniqueId nccl_uid;
+    if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid));
+    MPI_CALL(MPI_Bcast(&nccl_uid, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD));
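+    // Bootstrap pattern: rank 0 creates the NCCL unique ID and MPI broadcasts it, so
+    // every rank can join the same communicator in ncclCommInitRank() below.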
+
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    {
+        MPI_Comm local_comm;
+        MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                     &local_comm));
+
+        MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+        MPI_CALL(MPI_Comm_free(&local_comm));
+    }
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    CUDA_RT_CALL(cudaFree(0));
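+    // cudaFree(0) is a no-op allocation used to force the CUDA runtime to create its
+    // context on the device selected above before any NCCL or MPI-related CUDA activity.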
+
+    ncclComm_t nccl_comm;
+    NCCL_CALL(ncclCommInitRank(&nccl_comm, size, nccl_uid, rank));
+
+
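+    // The halo exchange below relies on the point-to-point ncclSend/ncclRecv API, so the
+    // code enforces NCCL 2.8 or newer here.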
+    int nccl_version = 0;
+    NCCL_CALL(ncclGetVersion(&nccl_version));
+    if ( nccl_version < 2800 ) {
+        fprintf(stderr,"ERROR NCCL 2.8 or newer is required.\n");
+        NCCL_CALL(ncclCommDestroy(nccl_comm));
+        MPI_CALL(MPI_Finalize());
+        return 1;
+    }
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
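+    // Worked example (for illustration only): with ny = 16384 and size = 3,
+    // chunk_size_low = 16382 / 3 = 5460 and num_ranks_low = 3 * 5460 + 3 - 16382 = 1,
+    // so one rank computes 5460 rows and two ranks compute 5461 rows
+    // (5460 + 2 * 5461 = 16382 = ny - 2).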
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on left and right border
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_global_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_global_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, 0);
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
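+        // Ranks are laid out periodically in y: for example, with 4 ranks, rank 0
+        // exchanges halos with top = 3 and bottom = 1.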
+
+        // TODO: Reduce the device-local L2 norm, "l2_norm_d" to the global L2 norm on each device,
+        // "l2_global_norm_d", using ncclAllReduce() function. Use "ncclSum" as the reduction operation.
+        // Make sure to encapsulate this function call within NCCL group calls.
+        // Use "0" in the stream parameter function argument.
+        NCCL_CALL(/*Fill me*/);
+        NCCL_CALL(ncclAllReduce(/*Fill me*/, /*Fill me*/, 1, ncclFloat, /*Fill me*/, nccl_comm, 0));
+        NCCL_CALL(/*Fill me*/);
+
+        // TODO: Transfer the global L2 norm from each device to the host using cudaMemcpyAsync
+        CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, sizeof(float), /*Fill me*/));
+
+        // Apply periodic boundary conditions
+        NCCL_CALL(ncclGroupStart());
+        
+        //TODO: Perform the first set of halo exchanges by:
+        // 1. Receiving the top halo from the "top" neighbour into the "a_new" device memory array location. 
+        // 2. Sending current device's bottom halo to "bottom" neighbour from the "a_new + (iy_end - 1) * nx"
+        //    device memory array location.
+        // Use "0" in the stream parameter function argument.
+        NCCL_CALL(ncclRecv(/*Fill me*/, nx, ncclFloat, /*Fill me*/, nccl_comm, 0));
+        NCCL_CALL(ncclSend(/*Fill me*/, nx, ncclFloat, /*Fill me*/, nccl_comm, 0));
+
+        //TODO: Perform the second set of halo exchanges by:
+        // 1. Receiving the bottom halo from the "bottom" neighbour into the "a_new + (iy_end * nx)" 
+        //    device memory array location. 
+        // 2. Sending current device's top halo to "top" neighbour from the "a_new + iy_start * nx"
+        //    device memory array location.
+        // Use "0" in the stream parameter function argument.
+        NCCL_CALL(ncclRecv(/*Fill me*/, nx, ncclFloat, /*Fill me*/, nccl_comm, 0));
+        NCCL_CALL(ncclSend(/*Fill me*/, nx, ncclFloat, /*Fill me*/, nccl_comm, 0));
+
+        NCCL_CALL(ncclGroupEnd());
+
+        // TODO: Synchronize the device before computing the global L2 norm on host for printing
+        CUDA_RT_CALL(/*Fill me*/);
+
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    NCCL_CALL(ncclCommDestroy(nccl_comm));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, 0);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 407 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nccl/solution/jacobi_nccl.cpp

@@ -0,0 +1,407 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+#include <mpi.h>
+#include <omp.h>
+#include <nccl.h>
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+#define NCCL_CALL(call)                                                                     \
+    {                                                                                       \
+        ncclResult_t  ncclStatus = call;                                                    \
+        if (ncclSuccess != ncclStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: NCCL call \"%s\" in line %d of file %s failed "                 \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, ncclGetErrorString(ncclStatus), ncclStatus); \
+    }
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+void launch_jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx, cudaStream_t stream);
+
+void launch_initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                                    const int nx, const int my_ny, const int ny);
+
+double single_gpu(const int nx, const int ny, const int iter_max, 
+                    float* const a_ref_h, bool print);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+int main(int argc, char* argv[]) {
+    MPI_CALL(MPI_Init(&argc, &argv));
+    int rank;
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    int size;
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+
+    ncclUniqueId nccl_uid;
+    if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid));
+    MPI_CALL(MPI_Bcast(&nccl_uid, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD));
+
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    int local_rank = -1;
+    {
+        MPI_Comm local_comm;
+        MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                                     &local_comm));
+
+        MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+
+        MPI_CALL(MPI_Comm_free(&local_comm));
+    }
+
+    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    CUDA_RT_CALL(cudaFree(0));
+
+    ncclComm_t nccl_comm;
+    NCCL_CALL(ncclCommInitRank(&nccl_comm, size, nccl_uid, rank));
+
+
+    int nccl_version = 0;
+    NCCL_CALL(ncclGetVersion(&nccl_version));
+    if ( nccl_version < 2800 ) {
+        fprintf(stderr,"ERROR NCCL 2.8 or newer is required.\n");
+        NCCL_CALL(ncclCommDestroy(nccl_comm));
+        MPI_CALL(MPI_Finalize());
+        return 1;
+    }
+
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    float* a_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    
+    double runtime_serial = 1;
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, rank == 0);
+    }
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / size;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = size * chunk_size_low + size -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (rank < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    float* a;
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(float)));
+    float* a_new;
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float)));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (rank < num_ranks_low) {
+        iy_start_global = rank * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+
+    int iy_start = 1;
+    int iy_end = iy_start + chunk_size;
+
+    // Set Dirichlet boundary conditions on left and right border
+    launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny);
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    float* l2_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    float* l2_global_norm_d;
+    CUDA_RT_CALL(cudaMalloc(&l2_global_norm_d, sizeof(float)));
+    float* l2_norm_h;
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (0 == rank) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(float)));
+
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, 0);
+
+        const int top = rank > 0 ? rank - 1 : (size - 1);
+        const int bottom = (rank + 1) % size;
+
+        // TODO: Reduce the device-local L2 norm, "l2_norm_d" to the global L2 norm on each device,
+        // "l2_global_norm_d", using ncclAllReduce() function. Use "ncclSum" as the reduction operation.
+        // Make sure to encapsulate this function call within NCCL group calls.
+        // Use "0" in the stream parameter function argument.
+        NCCL_CALL(ncclGroupStart());
+        NCCL_CALL(ncclAllReduce(l2_norm_d, l2_global_norm_d, 1, ncclFloat, ncclSum, nccl_comm, 
+                                    0));
+        NCCL_CALL(ncclGroupEnd());
+
+        // TODO: Transfer the global L2 norm from each device to the host using cudaMemcpyAsync
+        CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_global_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        NCCL_CALL(ncclGroupStart());
+        
+        //TODO: Perform the first set of halo exchanges by:
+        // 1. Receiving the top halo from the "top" neighbour into the "a_new" device memory array location. 
+        // 2. Sending current device's bottom halo to "bottom" neighbour from the "a_new + (iy_end - 1) * nx"
+        //    device memory array location.
+        // Use "0" in the stream parameter function argument.
+        NCCL_CALL(ncclRecv(a_new,                     nx, ncclFloat, top,    nccl_comm, 0));
+        NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, ncclFloat, bottom, nccl_comm, 0));
+
+        //TODO: Perform the second set of halo exchanges by:
+        // 1. Receiving the bottom halo from the "bottom" neighbour into the "a_new + (iy_end * nx)" 
+        //    device memory array location. 
+        // 2. Sending current device's top halo to "top" neighbour from the "a_new + iy_start * nx"
+        //    device memory array location.
+        // Use "0" in the stream parameter function argument.
+        NCCL_CALL(ncclRecv(a_new + (iy_end * nx),     nx, ncclFloat, bottom, nccl_comm, 0));
+        NCCL_CALL(ncclSend(a_new + iy_start * nx,     nx, ncclFloat, top,    nccl_comm, 0));
+
+        NCCL_CALL(ncclGroupEnd());
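+        // NCCL requires point-to-point operations that must progress concurrently to be
+        // enclosed in a ncclGroupStart()/ncclGroupEnd() pair; fusing all four calls here
+        // lets the halo exchange complete without send/recv ordering deadlocks.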
+
+        // TODO: Synchronize the device before computing the global L2 norm on host for printing
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if (0 == rank && (iter % 100) == 0) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    int result_correct = 1;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = 0;
+                }
+            }
+        }
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                            MPI_COMM_WORLD));
+        result_correct = global_result_correct;
+    }
+
+    if (rank == 0 && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    NCCL_CALL(ncclCommDestroy(nccl_comm));
+
+    MPI_CALL(MPI_Finalize());
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h, bool print) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print) {
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, 0);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0 && print) {
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 0 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/.gitkeep


+ 29 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/Makefile

@@ -0,0 +1,29 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+NP ?= 1
+NVCC=nvcc
+MPIRUN ?= mpirun
+CUDA_HOME ?= /usr/local/cuda
+ifndef NVSHMEM_HOME
+$(error NVSHMEM_HOME is not set)
+endif
+ifndef MPI_HOME
+$(error MPI_HOME is not set)
+endif
+GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
+GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
+GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
+
+NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include
+NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt
+
+left_shift: Makefile left_shift.cu
+	$(NVCC) $(NVCC_FLAGS) left_shift.cu -c -o left_shift.o
+	$(NVCC) $(GENCODE_FLAGS) left_shift.o -o left_shift $(NVCC_LDFLAGS)
+
+jacobi_nvshmem: Makefile jacobi_nvshmem.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_nvshmem.cu -c -o jacobi_nvshmem.o
+	$(NVCC) $(GENCODE_FLAGS) jacobi_nvshmem.o -o jacobi_nvshmem $(NVCC_LDFLAGS)
+
+.PHONY: clean
+clean:
+	rm -rf jacobi_nvshmem left_shift *.o *.qdrep *.sqlite
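+
+# Example usage (illustrative; adjust NVSHMEM_HOME and MPI_HOME to your installation):
+#   make NVSHMEM_HOME=/opt/nvshmem MPI_HOME=/usr/local/openmpi jacobi_nvshmem
+#   mpirun -np <number of GPUs> ./jacobi_nvshmem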

+ 567 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/jacobi_nvshmem.cu

@@ -0,0 +1,567 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <mpi.h>
+#include <nvshmem.h>
+#include <nvshmemx.h>
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cub/block/block_reduce.cuh>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 1024
+#define BLOCK_DIM_Y 1
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+// convert NVSHMEM_SYMMETRIC_SIZE string to long long unsigned int
+long long unsigned int parse_nvshmem_symmetric_size(char *value) {
+    long long unsigned int units, size;
+
+    assert(value != NULL);
+
+    if (strchr(value, 'G') != NULL) {
+        units=1e9;
+    } else if (strchr(value, 'M') != NULL) {
+        units=1e6;
+    } else if (strchr(value, 'K') != NULL) {
+        units=1e3;
+    } else {
+        units=1;
+    }
+
+    assert(atof(value) >= 0);
+    size = (long long unsigned int) atof(value) * units;
+
+    return size;
+}
+
+constexpr float tol = 1.0e-8;
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                      const float pi, const int offset, const int nx,
+                                      const int my_ny, int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[(iy + 1) * nx + 0] = y0;
+        a[(iy + 1) * nx + (nx - 1)] = y0;
+        a_new[(iy + 1) * nx + 0] = y0;
+        a_new[(iy + 1) * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                              float* __restrict__ const l2_norm, const int iy_start,
+                              const int iy_end, const int nx, const int top_pe, const int top_iy,
+                              const int bottom_pe, const int bottom_iy) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
+    }
+
+    /* starting (x, y) coordinate of the block */
+    int block_iy = iy - threadIdx.y; /* That is, block_iy = blockIdx.y * blockDim.y + iy_start */
+    int block_ix = ix - threadIdx.x; /* That is, block_ix = blockIdx.x * blockDim.x + 1 */
+
+    /* Communicate the boundaries */
+    // TODO: Use block-level NVSHMEM put communication API to transfer the halo pointed to by 
+    // "a_new + iy_start * nx + block_ix" to "a_new + top_iy * nx + block_ix" in the "top_pe"
+    if ((block_iy <= iy_start) && (iy_start < block_iy + blockDim.y)) {
+        nvshmemx_/*Fill me*/(/*Fill me*/, /*Fill me*/,
+                                   min(blockDim.x, nx - 1 - block_ix), /*Fill me*/);
+    }
+    // TODO: Use block-level NVSHMEM put communication API to transfer the halo pointed to by 
+    // "a_new + (iy_end - 1) * nx + block_ix" to "a_new + bottom_iy * nx + block_ix" in the "bottom_pe"
+    if ((block_iy < iy_end) && (iy_end <= block_iy + blockDim.y)) {
+        nvshmemx_/*Fill me*/(/*Fill me*/, /*Fill me*/,
+                                   min(blockDim.x, nx - 1 - block_ix), /*Fill me*/);
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h,
+                  const bool print, int mype);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+struct l2_norm_buf {
+    cudaEvent_t copy_done;
+    float* d;
+    float* h;
+};
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool skip_single_gpu = get_arg(argv, argv + argc, "-skip_single_gpu");
+
+    float* a_new;
+
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    float l2_norms[2];
+
+    int rank = 0, size = 1;
+    MPI_CALL(MPI_Init(&argc, &argv));
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+
+    int num_devices;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+
+    int local_rank = -1, local_size = 1;
+    {
+        MPI_Comm local_comm;
+        MPI_Info info;
+        MPI_CALL(MPI_Info_create(&info));
+        MPI_CALL(
+            MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, info, &local_comm));
+
+        MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+        MPI_CALL(MPI_Comm_size(local_comm, &local_size));
+        if (num_devices < local_size) {
+            fprintf(stderr,
+                    "ERROR: Number of devices is less than the number of PEs on the node!\n");
+            MPI_CALL(MPI_Comm_free(&local_comm));
+            MPI_CALL(MPI_Info_free(&info));
+            MPI_CALL(MPI_Finalize());
+            return -1;
+        }
+
+        MPI_CALL(MPI_Comm_free(&local_comm));
+        MPI_CALL(MPI_Info_free(&info));
+    }
+    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    CUDA_RT_CALL(cudaFree(0));
+
+    MPI_Comm mpi_comm;
+    nvshmemx_init_attr_t attr;
+
+    mpi_comm = MPI_COMM_WORLD;
+    attr.mpi_comm = &mpi_comm;
+    // Set symmetric heap size for nvshmem based on problem size
+    // Its default value in nvshmem is 1 GB which is not sufficient
+    // for large mesh sizes
+    long long unsigned int mesh_size_per_rank = nx * (((ny - 2) + size - 1) / size + 2);
+    long long unsigned int required_symmetric_heap_size =
+        2 * mesh_size_per_rank * sizeof(float) *
+        1.1;  // Factor of 2 because two arrays are allocated - a and a_new;
+              // the 1.1 factor adds ~10% headroom for alignment and other usage
+
+    char * value = getenv("NVSHMEM_SYMMETRIC_SIZE");
+    if (value) { /* env variable is set */
+        long long unsigned int size_env = parse_nvshmem_symmetric_size(value);
+        if (size_env < required_symmetric_heap_size) {
+            fprintf(stderr, "ERROR: Minimum NVSHMEM_SYMMETRIC_SIZE = %lluB, Current NVSHMEM_SYMMETRIC_SIZE = %s\n", required_symmetric_heap_size, value);
+            MPI_CALL(MPI_Finalize());
+            return -1;
+        }
+    } else {
+        char symmetric_heap_size_str[100];
+        sprintf(symmetric_heap_size_str, "%llu", required_symmetric_heap_size);
+        if (!rank)
+            printf("Setting environment variable NVSHMEM_SYMMETRIC_SIZE = %llu\n", required_symmetric_heap_size);
+        setenv("NVSHMEM_SYMMETRIC_SIZE", symmetric_heap_size_str, 1);
+    }
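+    // Bootstrap NVSHMEM on top of the existing MPI communicator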
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+    int npes = nvshmem_n_pes();
+    int mype = nvshmem_my_pe();
+
+    nvshmem_barrier_all();
+
+    bool result_correct = true;
+    float* a;
+
+    cudaStream_t compute_stream;
+    cudaStream_t reset_l2_norm_stream;
+    cudaEvent_t compute_done[2];
+    cudaEvent_t reset_l2_norm_done[2];
+
+    l2_norm_buf l2_norm_bufs[2];
+
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    if (!skip_single_gpu){
+        runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, (0 == mype), mype);
+    }
+    
+    nvshmem_barrier_all();
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / npes;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = npes * chunk_size_low + npes -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (mype < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    a = (float*)nvshmem_malloc(
+        nx * (chunk_size_high + 2) *
+        sizeof(float));  // Use chunk_size_high so the symmetric allocation is the same size on all PEs
+    a_new = (float*)nvshmem_malloc(nx * (chunk_size_high + 2) * sizeof(float));
+
+    cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float));
+    cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (mype < num_ranks_low) {
+        iy_start_global = mype * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (mype - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+    // do not process boundaries
+    iy_end_global = std::min(iy_end_global, ny - 4);
+
+    int iy_start = 1;
+    int iy_end = (iy_end_global - iy_start_global + 1) + iy_start;
+
+    // calculate boundary indices for top and bottom boundaries
+    int top_pe = mype > 0 ? mype - 1 : (npes - 1);
+    int bottom_pe = (mype + 1) % npes;
+
+    int iy_end_top = (top_pe < num_ranks_low) ? chunk_size_low + 1 : chunk_size_high + 1;
+    int iy_start_bottom = 0;
+
+    // Set Dirichlet boundary conditions on the left and right boundaries
+    initialize_boundaries<<<(ny / npes) / 128 + 1, 128>>>(a, a_new, PI, iy_start_global - 1, nx,
+                                                          chunk_size, ny - 2);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    CUDA_RT_CALL(cudaStreamCreateWithFlags(&compute_stream, cudaStreamNonBlocking));
+    CUDA_RT_CALL(cudaStreamCreate(&reset_l2_norm_stream));
+    CUDA_RT_CALL(cudaEventCreate(&compute_done[0]));
+    CUDA_RT_CALL(cudaEventCreate(&compute_done[1]));
+    CUDA_RT_CALL(cudaEventCreate(&reset_l2_norm_done[0]));
+    CUDA_RT_CALL(cudaEventCreate(&reset_l2_norm_done[1]));
+
+    for (int i = 0; i < 2; ++i) {
+        CUDA_RT_CALL(cudaEventCreate(&l2_norm_bufs[i].copy_done));
+        CUDA_RT_CALL(cudaMalloc(&l2_norm_bufs[i].d, sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(l2_norm_bufs[i].d, 0, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(&l2_norm_bufs[i].h, sizeof(float)));
+        *(l2_norm_bufs[i].h) = 1.0;
+    }
+
+    nvshmemx_barrier_all_on_stream(compute_stream);
+    MPI_CALL(MPI_Allreduce(l2_norm_bufs[0].h, &l2_norms[0], 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+    MPI_CALL(MPI_Allreduce(l2_norm_bufs[1].h, &l2_norms[1], 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (!mype) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    dim3 dim_grid((nx + BLOCK_DIM_X-1) / BLOCK_DIM_X, (chunk_size + BLOCK_DIM_Y-1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    if (!mype) {
+        for (int i = 0; i < 2; ++i) {
+            l2_norms[i] = 1.0;
+        }
+    }
+
+    nvshmem_barrier_all();
+
+    double start = MPI_Wtime();
+    bool l2_norm_greater_than_tol = true;
+
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm_greater_than_tol && iter < iter_max) {
+        // on new iteration: old current vars are now previous vars, old
+        // previous vars are no longer needed
+        int prev = iter % 2;
+        int curr = (iter + 1) % 2;
+
+        // TODO: Use "cudaStreamWaitEvent" on "compute_stream" to wait for "reset_l2_norm_done"
+        // event to complete for "curr" iteration
+        CUDA_RT_CALL(cudaStreamWaitEvent(/*Fill me*/, /*Fill me*/, 0));
+        jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream>>>(
+                a_new, a, l2_norm_bufs[curr].d, iy_start, iy_end, nx, top_pe, iy_end_top, bottom_pe,
+                iy_start_bottom);
+
+        // TODO: Put a barrier at the "compute_stream" stream level
+        nvshmemx_/*Fill me*/(/*Fill me*/);
+
+        // perform L2 norm calculation
+        // as soon as computation is complete -> D2H-copy L2 norm
+        CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_bufs[curr].h, l2_norm_bufs[curr].d, sizeof(float),
+                                        cudaMemcpyDeviceToHost, compute_stream));
+        // TODO: Record the event "l2_norm_bufs[curr].copy_done" for "compute_stream"
+        CUDA_RT_CALL(cudaEventRecord(/*Fill me*/, /*Fill me*/));
+
+        // ensure previous D2H-copy is completed before using the data for
+        // calculation
+        CUDA_RT_CALL(cudaEventSynchronize(l2_norm_bufs[prev].copy_done));
+
+        MPI_CALL(MPI_Allreduce(l2_norm_bufs[prev].h, &l2_norms[prev], 1, MPI_FLOAT, MPI_SUM,
+                                MPI_COMM_WORLD));
+
+        l2_norms[prev] = std::sqrt(l2_norms[prev]);
+        l2_norm_greater_than_tol = (l2_norms[prev] > tol);
+
+        iter++;
+        if ((iter % 100) == 0) {
+            if (!mype) printf("%5d, %0.6f\n", iter, l2_norms[prev]);
+        }
+
+        // reset everything for next iteration
+        l2_norms[prev] = 0.0;
+        *(l2_norm_bufs[prev].h) = 0.0;
+        CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_bufs[prev].d, l2_norm_bufs[prev].h, sizeof(float),
+                                        cudaMemcpyHostToDevice, reset_l2_norm_stream));
+        // TODO: Record the L2 norm reset in "reset_l2_norm_done[prev]" for "reset_l2_norm_stream"
+        CUDA_RT_CALL(cudaEventRecord(/*Fill me*/, /*Fill me*/));
+
+        std::swap(a_new, a);
+    }
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvshmem_barrier_all();
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    nvshmem_barrier_all();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min(ny - 2 - iy_start_global, chunk_size) * nx * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    result_correct = true;
+    if (!skip_single_gpu) {
+        for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+            for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+                if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                    fprintf(stderr,
+                            "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                            "(reference)\n",
+                            rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                    result_correct = false;
+                }
+            }
+        }
+        // result_correct is a bool; reduce an int copy of it so the buffer type matches MPI_INT
+        int local_result_correct = result_correct ? 1 : 0;
+        int global_result_correct = 1;
+        MPI_CALL(MPI_Allreduce(&local_result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                                MPI_COMM_WORLD));
+        result_correct = (global_result_correct == 1);
+    }
+
+    if (!mype && result_correct) {
+        printf("Num GPUs: %d.\n", size);
+        if (!skip_single_gpu) {
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                nx, ny, runtime_serial, size, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (size * (stop - start)) * 100);
+        }
+        else {
+            printf("%dx%d: %d GPUs: %8.4f s \n", nx, ny, size, (stop - start)); 
+        }
+    }
+
+    for (int i = 0; i < 2; ++i) {
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_bufs[i].h));
+        CUDA_RT_CALL(cudaFree(l2_norm_bufs[i].d));
+        CUDA_RT_CALL(cudaEventDestroy(l2_norm_bufs[i].copy_done));
+    }
+
+    nvshmem_free(a);
+    nvshmem_free(a_new);
+
+    CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[1]));
+    CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[0]));
+    CUDA_RT_CALL(cudaEventDestroy(compute_done[1]));
+    CUDA_RT_CALL(cudaEventDestroy(compute_done[0]));
+    CUDA_RT_CALL(cudaStreamDestroy(reset_l2_norm_stream));
+    CUDA_RT_CALL(cudaStreamDestroy(compute_stream));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    nvshmem_finalize();
+    MPI_CALL(MPI_Finalize());
+
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h,
+                  const bool print, int mype) {
+    float* a;
+    float* a_new;
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = ny - 3;
+
+    CUDA_RT_CALL(cudaMalloc((void**)&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc((void**)&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny - 2, ny - 2);
+
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print)
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X-1) / BLOCK_DIM_X, ((ny - 2) + BLOCK_DIM_Y-1) / BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(float)));
+
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx, 
+                                                mype, iy_end + 1, mype, (iy_start - 1));
+        
+        iter++;
+        if (print && ((iter % 100) == 0)) {
+            CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+            l2_norm = *l2_norm_h;
+            l2_norm = std::sqrt(l2_norm);
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+    double stop = MPI_Wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 55 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/left_shift.cu

@@ -0,0 +1,55 @@
+#include <stdio.h>
+#include "mpi.h"
+#include "nvshmem.h"
+#include "nvshmemx.h"
+
+#define CUDA_CHECK(stmt)                                  \
+do {                                                      \
+    cudaError_t result = (stmt);                          \
+    if (cudaSuccess != result) {                          \
+        fprintf(stderr, "[%s:%d] CUDA failed with %s \n", \
+         __FILE__, __LINE__, cudaGetErrorString(result)); \
+        exit(-1);                                         \
+    }                                                     \
+} while (0)
+
+__global__ void simple_shift(int *destination) {
+    int mype = nvshmem_my_pe();
+    int npes = nvshmem_n_pes();
+    int peer = (mype + 1) % npes;
+
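+    // Write this PE's id into the symmetric "destination" word on the next PE in the ring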
+    nvshmem_int_p(destination, mype, peer);
+}
+
+int main (int argc, char *argv[]) {
+    int mype_node, msg;
+    cudaStream_t stream;
+    int rank, nranks;
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+    attr.mpi_comm = &mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+    mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
+
+    CUDA_CHECK(cudaSetDevice(mype_node));
+    CUDA_CHECK(cudaStreamCreate(&stream));
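+    // Allocate one int from the symmetric heap; every PE gets an allocation of the same size,
+    // which is what allows other PEs to address it remotely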
+    int *destination = (int *) nvshmem_malloc (sizeof(int));
+
+    simple_shift<<<1, 1, 0, stream>>>(destination);
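+    // The stream-ordered barrier below guarantees that every PE's put has completed before the
+    // device-to-host copy of "destination" is issued on the same stream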
+    nvshmemx_barrier_all_on_stream(stream);
+    CUDA_CHECK(cudaMemcpyAsync(&msg, destination, sizeof(int),
+                cudaMemcpyDeviceToHost, stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+    printf("%d: received message %d\n", nvshmem_my_pe(), msg);
+
+    nvshmem_free(destination);
+    nvshmem_finalize();
+    MPI_Finalize();
+    return 0;
+}

+ 555 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/nvshmem/solution/jacobi_nvshmem.cu

@@ -0,0 +1,555 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <mpi.h>
+#include <nvshmem.h>
+#include <nvshmemx.h>
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <cub/block/block_reduce.cuh>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 1024
+#define BLOCK_DIM_Y 1
+
+#define MPI_CALL(call)                                                                \
+    {                                                                                 \
+        int mpi_status = call;                                                        \
+        if (0 != mpi_status) {                                                        \
+            char mpi_error_string[MPI_MAX_ERROR_STRING];                              \
+            int mpi_error_string_length = 0;                                          \
+            MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \
+            if (NULL != mpi_error_string)                                             \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %s "                                                    \
+                        "(%d).\n",                                                    \
+                        #call, __LINE__, __FILE__, mpi_error_string, mpi_status);     \
+            else                                                                      \
+                fprintf(stderr,                                                       \
+                        "ERROR: MPI call \"%s\" in line %d of file %s failed "        \
+                        "with %d.\n",                                                 \
+                        #call, __LINE__, __FILE__, mpi_status);                       \
+        }                                                                             \
+    }
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+// convert NVSHMEM_SYMMETRIC_SIZE string to long long unsigned int
+long long unsigned int parse_nvshmem_symmetric_size(char *value) {
+    long long unsigned int units, size;
+
+    assert(value != NULL);
+
+    if (strchr(value, 'G') != NULL) {
+        units=1e9;
+    } else if (strchr(value, 'M') != NULL) {
+        units=1e6;
+    } else if (strchr(value, 'K') != NULL) {
+        units=1e3;
+    } else {
+        units=1;
+    }
+
+    assert(atof(value) >= 0);
+    size = (long long unsigned int) atof(value) * units;
+
+    return size;
+}
+
+constexpr float tol = 1.0e-8;
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float* __restrict__ const a_new, float* __restrict__ const a,
+                                      const float pi, const int offset, const int nx,
+                                      const int my_ny, int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[(iy + 1) * nx + 0] = y0;
+        a[(iy + 1) * nx + (nx - 1)] = y0;
+        a_new[(iy + 1) * nx + 0] = y0;
+        a_new[(iy + 1) * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float* __restrict__ const a_new, const float* __restrict__ const a,
+                              float* __restrict__ const l2_norm, const int iy_start,
+                              const int iy_end, const int nx, const int top_pe, const int top_iy,
+                              const int bottom_pe, const int bottom_iy) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                     a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
+    }
+
+    /* starting (x, y) coordinate of the block */
+    int block_iy =
+        iy - threadIdx.y; /* Alternatively, block_iy = blockIdx.y * blockDim.y + iy_start */
+    int block_ix = ix - threadIdx.x; /* Alternatively, block_ix = blockIdx.x * blockDim.x + 1 */
+
+    /* Communicate the boundaries */
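+    /* Blocks that cover the first or last interior row push that row into the neighbor PE's halo
+       row using nvshmemx_float_put_nbi_block, a non-blocking put issued cooperatively by all
+       threads of the block. */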
+    if ((block_iy <= iy_start) && (iy_start < block_iy + blockDim.y)) {
+        nvshmemx_float_put_nbi_block(a_new + top_iy * nx + block_ix, a_new + iy_start * nx + block_ix,
+                                   min(blockDim.x, nx - 1 - block_ix), top_pe);
+    }
+    if ((block_iy < iy_end) && (iy_end <= block_iy + blockDim.y)) {
+        nvshmemx_float_put_nbi_block(a_new + bottom_iy * nx + block_ix,
+                                   a_new + (iy_end - 1) * nx + block_ix,
+                                   min(blockDim.x, nx - 1 - block_ix), bottom_pe);
+    }
+    // Reduce L2 norm for the block in parallel
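+    // (tree reduction in shared memory: the stride doubles each step and thread 0 ends up
+    // holding the block's partial sum)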
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h,
+                  const bool print, int mype);
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+struct l2_norm_buf {
+    cudaEvent_t copy_done;
+    float* d;
+    float* h;
+};
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+
+    float* a_new;
+
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    float l2_norms[2];
+
+    int rank = 0, size = 1;
+    MPI_CALL(MPI_Init(&argc, &argv));
+    MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size));
+
+    int num_devices;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+
+    int local_rank = -1, local_size = 1;
+    {
+        MPI_Comm local_comm;
+        MPI_Info info;
+        MPI_CALL(MPI_Info_create(&info));
+        MPI_CALL(
+            MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, info, &local_comm));
+
+        MPI_CALL(MPI_Comm_rank(local_comm, &local_rank));
+        MPI_CALL(MPI_Comm_size(local_comm, &local_size));
+        if (num_devices < local_size) {
+            fprintf(stderr,
+                    "ERROR: Number of devices is less than the number of PEs on the node!\n");
+            MPI_CALL(MPI_Comm_free(&local_comm));
+            MPI_CALL(MPI_Info_free(&info));
+            MPI_CALL(MPI_Finalize());
+            return -1;
+        }
+
+        MPI_CALL(MPI_Comm_free(&local_comm));
+        MPI_CALL(MPI_Info_free(&info));
+    }
+    CUDA_RT_CALL(cudaSetDevice(local_rank));
+    CUDA_RT_CALL(cudaFree(0));
+
+    MPI_Comm mpi_comm;
+    nvshmemx_init_attr_t attr;
+
+    mpi_comm = MPI_COMM_WORLD;
+    attr.mpi_comm = &mpi_comm;
+    // Set symmetric heap size for nvshmem based on problem size
+    // Its default value in nvshmem is 1 GB which is not sufficient
+    // for large mesh sizes
+    long long unsigned int mesh_size_per_rank = nx * (((ny - 2) + size - 1) / size + 2);
+    long long unsigned int required_symmetric_heap_size =
+        2 * mesh_size_per_rank * sizeof(float) *
+        1.1;  // Factor of 2 because two arrays are allocated - a and a_new;
+              // the 1.1 factor adds ~10% headroom for alignment and other usage
+
+    char * value = getenv("NVSHMEM_SYMMETRIC_SIZE");
+    if (value) { /* env variable is set */
+        long long unsigned int size_env = parse_nvshmem_symmetric_size(value);
+        if (size_env < required_symmetric_heap_size) {
+            fprintf(stderr, "ERROR: Minimum NVSHMEM_SYMMETRIC_SIZE = %lluB, Current NVSHMEM_SYMMETRIC_SIZE = %s\n", required_symmetric_heap_size, value);
+            MPI_CALL(MPI_Finalize());
+            return -1;
+        }
+    } else {
+        char symmetric_heap_size_str[100];
+        sprintf(symmetric_heap_size_str, "%llu", required_symmetric_heap_size);
+        if (!rank)
+            printf("Setting environment variable NVSHMEM_SYMMETRIC_SIZE = %llu\n", required_symmetric_heap_size);
+        setenv("NVSHMEM_SYMMETRIC_SIZE", symmetric_heap_size_str, 1);
+    }
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+    int npes = nvshmem_n_pes();
+    int mype = nvshmem_my_pe();
+
+    nvshmem_barrier_all();
+
+    bool result_correct = true;
+    float* a;
+
+    cudaStream_t compute_stream;
+    cudaStream_t reset_l2_norm_stream;
+    cudaEvent_t compute_done[2];
+    cudaEvent_t reset_l2_norm_done[2];
+
+    l2_norm_buf l2_norm_bufs[2];
+
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+    runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, (0 == mype), mype);
+
+    nvshmem_barrier_all();
+    // ny - 2 rows are distributed amongst `size` ranks in such a way
+    // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows.
+    // This optimizes load balancing when (ny - 2) % size != 0
+    int chunk_size;
+    int chunk_size_low = (ny - 2) / npes;
+    int chunk_size_high = chunk_size_low + 1;
+    // To calculate the number of ranks that need to compute an extra row,
+    // the following formula is derived from this equation:
+    // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
+    int num_ranks_low = npes * chunk_size_low + npes -
+                        (ny - 2);  // Number of ranks with chunk_size = chunk_size_low
+    if (mype < num_ranks_low)
+        chunk_size = chunk_size_low;
+    else
+        chunk_size = chunk_size_high;
+
+    a = (float*)nvshmem_malloc(
+        nx * (chunk_size_high + 2) *
+        sizeof(float));  // Use chunk_size_high so the symmetric allocation is the same size on all PEs
+    a_new = (float*)nvshmem_malloc(nx * (chunk_size_high + 2) * sizeof(float));
+
+    cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(float));
+    cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(float));
+
+    // Calculate local domain boundaries
+    int iy_start_global;  // My start index in the global array
+    if (mype < num_ranks_low) {
+        iy_start_global = mype * chunk_size_low + 1;
+    } else {
+        iy_start_global =
+            num_ranks_low * chunk_size_low + (mype - num_ranks_low) * chunk_size_high + 1;
+    }
+    int iy_end_global = iy_start_global + chunk_size - 1;  // My last index in the global array
+    // do not process boundaries
+    iy_end_global = std::min(iy_end_global, ny - 4);
+
+    int iy_start = 1;
+    int iy_end = (iy_end_global - iy_start_global + 1) + iy_start;
+
+    // calculate boundary indices for top and bottom boundaries
+    int top_pe = mype > 0 ? mype - 1 : (npes - 1);
+    int bottom_pe = (mype + 1) % npes;
+
+    int iy_end_top = (top_pe < num_ranks_low) ? chunk_size_low + 1 : chunk_size_high + 1;
+    int iy_start_bottom = 0;
+
+    // Set Dirichlet boundary conditions on the left and right boundaries
+    initialize_boundaries<<<(ny / npes) / 128 + 1, 128>>>(a, a_new, PI, iy_start_global - 1, nx,
+                                                          chunk_size, ny - 2);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    CUDA_RT_CALL(cudaStreamCreateWithFlags(&compute_stream, cudaStreamNonBlocking));
+    CUDA_RT_CALL(cudaStreamCreate(&reset_l2_norm_stream));
+    CUDA_RT_CALL(cudaEventCreate(&compute_done[0]));
+    CUDA_RT_CALL(cudaEventCreate(&compute_done[1]));
+    CUDA_RT_CALL(cudaEventCreate(&reset_l2_norm_done[0]));
+    CUDA_RT_CALL(cudaEventCreate(&reset_l2_norm_done[1]));
+
+    for (int i = 0; i < 2; ++i) {
+        CUDA_RT_CALL(cudaEventCreate(&l2_norm_bufs[i].copy_done));
+        CUDA_RT_CALL(cudaMalloc(&l2_norm_bufs[i].d, sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(l2_norm_bufs[i].d, 0, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(&l2_norm_bufs[i].h, sizeof(float)));
+        *(l2_norm_bufs[i].h) = 1.0;
+    }
+
+    nvshmemx_barrier_all_on_stream(compute_stream);
+    MPI_CALL(MPI_Allreduce(l2_norm_bufs[0].h, &l2_norms[0], 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+    MPI_CALL(MPI_Allreduce(l2_norm_bufs[1].h, &l2_norms[1], 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (!mype) {
+        printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+    }
+
+    dim3 dim_grid((nx + BLOCK_DIM_X-1) / BLOCK_DIM_X, (chunk_size + BLOCK_DIM_Y-1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    if (!mype) {
+        for (int i = 0; i < 2; ++i) {
+            l2_norms[i] = 1.0;
+        }
+    }
+
+    nvshmem_barrier_all();
+
+    double start = MPI_Wtime();
+    bool l2_norm_greater_than_tol = true;
+
+    nvtxRangePush("Jacobi Solve Multi-GPU");
+    while (l2_norm_greater_than_tol && iter < iter_max) {
+        // on new iteration: old current vars are now previous vars, old
+        // previous vars are no longer needed
+        int prev = iter % 2;
+        int curr = (iter + 1) % 2;
+
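+        // Make compute_stream wait until the asynchronous reset of l2_norm_bufs[curr].d,
+        // recorded on reset_l2_norm_stream in a previous iteration, has completed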
+        CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, reset_l2_norm_done[curr], 0));
+        jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream>>>(
+                a_new, a, l2_norm_bufs[curr].d, iy_start, iy_end, nx, top_pe, iy_end_top, bottom_pe,
+                iy_start_bottom);
+
+        /* Enqueue a barrier on compute_stream so that the one-sided halo puts issued by the
+           Jacobi kernel on every PE have completed and are visible before the next iteration
+           reads the halo rows. */
+        nvshmemx_barrier_all_on_stream(compute_stream);
+
+        // perform L2 norm calculation
+        // as soon as computation is complete -> D2H-copy L2 norm
+        CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_bufs[curr].h, l2_norm_bufs[curr].d, sizeof(float),
+                                        cudaMemcpyDeviceToHost, compute_stream));
+        CUDA_RT_CALL(cudaEventRecord(l2_norm_bufs[curr].copy_done, compute_stream));
+
+        // ensure previous D2H-copy is completed before using the data for
+        // calculation
+        CUDA_RT_CALL(cudaEventSynchronize(l2_norm_bufs[prev].copy_done));
+
+        MPI_CALL(MPI_Allreduce(l2_norm_bufs[prev].h, &l2_norms[prev], 1, MPI_FLOAT, MPI_SUM,
+                                MPI_COMM_WORLD));
+
+        l2_norms[prev] = std::sqrt(l2_norms[prev]);
+        l2_norm_greater_than_tol = (l2_norms[prev] > tol);
+
+        iter++;
+        if ((iter % 100) == 0) {
+            if (!mype) printf("%5d, %0.6f\n", iter, l2_norms[prev]);
+        }
+
+        // reset everything for next iteration
+        l2_norms[prev] = 0.0;
+        *(l2_norm_bufs[prev].h) = 0.0;
+        CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_bufs[prev].d, l2_norm_bufs[prev].h, sizeof(float),
+                                        cudaMemcpyHostToDevice, reset_l2_norm_stream));
+        CUDA_RT_CALL(cudaEventRecord(reset_l2_norm_done[prev], reset_l2_norm_stream));
+
+        std::swap(a_new, a);
+    }
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvshmem_barrier_all();
+    double stop = MPI_Wtime();
+    nvtxRangePop();
+
+    nvshmem_barrier_all();
+
+    CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx,
+                            std::min(ny - 2 - iy_start_global, chunk_size) * nx * sizeof(float),
+                            cudaMemcpyDeviceToHost));
+
+    result_correct = true;
+    for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    // result_correct is a bool; reduce an int copy of it so the buffer type matches MPI_INT
+    int local_result_correct = result_correct ? 1 : 0;
+    int global_result_correct = 1;
+    MPI_CALL(MPI_Allreduce(&local_result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN,
+                           MPI_COMM_WORLD));
+    result_correct = (global_result_correct == 1);
+
+    if (!mype && result_correct) {
+            printf("Num GPUs: %d.\n", npes);
+            printf(
+                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+                "efficiency: %8.2f \n",
+                ny, nx, runtime_serial, npes, (stop - start), runtime_serial / (stop - start),
+                runtime_serial / (npes * (stop - start)) * 100);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_bufs[i].h));
+        CUDA_RT_CALL(cudaFree(l2_norm_bufs[i].d));
+        CUDA_RT_CALL(cudaEventDestroy(l2_norm_bufs[i].copy_done));
+    }
+
+    nvshmem_free(a);
+    nvshmem_free(a_new);
+
+    CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[1]));
+    CUDA_RT_CALL(cudaEventDestroy(reset_l2_norm_done[0]));
+    CUDA_RT_CALL(cudaEventDestroy(compute_done[1]));
+    CUDA_RT_CALL(cudaEventDestroy(compute_done[0]));
+    CUDA_RT_CALL(cudaStreamDestroy(reset_l2_norm_stream));
+    CUDA_RT_CALL(cudaStreamDestroy(compute_stream));
+
+    CUDA_RT_CALL(cudaFreeHost(a_h));
+    CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+
+    nvshmem_finalize();
+    MPI_CALL(MPI_Finalize());
+
+    return (result_correct == 1) ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h,
+                  const bool print, int mype) {
+    float* a;
+    float* a_new;
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = ny - 3;
+
+    CUDA_RT_CALL(cudaMalloc((void**)&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc((void**)&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on the left and right borders
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny - 2, ny - 2);
+
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    if (print)
+        printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X-1) / BLOCK_DIM_X, ((ny - 2) + BLOCK_DIM_Y-1) / BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    double start = MPI_Wtime();
+    nvtxRangePush("Jacobi Solve Single GPU");
+
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(float)));
+
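+        // Single-GPU case: top_pe == bottom_pe == mype, so the kernel's halo puts copy this
+        // rank's first/last interior rows into its own halo rows (rows ny - 2 and 0)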
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx, 
+                                                mype, iy_end + 1, mype, (iy_start - 1));
+        
+        iter++;
+        if (print && ((iter % 100) == 0)) {
+            CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+            l2_norm = *l2_norm_h;
+            l2_norm = std::sqrt(l2_norm);
+            printf("%5d, %0.6f\n", iter, l2_norm);
+        }
+
+        std::swap(a_new, a);
+    }
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+    double stop = MPI_Wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}

+ 22 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut.h

@@ -0,0 +1,22 @@
+#ifndef  __FREEGLUT_H__
+#define  __FREEGLUT_H__
+
+/*
+ * freeglut.h
+ *
+ * The freeglut library include file
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "freeglut_std.h"
+#include "freeglut_ext.h"
+
+/*** END OF FILE ***/
+
+#endif /* __FREEGLUT_H__ */

+ 115 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_ext.h

@@ -0,0 +1,115 @@
+#ifndef  __FREEGLUT_EXT_H__
+#define  __FREEGLUT_EXT_H__
+
+/*
+ * freeglut_ext.h
+ *
+ * The non-GLUT-compatible extensions to the freeglut library include file
+ *
+ * Copyright (c) 1999-2000 Pawel W. Olszta. All Rights Reserved.
+ * Written by Pawel W. Olszta, <olszta@sourceforge.net>
+ * Creation date: Thu Dec 2 1999
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * GLUT API Extension macro definitions -- behaviour when the user clicks on an "x" to close a window
+ */
+#define GLUT_ACTION_EXIT                         0
+#define GLUT_ACTION_GLUTMAINLOOP_RETURNS         1
+#define GLUT_ACTION_CONTINUE_EXECUTION           2
+
+/*
+ * Create a new rendering context when the user opens a new window?
+ */
+#define GLUT_CREATE_NEW_CONTEXT                  0
+#define GLUT_USE_CURRENT_CONTEXT                 1
+
+/*
+ * GLUT API Extension macro definitions -- the glutGet parameters
+ */
+#define  GLUT_ACTION_ON_WINDOW_CLOSE        0x01F9
+
+#define  GLUT_WINDOW_BORDER_WIDTH           0x01FA
+#define  GLUT_WINDOW_HEADER_HEIGHT          0x01FB
+
+#define  GLUT_VERSION                       0x01FC
+
+#define  GLUT_RENDERING_CONTEXT             0x01FD
+
+/*
+ * Process loop function, see freeglut_main.c
+ */
+FGAPI void    FGAPIENTRY glutMainLoopEvent(void);
+FGAPI void    FGAPIENTRY glutLeaveMainLoop(void);
+
+/*
+ * Window-specific callback functions, see freeglut_callbacks.c
+ */
+FGAPI void    FGAPIENTRY glutMouseWheelFunc(void (* callback)(int, int, int, int));
+FGAPI void    FGAPIENTRY glutCloseFunc(void (* callback)(void));
+FGAPI void    FGAPIENTRY glutWMCloseFunc(void (* callback)(void));
+/* A. Donev: Also a destruction callback for menus */
+FGAPI void    FGAPIENTRY glutMenuDestroyFunc(void (* callback)(void));
+
+/*
+ * State setting and retrieval functions, see freeglut_state.c
+ */
+FGAPI void    FGAPIENTRY glutSetOption(GLenum option_flag, int value) ;
+/* A.Donev: User-data manipulation */
+FGAPI void   *FGAPIENTRY glutGetWindowData(void);
+FGAPI void    FGAPIENTRY glutSetWindowData(void *data);
+FGAPI void   *FGAPIENTRY glutGetMenuData(void);
+FGAPI void    FGAPIENTRY glutSetMenuData(void *data);
+
+/*
+ * Font stuff, see freeglut_font.c
+ */
+FGAPI int     FGAPIENTRY glutBitmapHeight(void *font);
+FGAPI GLfloat FGAPIENTRY glutStrokeHeight(void *font);
+FGAPI void    FGAPIENTRY glutBitmapString(void *font, const unsigned char *string);
+FGAPI void    FGAPIENTRY glutStrokeString(void *font, const unsigned char *string);
+
+/*
+ * Geometry functions, see freeglut_geometry.c
+ */
+FGAPI void    FGAPIENTRY glutWireRhombicDodecahedron(void);
+FGAPI void    FGAPIENTRY glutSolidRhombicDodecahedron(void);
+FGAPI void    FGAPIENTRY glutWireSierpinskiSponge(int num_levels, GLdouble offset[3], GLdouble scale) ;
+FGAPI void    FGAPIENTRY glutSolidSierpinskiSponge(int num_levels, GLdouble offset[3], GLdouble scale) ;
+FGAPI void    FGAPIENTRY glutWireCylinder(GLdouble radius, GLdouble height, GLint slices, GLint stacks);
+FGAPI void    FGAPIENTRY glutSolidCylinder(GLdouble radius, GLdouble height, GLint slices, GLint stacks);
+
+/*
+ * Extension functions, see freeglut_ext.c
+ */
+FGAPI void *FGAPIENTRY glutGetProcAddress(const char *procName);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+/*** END OF FILE ***/
+
+#endif /* __FREEGLUT_EXT_H__ */

+ 547 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_std.h

@@ -0,0 +1,547 @@
+#ifndef  __FREEGLUT_STD_H__
+#define  __FREEGLUT_STD_H__
+
+/*
+ * freeglut_std.h
+ *
+ * The GLUT-compatible part of the freeglut library include file
+ *
+ * Copyright (c) 1999-2000 Pawel W. Olszta. All Rights Reserved.
+ * Written by Pawel W. Olszta, <olszta@sourceforge.net>
+ * Creation date: Thu Dec 2 1999
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Under windows, we have to differentiate between static and dynamic libraries
+ */
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#    include <windows.h>
+#    include <windowsx.h>
+#    include <mmsystem.h>
+#    define WINDOWS
+#ifdef FREEGLUT_STATIC
+#    define FGAPI
+#    define FGAPIENTRY
+
+#    pragma comment (lib, "freeglut_static.lib")    /* link with Win32 static freeglut lib */
+
+#else
+
+#        if defined(FREEGLUT_EXPORTS)
+#                define FGAPI __declspec(dllexport)
+/* #                define FGAPI */
+#        else
+#                define FGAPI __declspec(dllimport)
+#   pragma comment (lib, "freeglut.lib")    /* link with Win32 freeglut lib */
+#        endif
+#        define FGAPIENTRY __stdcall
+
+#endif
+
+#pragma comment (lib, "winmm.lib")       /* link with Windows MultiMedia lib */
+#pragma comment (lib, "user32.lib") /* link with Windows user lib */
+#pragma comment (lib, "gdi32.lib") /* link with Windows GDI lib */
+#pragma comment (lib, "opengl32.lib")    /* link with Microsoft OpenGL lib */
+#pragma comment (lib, "glu32.lib")       /* link with OpenGL Utility lib */
+
+
+#else
+#        define FGAPI
+#        define FGAPIENTRY
+#endif
+
+/*
+ * The freeglut and GLUT API versions
+ */
+#define  FREEGLUT             1
+#define  GLUT_API_VERSION     4
+#define  FREEGLUT_VERSION_2_0 1
+
+/*
+ * Always include OpenGL and GLU headers
+ */
+#include <GL/gl.h>
+#include <GL/glu.h>
+
+/*
+ * GLUT API macro definitions -- the special key codes:
+ */
+#define  GLUT_KEY_F1                        0x0001
+#define  GLUT_KEY_F2                        0x0002
+#define  GLUT_KEY_F3                        0x0003
+#define  GLUT_KEY_F4                        0x0004
+#define  GLUT_KEY_F5                        0x0005
+#define  GLUT_KEY_F6                        0x0006
+#define  GLUT_KEY_F7                        0x0007
+#define  GLUT_KEY_F8                        0x0008
+#define  GLUT_KEY_F9                        0x0009
+#define  GLUT_KEY_F10                       0x000A
+#define  GLUT_KEY_F11                       0x000B
+#define  GLUT_KEY_F12                       0x000C
+#define  GLUT_KEY_LEFT                      0x0064
+#define  GLUT_KEY_UP                        0x0065
+#define  GLUT_KEY_RIGHT                     0x0066
+#define  GLUT_KEY_DOWN                      0x0067
+#define  GLUT_KEY_PAGE_UP                   0x0068
+#define  GLUT_KEY_PAGE_DOWN                 0x0069
+#define  GLUT_KEY_HOME                      0x006A
+#define  GLUT_KEY_END                       0x006B
+#define  GLUT_KEY_INSERT                    0x006C
+
+/*
+ * GLUT API macro definitions -- mouse state definitions
+ */
+#define  GLUT_LEFT_BUTTON                   0x0000
+#define  GLUT_MIDDLE_BUTTON                 0x0001
+#define  GLUT_RIGHT_BUTTON                  0x0002
+#define  GLUT_DOWN                          0x0000
+#define  GLUT_UP                            0x0001
+#define  GLUT_LEFT                          0x0000
+#define  GLUT_ENTERED                       0x0001
+
+/*
+ * GLUT API macro definitions -- the display mode definitions
+ */
+#define  GLUT_RGB                           0x0000
+#define  GLUT_RGBA                          0x0000
+#define  GLUT_INDEX                         0x0001
+#define  GLUT_SINGLE                        0x0000
+#define  GLUT_DOUBLE                        0x0002
+#define  GLUT_ACCUM                         0x0004
+#define  GLUT_ALPHA                         0x0008
+#define  GLUT_DEPTH                         0x0010
+#define  GLUT_STENCIL                       0x0020
+#define  GLUT_MULTISAMPLE                   0x0080
+#define  GLUT_STEREO                        0x0100
+#define  GLUT_LUMINANCE                     0x0200
+
+/*
+ * GLUT API macro definitions -- windows and menu related definitions
+ */
+#define  GLUT_MENU_NOT_IN_USE               0x0000
+#define  GLUT_MENU_IN_USE                   0x0001
+#define  GLUT_NOT_VISIBLE                   0x0000
+#define  GLUT_VISIBLE                       0x0001
+#define  GLUT_HIDDEN                        0x0000
+#define  GLUT_FULLY_RETAINED                0x0001
+#define  GLUT_PARTIALLY_RETAINED            0x0002
+#define  GLUT_FULLY_COVERED                 0x0003
+
+/*
+ * GLUT API macro definitions -- fonts definitions
+ *
+ * Steve Baker suggested to make it binary compatible with GLUT:
+ */
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#   define  GLUT_STROKE_ROMAN               ((void *)0x0000)
+#   define  GLUT_STROKE_MONO_ROMAN          ((void *)0x0001)
+#   define  GLUT_BITMAP_9_BY_15             ((void *)0x0002)
+#   define  GLUT_BITMAP_8_BY_13             ((void *)0x0003)
+#   define  GLUT_BITMAP_TIMES_ROMAN_10      ((void *)0x0004)
+#   define  GLUT_BITMAP_TIMES_ROMAN_24      ((void *)0x0005)
+#   define  GLUT_BITMAP_HELVETICA_10        ((void *)0x0006)
+#   define  GLUT_BITMAP_HELVETICA_12        ((void *)0x0007)
+#   define  GLUT_BITMAP_HELVETICA_18        ((void *)0x0008)
+#else
+/*
+ * I don't really know if it's a good idea... But here it goes:
+ */
+extern void *glutStrokeRoman;
+extern void *glutStrokeMonoRoman;
+extern void *glutBitmap9By15;
+extern void *glutBitmap8By13;
+extern void *glutBitmapTimesRoman10;
+extern void *glutBitmapTimesRoman24;
+extern void *glutBitmapHelvetica10;
+extern void *glutBitmapHelvetica12;
+extern void *glutBitmapHelvetica18;
+
+/*
+ * Those pointers will be used by following definitions:
+ */
+#   define  GLUT_STROKE_ROMAN               ((void *) &glutStrokeRoman)
+#   define  GLUT_STROKE_MONO_ROMAN          ((void *) &glutStrokeMonoRoman)
+#   define  GLUT_BITMAP_9_BY_15             ((void *) &glutBitmap9By15)
+#   define  GLUT_BITMAP_8_BY_13             ((void *) &glutBitmap8By13)
+#   define  GLUT_BITMAP_TIMES_ROMAN_10      ((void *) &glutBitmapTimesRoman10)
+#   define  GLUT_BITMAP_TIMES_ROMAN_24      ((void *) &glutBitmapTimesRoman24)
+#   define  GLUT_BITMAP_HELVETICA_10        ((void *) &glutBitmapHelvetica10)
+#   define  GLUT_BITMAP_HELVETICA_12        ((void *) &glutBitmapHelvetica12)
+#   define  GLUT_BITMAP_HELVETICA_18        ((void *) &glutBitmapHelvetica18)
+#endif
+
+/*
+ * GLUT API macro definitions -- the glutGet parameters
+ */
+#define  GLUT_WINDOW_X                      0x0064
+#define  GLUT_WINDOW_Y                      0x0065
+#define  GLUT_WINDOW_WIDTH                  0x0066
+#define  GLUT_WINDOW_HEIGHT                 0x0067
+#define  GLUT_WINDOW_BUFFER_SIZE            0x0068
+#define  GLUT_WINDOW_STENCIL_SIZE           0x0069
+#define  GLUT_WINDOW_DEPTH_SIZE             0x006A
+#define  GLUT_WINDOW_RED_SIZE               0x006B
+#define  GLUT_WINDOW_GREEN_SIZE             0x006C
+#define  GLUT_WINDOW_BLUE_SIZE              0x006D
+#define  GLUT_WINDOW_ALPHA_SIZE             0x006E
+#define  GLUT_WINDOW_ACCUM_RED_SIZE         0x006F
+#define  GLUT_WINDOW_ACCUM_GREEN_SIZE       0x0070
+#define  GLUT_WINDOW_ACCUM_BLUE_SIZE        0x0071
+#define  GLUT_WINDOW_ACCUM_ALPHA_SIZE       0x0072
+#define  GLUT_WINDOW_DOUBLEBUFFER           0x0073
+#define  GLUT_WINDOW_RGBA                   0x0074
+#define  GLUT_WINDOW_PARENT                 0x0075
+#define  GLUT_WINDOW_NUM_CHILDREN           0x0076
+#define  GLUT_WINDOW_COLORMAP_SIZE          0x0077
+#define  GLUT_WINDOW_NUM_SAMPLES            0x0078
+#define  GLUT_WINDOW_STEREO                 0x0079
+#define  GLUT_WINDOW_CURSOR                 0x007A
+
+#define  GLUT_SCREEN_WIDTH                  0x00C8
+#define  GLUT_SCREEN_HEIGHT                 0x00C9
+#define  GLUT_SCREEN_WIDTH_MM               0x00CA
+#define  GLUT_SCREEN_HEIGHT_MM              0x00CB
+#define  GLUT_MENU_NUM_ITEMS                0x012C
+#define  GLUT_DISPLAY_MODE_POSSIBLE         0x0190
+#define  GLUT_INIT_WINDOW_X                 0x01F4
+#define  GLUT_INIT_WINDOW_Y                 0x01F5
+#define  GLUT_INIT_WINDOW_WIDTH             0x01F6
+#define  GLUT_INIT_WINDOW_HEIGHT            0x01F7
+#define  GLUT_INIT_DISPLAY_MODE             0x01F8
+#define  GLUT_ELAPSED_TIME                  0x02BC
+#define  GLUT_WINDOW_FORMAT_ID              0x007B
+#define  GLUT_INIT_STATE                    0x007C
+
+/*
+ * GLUT API macro definitions -- the glutDeviceGet parameters
+ */
+#define  GLUT_HAS_KEYBOARD                  0x0258
+#define  GLUT_HAS_MOUSE                     0x0259
+#define  GLUT_HAS_SPACEBALL                 0x025A
+#define  GLUT_HAS_DIAL_AND_BUTTON_BOX       0x025B
+#define  GLUT_HAS_TABLET                    0x025C
+#define  GLUT_NUM_MOUSE_BUTTONS             0x025D
+#define  GLUT_NUM_SPACEBALL_BUTTONS         0x025E
+#define  GLUT_NUM_BUTTON_BOX_BUTTONS        0x025F
+#define  GLUT_NUM_DIALS                     0x0260
+#define  GLUT_NUM_TABLET_BUTTONS            0x0261
+#define  GLUT_DEVICE_IGNORE_KEY_REPEAT      0x0262
+#define  GLUT_DEVICE_KEY_REPEAT             0x0263
+#define  GLUT_HAS_JOYSTICK                  0x0264
+#define  GLUT_OWNS_JOYSTICK                 0x0265
+#define  GLUT_JOYSTICK_BUTTONS              0x0266
+#define  GLUT_JOYSTICK_AXES                 0x0267
+#define  GLUT_JOYSTICK_POLL_RATE            0x0268
+
+/*
+ * GLUT API macro definitions -- the glutLayerGet parameters
+ */
+#define  GLUT_OVERLAY_POSSIBLE              0x0320
+#define  GLUT_LAYER_IN_USE                  0x0321
+#define  GLUT_HAS_OVERLAY                   0x0322
+#define  GLUT_TRANSPARENT_INDEX             0x0323
+#define  GLUT_NORMAL_DAMAGED                0x0324
+#define  GLUT_OVERLAY_DAMAGED               0x0325
+
+/*
+ * GLUT API macro definitions -- the glutVideoResizeGet parameters
+ */
+#define  GLUT_VIDEO_RESIZE_POSSIBLE         0x0384
+#define  GLUT_VIDEO_RESIZE_IN_USE           0x0385
+#define  GLUT_VIDEO_RESIZE_X_DELTA          0x0386
+#define  GLUT_VIDEO_RESIZE_Y_DELTA          0x0387
+#define  GLUT_VIDEO_RESIZE_WIDTH_DELTA      0x0388
+#define  GLUT_VIDEO_RESIZE_HEIGHT_DELTA     0x0389
+#define  GLUT_VIDEO_RESIZE_X                0x038A
+#define  GLUT_VIDEO_RESIZE_Y                0x038B
+#define  GLUT_VIDEO_RESIZE_WIDTH            0x038C
+#define  GLUT_VIDEO_RESIZE_HEIGHT           0x038D
+
+/*
+ * GLUT API macro definitions -- the glutUseLayer parameters
+ */
+#define  GLUT_NORMAL                        0x0000
+#define  GLUT_OVERLAY                       0x0001
+
+/*
+ * GLUT API macro definitions -- the glutGetModifiers parameters
+ */
+#define  GLUT_ACTIVE_SHIFT                  0x0001
+#define  GLUT_ACTIVE_CTRL                   0x0002
+#define  GLUT_ACTIVE_ALT                    0x0004
+
+/*
+ * GLUT API macro definitions -- the glutSetCursor parameters
+ */
+#define  GLUT_CURSOR_RIGHT_ARROW            0x0000
+#define  GLUT_CURSOR_LEFT_ARROW             0x0001
+#define  GLUT_CURSOR_INFO                   0x0002
+#define  GLUT_CURSOR_DESTROY                0x0003
+#define  GLUT_CURSOR_HELP                   0x0004
+#define  GLUT_CURSOR_CYCLE                  0x0005
+#define  GLUT_CURSOR_SPRAY                  0x0006
+#define  GLUT_CURSOR_WAIT                   0x0007
+#define  GLUT_CURSOR_TEXT                   0x0008
+#define  GLUT_CURSOR_CROSSHAIR              0x0009
+#define  GLUT_CURSOR_UP_DOWN                0x000A
+#define  GLUT_CURSOR_LEFT_RIGHT             0x000B
+#define  GLUT_CURSOR_TOP_SIDE               0x000C
+#define  GLUT_CURSOR_BOTTOM_SIDE            0x000D
+#define  GLUT_CURSOR_LEFT_SIDE              0x000E
+#define  GLUT_CURSOR_RIGHT_SIDE             0x000F
+#define  GLUT_CURSOR_TOP_LEFT_CORNER        0x0010
+#define  GLUT_CURSOR_TOP_RIGHT_CORNER       0x0011
+#define  GLUT_CURSOR_BOTTOM_RIGHT_CORNER    0x0012
+#define  GLUT_CURSOR_BOTTOM_LEFT_CORNER     0x0013
+#define  GLUT_CURSOR_INHERIT                0x0064
+#define  GLUT_CURSOR_NONE                   0x0065
+#define  GLUT_CURSOR_FULL_CROSSHAIR         0x0066
+
+/*
+ * GLUT API macro definitions -- RGB color component specification definitions
+ */
+#define  GLUT_RED                           0x0000
+#define  GLUT_GREEN                         0x0001
+#define  GLUT_BLUE                          0x0002
+
+/*
+ * GLUT API macro definitions -- additional keyboard and joystick definitions
+ */
+#define  GLUT_KEY_REPEAT_OFF                0x0000
+#define  GLUT_KEY_REPEAT_ON                 0x0001
+#define  GLUT_KEY_REPEAT_DEFAULT            0x0002
+
+#define  GLUT_JOYSTICK_BUTTON_A             0x0001
+#define  GLUT_JOYSTICK_BUTTON_B             0x0002
+#define  GLUT_JOYSTICK_BUTTON_C             0x0004
+#define  GLUT_JOYSTICK_BUTTON_D             0x0008
+
+/*
+ * GLUT API macro definitions -- game mode definitions
+ */
+#define  GLUT_GAME_MODE_ACTIVE              0x0000
+#define  GLUT_GAME_MODE_POSSIBLE            0x0001
+#define  GLUT_GAME_MODE_WIDTH               0x0002
+#define  GLUT_GAME_MODE_HEIGHT              0x0003
+#define  GLUT_GAME_MODE_PIXEL_DEPTH         0x0004
+#define  GLUT_GAME_MODE_REFRESH_RATE        0x0005
+#define  GLUT_GAME_MODE_DISPLAY_CHANGED     0x0006
+
+/*
+ * Initialization functions, see fglut_init.c
+ */
+FGAPI void    FGAPIENTRY glutInit(int *pargc, char **argv);
+FGAPI void    FGAPIENTRY glutInitWindowPosition(int x, int y);
+FGAPI void    FGAPIENTRY glutInitWindowSize(int width, int height);
+FGAPI void    FGAPIENTRY glutInitDisplayMode(unsigned int displayMode);
+FGAPI void    FGAPIENTRY glutInitDisplayString(const char *displayMode);
+
+/*
+ * Process loop function, see freeglut_main.c
+ */
+FGAPI void    FGAPIENTRY glutMainLoop(void);
+
+/*
+ * Window management functions, see freeglut_window.c
+ */
+FGAPI int     FGAPIENTRY glutCreateWindow(const char *title);
+FGAPI int     FGAPIENTRY glutCreateSubWindow(int window, int x, int y, int width, int height);
+FGAPI void    FGAPIENTRY glutDestroyWindow(int window);
+FGAPI void    FGAPIENTRY glutSetWindow(int window);
+FGAPI int     FGAPIENTRY glutGetWindow(void);
+FGAPI void    FGAPIENTRY glutSetWindowTitle(const char *title);
+FGAPI void    FGAPIENTRY glutSetIconTitle(const char *title);
+FGAPI void    FGAPIENTRY glutReshapeWindow(int width, int height);
+FGAPI void    FGAPIENTRY glutPositionWindow(int x, int y);
+FGAPI void    FGAPIENTRY glutShowWindow(void);
+FGAPI void    FGAPIENTRY glutHideWindow(void);
+FGAPI void    FGAPIENTRY glutIconifyWindow(void);
+FGAPI void    FGAPIENTRY glutPushWindow(void);
+FGAPI void    FGAPIENTRY glutPopWindow(void);
+FGAPI void    FGAPIENTRY glutFullScreen(void);
+
+/*
+ * Display-connected functions, see freeglut_display.c
+ */
+FGAPI void    FGAPIENTRY glutPostWindowRedisplay(int window);
+FGAPI void    FGAPIENTRY glutPostRedisplay(void);
+FGAPI void    FGAPIENTRY glutSwapBuffers(void);
+
+/*
+ * Mouse cursor functions, see freeglut_cursor.c
+ */
+FGAPI void    FGAPIENTRY glutWarpPointer(int x, int y);
+FGAPI void    FGAPIENTRY glutSetCursor(int cursor);
+
+/*
+ * Overlay stuff, see freeglut_overlay.c
+ */
+FGAPI void    FGAPIENTRY glutEstablishOverlay(void);
+FGAPI void    FGAPIENTRY glutRemoveOverlay(void);
+FGAPI void    FGAPIENTRY glutUseLayer(GLenum layer);
+FGAPI void    FGAPIENTRY glutPostOverlayRedisplay(void);
+FGAPI void    FGAPIENTRY glutPostWindowOverlayRedisplay(int window);
+FGAPI void    FGAPIENTRY glutShowOverlay(void);
+FGAPI void    FGAPIENTRY glutHideOverlay(void);
+
+/*
+ * Menu stuff, see freeglut_menu.c
+ */
+FGAPI int     FGAPIENTRY glutCreateMenu(void (* callback)(int menu));
+FGAPI void    FGAPIENTRY glutDestroyMenu(int menu);
+FGAPI int     FGAPIENTRY glutGetMenu(void);
+FGAPI void    FGAPIENTRY glutSetMenu(int menu);
+FGAPI void    FGAPIENTRY glutAddMenuEntry(const char *label, int value);
+FGAPI void    FGAPIENTRY glutAddSubMenu(const char *label, int subMenu);
+FGAPI void    FGAPIENTRY glutChangeToMenuEntry(int item, const char *label, int value);
+FGAPI void    FGAPIENTRY glutChangeToSubMenu(int item, const char *label, int value);
+FGAPI void    FGAPIENTRY glutRemoveMenuItem(int item);
+FGAPI void    FGAPIENTRY glutAttachMenu(int button);
+FGAPI void    FGAPIENTRY glutDetachMenu(int button);
+
+/*
+ * Global callback functions, see freeglut_callbacks.c
+ */
+FGAPI void    FGAPIENTRY glutTimerFunc(unsigned int time, void (* callback)(int), int value);
+FGAPI void    FGAPIENTRY glutIdleFunc(void (* callback)(void));
+
+/*
+ * Window-specific callback functions, see freeglut_callbacks.c
+ */
+FGAPI void    FGAPIENTRY glutKeyboardFunc(void (* callback)(unsigned char, int, int));
+FGAPI void    FGAPIENTRY glutSpecialFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutReshapeFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutVisibilityFunc(void (* callback)(int));
+FGAPI void    FGAPIENTRY glutDisplayFunc(void (* callback)(void));
+FGAPI void    FGAPIENTRY glutMouseFunc(void (* callback)(int, int, int, int));
+FGAPI void    FGAPIENTRY glutMotionFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutPassiveMotionFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutEntryFunc(void (* callback)(int));
+
+FGAPI void    FGAPIENTRY glutKeyboardUpFunc(void (* callback)(unsigned char, int, int));
+FGAPI void    FGAPIENTRY glutSpecialUpFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutJoystickFunc(void (* callback)(unsigned int, int, int, int), int pollInterval);
+FGAPI void    FGAPIENTRY glutMenuStateFunc(void (* callback)(int));
+FGAPI void    FGAPIENTRY glutMenuStatusFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutOverlayDisplayFunc(void (* callback)(void));
+FGAPI void    FGAPIENTRY glutWindowStatusFunc(void (* callback)(int));
+
+FGAPI void    FGAPIENTRY glutSpaceballMotionFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutSpaceballRotateFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutSpaceballButtonFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutButtonBoxFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutDialsFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutTabletMotionFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutTabletButtonFunc(void (* callback)(int, int, int, int));
+
+/*
+ * State setting and retrieval functions, see freeglut_state.c
+ */
+FGAPI int     FGAPIENTRY glutGet(GLenum query);
+FGAPI int     FGAPIENTRY glutDeviceGet(GLenum query);
+FGAPI int     FGAPIENTRY glutGetModifiers(void);
+FGAPI int     FGAPIENTRY glutLayerGet(GLenum query);
+
+/*
+ * Font stuff, see freeglut_font.c
+ */
+FGAPI void    FGAPIENTRY glutBitmapCharacter(void *font, int character);
+FGAPI int     FGAPIENTRY glutBitmapWidth(void *font, int character);
+FGAPI void    FGAPIENTRY glutStrokeCharacter(void *font, int character);
+FGAPI int     FGAPIENTRY glutStrokeWidth(void *font, int character);
+FGAPI int     FGAPIENTRY glutBitmapLength(void *font, const unsigned char *string);
+FGAPI int     FGAPIENTRY glutStrokeLength(void *font, const unsigned char *string);
+
+/*
+ * Geometry functions, see freeglut_geometry.c
+ */
+FGAPI void    FGAPIENTRY glutWireCube(GLdouble size);
+FGAPI void    FGAPIENTRY glutSolidCube(GLdouble size);
+FGAPI void    FGAPIENTRY glutWireSphere(GLdouble radius, GLint slices, GLint stacks);
+FGAPI void    FGAPIENTRY glutSolidSphere(GLdouble radius, GLint slices, GLint stacks);
+FGAPI void    FGAPIENTRY glutWireCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
+FGAPI void    FGAPIENTRY glutSolidCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
+
+FGAPI void    FGAPIENTRY glutWireTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
+FGAPI void    FGAPIENTRY glutSolidTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
+FGAPI void    FGAPIENTRY glutWireDodecahedron(void);
+FGAPI void    FGAPIENTRY glutSolidDodecahedron(void);
+FGAPI void    FGAPIENTRY glutWireOctahedron(void);
+FGAPI void    FGAPIENTRY glutSolidOctahedron(void);
+FGAPI void    FGAPIENTRY glutWireTetrahedron(void);
+FGAPI void    FGAPIENTRY glutSolidTetrahedron(void);
+FGAPI void    FGAPIENTRY glutWireIcosahedron(void);
+FGAPI void    FGAPIENTRY glutSolidIcosahedron(void);
+
+/*
+ * Teapot rendering functions, found in freeglut_teapot.c
+ */
+FGAPI void    FGAPIENTRY glutWireTeapot(GLdouble size);
+FGAPI void    FGAPIENTRY glutSolidTeapot(GLdouble size);
+
+/*
+ * Game mode functions, see freeglut_gamemode.c
+ */
+FGAPI void    FGAPIENTRY glutGameModeString(const char *string);
+FGAPI int     FGAPIENTRY glutEnterGameMode(void);
+FGAPI void    FGAPIENTRY glutLeaveGameMode(void);
+FGAPI int     FGAPIENTRY glutGameModeGet(GLenum query);
+
+/*
+ * Video resize functions, see freeglut_videoresize.c
+ */
+FGAPI int     FGAPIENTRY glutVideoResizeGet(GLenum query);
+FGAPI void    FGAPIENTRY glutSetupVideoResizing(void);
+FGAPI void    FGAPIENTRY glutStopVideoResizing(void);
+FGAPI void    FGAPIENTRY glutVideoResize(int x, int y, int width, int height);
+FGAPI void    FGAPIENTRY glutVideoPan(int x, int y, int width, int height);
+
+/*
+ * Colormap functions, see freeglut_misc.c
+ */
+FGAPI void    FGAPIENTRY glutSetColor(int color, GLfloat red, GLfloat green, GLfloat blue);
+FGAPI GLfloat FGAPIENTRY glutGetColor(int color, int component);
+FGAPI void    FGAPIENTRY glutCopyColormap(int window);
+
+/*
+ * Misc keyboard and joystick functions, see freeglut_misc.c
+ */
+FGAPI void    FGAPIENTRY glutIgnoreKeyRepeat(int ignore);
+FGAPI void    FGAPIENTRY glutSetKeyRepeat(int repeatMode);    /* DEPRECATED 11/4/02 - Do not use */
+FGAPI void    FGAPIENTRY glutForceJoystickFunc(void);
+
+/*
+ * Misc functions, see freeglut_misc.c
+ */
+FGAPI int     FGAPIENTRY glutExtensionSupported(const char *extension);
+FGAPI void    FGAPIENTRY glutReportErrors(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*** END OF FILE ***/
+
+#endif /* __FREEGLUT_STD_H__ */
+
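For readers skimming this vendored freeglut header, the short sketch below (not part of the header and not built by this PR) shows how the initialization, window, display-callback and geometry entry points declared above are typically wired together. It assumes the headers are reachable on the include path as <GL/glut.h> (for example through the Common/GL directory added in this change) and that an OpenGL-capable display is available; the window title, size and teapot scale are arbitrary choices for the sketch.

    /* Minimal usage sketch for the declarations above (illustrative only). */
    #include <GL/glut.h>   /* pulls in the freeglut/GLUT declarations shown above */

    /* Display callback: clear the window and draw a wireframe teapot. */
    static void display(void)
    {
        glClear(GL_COLOR_BUFFER_BIT);
        glutWireTeapot(0.5);     /* declared in the geometry section above */
        glutSwapBuffers();       /* needed because GLUT_DOUBLE is requested below */
    }

    int main(int argc, char **argv)
    {
        glutInit(&argc, argv);                        /* initialization functions */
        glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA); /* display mode bit masks */
        glutInitWindowSize(640, 480);
        glutCreateWindow("freeglut sketch");          /* window management */
        glutDisplayFunc(display);                     /* window-specific callback */
        glutMainLoop();                               /* does not return in classic GLUT */
        return 0;
    }

On a typical Linux setup this would be built with something like `gcc demo.c -lglut -lGLU -lGL`, although the exact link line depends on the local GL stack.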

The diff is not shown because the file is too large.
+ 14457 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glew.h


The diff is not shown because the file is too large.
+ 7125 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glext.h


+ 597 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glut.h

@@ -0,0 +1,597 @@
+#ifndef __glut_h__
+#define __glut_h__
+
+/* Copyright (c) Mark J. Kilgard, 1994, 1995, 1996, 1998. */
+
+/* This program is freely distributable without licensing fees  and is
+   provided without guarantee or warrantee expressed or  implied. This
+   program is -not- in the public domain. */
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+
+/* GLUT 3.7 now tries to avoid including <windows.h>
+   to avoid name space pollution, but Win32's <GL/gl.h>
+   needs APIENTRY and WINGDIAPI defined properly. */
+# if 0
+#  define  WIN32_LEAN_AND_MEAN
+#  include <windows.h>
+# else
+/* XXX This is from Win32's <windef.h> */
+#  ifndef APIENTRY
+#   define GLUT_APIENTRY_DEFINED
+#   if (_MSC_VER >= 800) || defined(_STDCALL_SUPPORTED)
+#    define APIENTRY    __stdcall
+#   else
+#    define APIENTRY
+#   endif
+#  endif
+/* XXX This is from Win32's <winnt.h> */
+#  ifndef CALLBACK
+#   if (defined(_M_MRX000) || defined(_M_IX86) || defined(_M_ALPHA) || defined(_M_PPC)) && !defined(MIDL_PASS)
+#    define CALLBACK __stdcall
+#   else
+#    define CALLBACK
+#   endif
+#  endif
+/* XXX This is from Win32's <wingdi.h> and <winnt.h> */
+#  ifndef WINGDIAPI
+#   define GLUT_WINGDIAPI_DEFINED
+#   define WINGDIAPI __declspec(dllimport)
+#  endif
+/* XXX This is from Win32's <ctype.h> */
+#  ifndef _WCHAR_T_DEFINED
+typedef unsigned short wchar_t;
+#   define _WCHAR_T_DEFINED
+#  endif
+# endif
+
+#pragma comment (lib, "winmm.lib")     /* link with Windows MultiMedia lib */
+#pragma comment (lib, "opengl32.lib")  /* link with Microsoft OpenGL lib */
+#pragma comment (lib, "glu32.lib")     /* link with OpenGL Utility lib */
+#pragma message("Note: including lib: glut32.lib\n")
+#pragma comment (lib, "glut32.lib")    /* link with Win32 GLUT lib */
+
+#pragma warning (disable:4244)  /* Disable bogus conversion warnings. */
+#pragma warning (disable:4305)  /* VC++ 5.0 version of above warning. */
+
+#endif
+
+#include <GL/gl.h>
+#include <GL/glu.h>
+
+/* define APIENTRY and CALLBACK to null string if we aren't on Win32 */
+#if !defined(WIN32)
+#define APIENTRY
+#define GLUT_APIENTRY_DEFINED
+#define CALLBACK
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ GLUT API revision history:
+
+ GLUT_API_VERSION is updated to reflect incompatible GLUT
+ API changes (interface changes, semantic changes, deletions,
+ or additions).
+
+ GLUT_API_VERSION=1  First public release of GLUT.  11/29/94
+
+ GLUT_API_VERSION=2  Added support for OpenGL/GLX multisampling,
+ extension.  Supports new input devices like tablet, dial and button
+ box, and Spaceball.  Easy to query OpenGL extensions.
+
+ GLUT_API_VERSION=3  glutMenuStatus added.
+
+ GLUT_API_VERSION=4  glutInitDisplayString, glutWarpPointer,
+ glutBitmapLength, glutStrokeLength, glutWindowStatusFunc, dynamic
+ video resize subAPI, glutPostWindowRedisplay, glutKeyboardUpFunc,
+ glutSpecialUpFunc, glutIgnoreKeyRepeat, glutSetKeyRepeat,
+ glutJoystickFunc, glutForceJoystickFunc (NOT FINALIZED!).
+**/
+#ifndef GLUT_API_VERSION  /* allow this to be overridden */
+#define GLUT_API_VERSION        3
+#endif
+
+/**
+ GLUT implementation revision history:
+
+ GLUT_XLIB_IMPLEMENTATION is updated to reflect both GLUT
+ API revisions and implementation revisions (ie, bug fixes).
+
+ GLUT_XLIB_IMPLEMENTATION=1  mjk's first public release of
+ GLUT Xlib-based implementation.  11/29/94
+
+ GLUT_XLIB_IMPLEMENTATION=2  mjk's second public release of
+ GLUT Xlib-based implementation providing GLUT version 2
+ interfaces.
+
+ GLUT_XLIB_IMPLEMENTATION=3  mjk's GLUT 2.2 images. 4/17/95
+
+ GLUT_XLIB_IMPLEMENTATION=4  mjk's GLUT 2.3 images. 6/?/95
+
+ GLUT_XLIB_IMPLEMENTATION=5  mjk's GLUT 3.0 images. 10/?/95
+
+ GLUT_XLIB_IMPLEMENTATION=7  mjk's GLUT 3.1+ with glutWarpPointer.  7/24/96
+
+ GLUT_XLIB_IMPLEMENTATION=8  mjk's GLUT 3.1+ with glutWarpPointer
+ and video resize.  1/3/97
+
+ GLUT_XLIB_IMPLEMENTATION=9 mjk's GLUT 3.4 release with early GLUT 4 routines.
+
+ GLUT_XLIB_IMPLEMENTATION=11 Mesa 2.5's GLUT 3.6 release.
+
+ GLUT_XLIB_IMPLEMENTATION=12 mjk's GLUT 3.6 release with early GLUT 4 routines + signal handling.
+
+ GLUT_XLIB_IMPLEMENTATION=13 mjk's GLUT 3.7 release with GameGLUT support.
+**/
+#ifndef GLUT_XLIB_IMPLEMENTATION  /* Allow this to be overridden. */
+#define GLUT_XLIB_IMPLEMENTATION    13
+#endif
+
+/* Display mode bit masks. */
+#define GLUT_RGB            0
+#define GLUT_RGBA           GLUT_RGB
+#define GLUT_INDEX          1
+#define GLUT_SINGLE         0
+#define GLUT_DOUBLE         2
+#define GLUT_ACCUM          4
+#define GLUT_ALPHA          8
+#define GLUT_DEPTH          16
+#define GLUT_STENCIL            32
+#if (GLUT_API_VERSION >= 2)
+#define GLUT_MULTISAMPLE        128
+#define GLUT_STEREO         256
+#endif
+#if (GLUT_API_VERSION >= 3)
+#define GLUT_LUMINANCE          512
+#endif
+
+/* Mouse buttons. */
+#define GLUT_LEFT_BUTTON        0
+#define GLUT_MIDDLE_BUTTON      1
+#define GLUT_RIGHT_BUTTON       2
+
+/* Mouse button  state. */
+#define GLUT_DOWN           0
+#define GLUT_UP             1
+
+#if (GLUT_API_VERSION >= 2)
+/* function keys */
+#define GLUT_KEY_F1         1
+#define GLUT_KEY_F2         2
+#define GLUT_KEY_F3         3
+#define GLUT_KEY_F4         4
+#define GLUT_KEY_F5         5
+#define GLUT_KEY_F6         6
+#define GLUT_KEY_F7         7
+#define GLUT_KEY_F8         8
+#define GLUT_KEY_F9         9
+#define GLUT_KEY_F10            10
+#define GLUT_KEY_F11            11
+#define GLUT_KEY_F12            12
+/* directional keys */
+#define GLUT_KEY_LEFT           100
+#define GLUT_KEY_UP         101
+#define GLUT_KEY_RIGHT          102
+#define GLUT_KEY_DOWN           103
+#define GLUT_KEY_PAGE_UP        104
+#define GLUT_KEY_PAGE_DOWN      105
+#define GLUT_KEY_HOME           106
+#define GLUT_KEY_END            107
+#define GLUT_KEY_INSERT         108
+#endif
+
+/* Entry/exit  state. */
+#define GLUT_LEFT           0
+#define GLUT_ENTERED            1
+
+/* Menu usage  state. */
+#define GLUT_MENU_NOT_IN_USE        0
+#define GLUT_MENU_IN_USE        1
+
+/* Visibility  state. */
+#define GLUT_NOT_VISIBLE        0
+#define GLUT_VISIBLE            1
+
+/* Window status  state. */
+#define GLUT_HIDDEN         0
+#define GLUT_FULLY_RETAINED     1
+#define GLUT_PARTIALLY_RETAINED     2
+#define GLUT_FULLY_COVERED      3
+
+/* Color index component selection values. */
+#define GLUT_RED            0
+#define GLUT_GREEN          1
+#define GLUT_BLUE           2
+
+/* Layers for use. */
+#define GLUT_NORMAL         0
+#define GLUT_OVERLAY            1
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+/* Stroke font constants (use these in GLUT program). */
+#define GLUT_STROKE_ROMAN       ((void*)0)
+#define GLUT_STROKE_MONO_ROMAN      ((void*)1)
+
+/* Bitmap font constants (use these in GLUT program). */
+#define GLUT_BITMAP_9_BY_15     ((void*)2)
+#define GLUT_BITMAP_8_BY_13     ((void*)3)
+#define GLUT_BITMAP_TIMES_ROMAN_10  ((void*)4)
+#define GLUT_BITMAP_TIMES_ROMAN_24  ((void*)5)
+#if (GLUT_API_VERSION >= 3)
+#define GLUT_BITMAP_HELVETICA_10    ((void*)6)
+#define GLUT_BITMAP_HELVETICA_12    ((void*)7)
+#define GLUT_BITMAP_HELVETICA_18    ((void*)8)
+#endif
+#else
+/* Stroke font opaque addresses (use constants instead in source code). */
+extern void *glutStrokeRoman;
+extern void *glutStrokeMonoRoman;
+
+/* Stroke font constants (use these in GLUT program). */
+#define GLUT_STROKE_ROMAN       (&glutStrokeRoman)
+#define GLUT_STROKE_MONO_ROMAN      (&glutStrokeMonoRoman)
+
+/* Bitmap font opaque addresses (use constants instead in source code). */
+extern void *glutBitmap9By15;
+extern void *glutBitmap8By13;
+extern void *glutBitmapTimesRoman10;
+extern void *glutBitmapTimesRoman24;
+extern void *glutBitmapHelvetica10;
+extern void *glutBitmapHelvetica12;
+extern void *glutBitmapHelvetica18;
+
+/* Bitmap font constants (use these in GLUT program). */
+#define GLUT_BITMAP_9_BY_15     (&glutBitmap9By15)
+#define GLUT_BITMAP_8_BY_13     (&glutBitmap8By13)
+#define GLUT_BITMAP_TIMES_ROMAN_10  (&glutBitmapTimesRoman10)
+#define GLUT_BITMAP_TIMES_ROMAN_24  (&glutBitmapTimesRoman24)
+#if (GLUT_API_VERSION >= 3)
+#define GLUT_BITMAP_HELVETICA_10    (&glutBitmapHelvetica10)
+#define GLUT_BITMAP_HELVETICA_12    (&glutBitmapHelvetica12)
+#define GLUT_BITMAP_HELVETICA_18    (&glutBitmapHelvetica18)
+#endif
+#endif
+
+/* glutGet parameters. */
+#define GLUT_WINDOW_X           100
+#define GLUT_WINDOW_Y           101
+#define GLUT_WINDOW_WIDTH       102
+#define GLUT_WINDOW_HEIGHT      103
+#define GLUT_WINDOW_BUFFER_SIZE     104
+#define GLUT_WINDOW_STENCIL_SIZE    105
+#define GLUT_WINDOW_DEPTH_SIZE      106
+#define GLUT_WINDOW_RED_SIZE        107
+#define GLUT_WINDOW_GREEN_SIZE      108
+#define GLUT_WINDOW_BLUE_SIZE       109
+#define GLUT_WINDOW_ALPHA_SIZE      110
+#define GLUT_WINDOW_ACCUM_RED_SIZE  111
+#define GLUT_WINDOW_ACCUM_GREEN_SIZE    112
+#define GLUT_WINDOW_ACCUM_BLUE_SIZE 113
+#define GLUT_WINDOW_ACCUM_ALPHA_SIZE    114
+#define GLUT_WINDOW_DOUBLEBUFFER    115
+#define GLUT_WINDOW_RGBA        116
+#define GLUT_WINDOW_PARENT      117
+#define GLUT_WINDOW_NUM_CHILDREN    118
+#define GLUT_WINDOW_COLORMAP_SIZE   119
+#if (GLUT_API_VERSION >= 2)
+#define GLUT_WINDOW_NUM_SAMPLES     120
+#define GLUT_WINDOW_STEREO      121
+#endif
+#if (GLUT_API_VERSION >= 3)
+#define GLUT_WINDOW_CURSOR      122
+#endif
+#define GLUT_SCREEN_WIDTH       200
+#define GLUT_SCREEN_HEIGHT      201
+#define GLUT_SCREEN_WIDTH_MM        202
+#define GLUT_SCREEN_HEIGHT_MM       203
+#define GLUT_MENU_NUM_ITEMS     300
+#define GLUT_DISPLAY_MODE_POSSIBLE  400
+#define GLUT_INIT_WINDOW_X      500
+#define GLUT_INIT_WINDOW_Y      501
+#define GLUT_INIT_WINDOW_WIDTH      502
+#define GLUT_INIT_WINDOW_HEIGHT     503
+#define GLUT_INIT_DISPLAY_MODE      504
+#if (GLUT_API_VERSION >= 2)
+#define GLUT_ELAPSED_TIME       700
+#endif
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
+#define GLUT_WINDOW_FORMAT_ID       123
+#endif
+
+#if (GLUT_API_VERSION >= 2)
+/* glutDeviceGet parameters. */
+#define GLUT_HAS_KEYBOARD       600
+#define GLUT_HAS_MOUSE          601
+#define GLUT_HAS_SPACEBALL      602
+#define GLUT_HAS_DIAL_AND_BUTTON_BOX    603
+#define GLUT_HAS_TABLET         604
+#define GLUT_NUM_MOUSE_BUTTONS      605
+#define GLUT_NUM_SPACEBALL_BUTTONS  606
+#define GLUT_NUM_BUTTON_BOX_BUTTONS 607
+#define GLUT_NUM_DIALS          608
+#define GLUT_NUM_TABLET_BUTTONS     609
+#endif
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
+#define GLUT_DEVICE_IGNORE_KEY_REPEAT   610
+#define GLUT_DEVICE_KEY_REPEAT          611
+#define GLUT_HAS_JOYSTICK       612
+#define GLUT_OWNS_JOYSTICK      613
+#define GLUT_JOYSTICK_BUTTONS       614
+#define GLUT_JOYSTICK_AXES      615
+#define GLUT_JOYSTICK_POLL_RATE     616
+#endif
+
+#if (GLUT_API_VERSION >= 3)
+/* glutLayerGet parameters. */
+#define GLUT_OVERLAY_POSSIBLE           800
+#define GLUT_LAYER_IN_USE       801
+#define GLUT_HAS_OVERLAY        802
+#define GLUT_TRANSPARENT_INDEX      803
+#define GLUT_NORMAL_DAMAGED     804
+#define GLUT_OVERLAY_DAMAGED        805
+
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+/* glutVideoResizeGet parameters. */
+#define GLUT_VIDEO_RESIZE_POSSIBLE  900
+#define GLUT_VIDEO_RESIZE_IN_USE    901
+#define GLUT_VIDEO_RESIZE_X_DELTA   902
+#define GLUT_VIDEO_RESIZE_Y_DELTA   903
+#define GLUT_VIDEO_RESIZE_WIDTH_DELTA   904
+#define GLUT_VIDEO_RESIZE_HEIGHT_DELTA  905
+#define GLUT_VIDEO_RESIZE_X     906
+#define GLUT_VIDEO_RESIZE_Y     907
+#define GLUT_VIDEO_RESIZE_WIDTH     908
+#define GLUT_VIDEO_RESIZE_HEIGHT    909
+#endif
+
+/* glutUseLayer parameters. */
+#define GLUT_NORMAL         0
+#define GLUT_OVERLAY            1
+
+/* glutGetModifiers return mask. */
+#define GLUT_ACTIVE_SHIFT               1
+#define GLUT_ACTIVE_CTRL                2
+#define GLUT_ACTIVE_ALT                 4
+
+/* glutSetCursor parameters. */
+/* Basic arrows. */
+#define GLUT_CURSOR_RIGHT_ARROW     0
+#define GLUT_CURSOR_LEFT_ARROW      1
+/* Symbolic cursor shapes. */
+#define GLUT_CURSOR_INFO        2
+#define GLUT_CURSOR_DESTROY     3
+#define GLUT_CURSOR_HELP        4
+#define GLUT_CURSOR_CYCLE       5
+#define GLUT_CURSOR_SPRAY       6
+#define GLUT_CURSOR_WAIT        7
+#define GLUT_CURSOR_TEXT        8
+#define GLUT_CURSOR_CROSSHAIR       9
+/* Directional cursors. */
+#define GLUT_CURSOR_UP_DOWN     10
+#define GLUT_CURSOR_LEFT_RIGHT      11
+/* Sizing cursors. */
+#define GLUT_CURSOR_TOP_SIDE        12
+#define GLUT_CURSOR_BOTTOM_SIDE     13
+#define GLUT_CURSOR_LEFT_SIDE       14
+#define GLUT_CURSOR_RIGHT_SIDE      15
+#define GLUT_CURSOR_TOP_LEFT_CORNER 16
+#define GLUT_CURSOR_TOP_RIGHT_CORNER    17
+#define GLUT_CURSOR_BOTTOM_RIGHT_CORNER 18
+#define GLUT_CURSOR_BOTTOM_LEFT_CORNER  19
+/* Inherit from parent window. */
+#define GLUT_CURSOR_INHERIT     100
+/* Blank cursor. */
+#define GLUT_CURSOR_NONE        101
+/* Fullscreen crosshair (if available). */
+#define GLUT_CURSOR_FULL_CROSSHAIR  102
+#endif
+
+/* GLUT initialization sub-API. */
+extern void APIENTRY glutInit(int *argcp, char **argv);
+extern void APIENTRY glutInitDisplayMode(unsigned int mode);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+extern void APIENTRY glutInitDisplayString(const char *string);
+#endif
+extern void APIENTRY glutInitWindowPosition(int x, int y);
+extern void APIENTRY glutInitWindowSize(int width, int height);
+extern void APIENTRY glutMainLoop(void);
+
+/* GLUT window sub-API. */
+extern int APIENTRY glutCreateWindow(const char *title);
+extern int APIENTRY glutCreateSubWindow(int win, int x, int y, int width, int height);
+extern void APIENTRY glutDestroyWindow(int win);
+extern void APIENTRY glutPostRedisplay(void);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11)
+extern void APIENTRY glutPostWindowRedisplay(int win);
+#endif
+extern void APIENTRY glutSwapBuffers(void);
+extern int APIENTRY glutGetWindow(void);
+extern void APIENTRY glutSetWindow(int win);
+extern void APIENTRY glutSetWindowTitle(const char *title);
+extern void APIENTRY glutSetIconTitle(const char *title);
+extern void APIENTRY glutPositionWindow(int x, int y);
+extern void APIENTRY glutReshapeWindow(int width, int height);
+extern void APIENTRY glutPopWindow(void);
+extern void APIENTRY glutPushWindow(void);
+extern void APIENTRY glutIconifyWindow(void);
+extern void APIENTRY glutShowWindow(void);
+extern void APIENTRY glutHideWindow(void);
+#if (GLUT_API_VERSION >= 3)
+extern void APIENTRY glutFullScreen(void);
+extern void APIENTRY glutSetCursor(int cursor);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+extern void APIENTRY glutWarpPointer(int x, int y);
+#endif
+
+/* GLUT overlay sub-API. */
+extern void APIENTRY glutEstablishOverlay(void);
+extern void APIENTRY glutRemoveOverlay(void);
+extern void APIENTRY glutUseLayer(GLenum layer);
+extern void APIENTRY glutPostOverlayRedisplay(void);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11)
+extern void APIENTRY glutPostWindowOverlayRedisplay(int win);
+#endif
+extern void APIENTRY glutShowOverlay(void);
+extern void APIENTRY glutHideOverlay(void);
+#endif
+
+/* GLUT menu sub-API. */
+extern int APIENTRY glutCreateMenu(void ( *)(int));
+extern void APIENTRY glutDestroyMenu(int menu);
+extern int APIENTRY glutGetMenu(void);
+extern void APIENTRY glutSetMenu(int menu);
+extern void APIENTRY glutAddMenuEntry(const char *label, int value);
+extern void APIENTRY glutAddSubMenu(const char *label, int submenu);
+extern void APIENTRY glutChangeToMenuEntry(int item, const char *label, int value);
+extern void APIENTRY glutChangeToSubMenu(int item, const char *label, int submenu);
+extern void APIENTRY glutRemoveMenuItem(int item);
+extern void APIENTRY glutAttachMenu(int button);
+extern void APIENTRY glutDetachMenu(int button);
+
+/* GLUT window callback sub-API. */
+extern void APIENTRY glutDisplayFunc(void (*func)(void));
+extern void APIENTRY glutReshapeFunc(void (*func)(int width, int height));
+extern void APIENTRY glutKeyboardFunc(void (*func)(unsigned char key, int x, int y));
+extern void APIENTRY glutMouseFunc(void (*func)(int button, int state, int x, int y));
+extern void APIENTRY glutMotionFunc(void (*func)(int x, int y));
+extern void APIENTRY glutPassiveMotionFunc(void (*func)(int x, int y));
+extern void APIENTRY glutEntryFunc(void (*func)(int state));
+extern void APIENTRY glutVisibilityFunc(void (*func)(int state));
+extern void APIENTRY glutIdleFunc(void (*func)(void));
+extern void APIENTRY glutTimerFunc(unsigned int millis, void (*func)(int value), int value);
+extern void APIENTRY glutMenuStateFunc(void (*func)(int state));
+#if (GLUT_API_VERSION >= 2)
+extern void APIENTRY glutSpecialFunc(void (*func)(int key, int x, int y));
+extern void APIENTRY glutSpaceballMotionFunc(void (*func)(int x, int y, int z));
+extern void APIENTRY glutSpaceballRotateFunc(void (*func)(int x, int y, int z));
+extern void APIENTRY glutSpaceballButtonFunc(void (*func)(int button, int state));
+extern void APIENTRY glutButtonBoxFunc(void (*func)(int button, int state));
+extern void APIENTRY glutDialsFunc(void (*func)(int dial, int value));
+extern void APIENTRY glutTabletMotionFunc(void (*func)(int x, int y));
+extern void APIENTRY glutTabletButtonFunc(void (*func)(int button, int state, int x, int y));
+#if (GLUT_API_VERSION >= 3)
+extern void APIENTRY glutMenuStatusFunc(void (*func)(int status, int x, int y));
+extern void APIENTRY glutOverlayDisplayFunc(void (*func)(void));
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+extern void APIENTRY glutWindowStatusFunc(void (*func)(int state));
+#endif
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
+extern void APIENTRY glutKeyboardUpFunc(void (*func)(unsigned char key, int x, int y));
+extern void APIENTRY glutSpecialUpFunc(void (*func)(int key, int x, int y));
+extern void APIENTRY glutJoystickFunc(void (*func)(unsigned int buttonMask, int x, int y, int z), int pollInterval);
+#endif
+#endif
+#endif
+
+/* GLUT color index sub-API. */
+extern void APIENTRY glutSetColor(int, GLfloat red, GLfloat green, GLfloat blue);
+extern GLfloat APIENTRY glutGetColor(int ndx, int component);
+extern void APIENTRY glutCopyColormap(int win);
+
+/* GLUT state retrieval sub-API. */
+extern int APIENTRY glutGet(GLenum type);
+extern int APIENTRY glutDeviceGet(GLenum type);
+#if (GLUT_API_VERSION >= 2)
+/* GLUT extension support sub-API */
+extern int APIENTRY glutExtensionSupported(const char *name);
+#endif
+#if (GLUT_API_VERSION >= 3)
+extern int APIENTRY glutGetModifiers(void);
+extern int APIENTRY glutLayerGet(GLenum type);
+#endif
+
+/* GLUT font sub-API */
+extern void APIENTRY glutBitmapCharacter(void *font, int character);
+extern int APIENTRY glutBitmapWidth(void *font, int character);
+extern void APIENTRY glutStrokeCharacter(void *font, int character);
+extern int APIENTRY glutStrokeWidth(void *font, int character);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+extern int APIENTRY glutBitmapLength(void *font, const unsigned char *string);
+extern int APIENTRY glutStrokeLength(void *font, const unsigned char *string);
+#endif
+
+/* GLUT pre-built models sub-API */
+extern void APIENTRY glutWireSphere(GLdouble radius, GLint slices, GLint stacks);
+extern void APIENTRY glutSolidSphere(GLdouble radius, GLint slices, GLint stacks);
+extern void APIENTRY glutWireCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
+extern void APIENTRY glutSolidCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
+extern void APIENTRY glutWireCube(GLdouble size);
+extern void APIENTRY glutSolidCube(GLdouble size);
+extern void APIENTRY glutWireTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
+extern void APIENTRY glutSolidTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
+extern void APIENTRY glutWireDodecahedron(void);
+extern void APIENTRY glutSolidDodecahedron(void);
+extern void APIENTRY glutWireTeapot(GLdouble size);
+extern void APIENTRY glutSolidTeapot(GLdouble size);
+extern void APIENTRY glutWireOctahedron(void);
+extern void APIENTRY glutSolidOctahedron(void);
+extern void APIENTRY glutWireTetrahedron(void);
+extern void APIENTRY glutSolidTetrahedron(void);
+extern void APIENTRY glutWireIcosahedron(void);
+extern void APIENTRY glutSolidIcosahedron(void);
+
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+/* GLUT video resize sub-API. */
+extern int APIENTRY glutVideoResizeGet(GLenum param);
+extern void APIENTRY glutSetupVideoResizing(void);
+extern void APIENTRY glutStopVideoResizing(void);
+extern void APIENTRY glutVideoResize(int x, int y, int width, int height);
+extern void APIENTRY glutVideoPan(int x, int y, int width, int height);
+
+/* GLUT debugging sub-API. */
+extern void APIENTRY glutReportErrors(void);
+#endif
+
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
+/* GLUT device control sub-API. */
+/* glutSetKeyRepeat modes. */
+#define GLUT_KEY_REPEAT_OFF     0
+#define GLUT_KEY_REPEAT_ON      1
+#define GLUT_KEY_REPEAT_DEFAULT     2
+
+/* Joystick button masks. */
+#define GLUT_JOYSTICK_BUTTON_A      1
+#define GLUT_JOYSTICK_BUTTON_B      2
+#define GLUT_JOYSTICK_BUTTON_C      4
+#define GLUT_JOYSTICK_BUTTON_D      8
+
+extern void APIENTRY glutIgnoreKeyRepeat(int ignore);
+extern void APIENTRY glutSetKeyRepeat(int repeatMode);
+extern void APIENTRY glutForceJoystickFunc(void);
+
+/* GLUT game mode sub-API. */
+/* glutGameModeGet. */
+#define GLUT_GAME_MODE_ACTIVE           0
+#define GLUT_GAME_MODE_POSSIBLE         1
+#define GLUT_GAME_MODE_WIDTH            2
+#define GLUT_GAME_MODE_HEIGHT           3
+#define GLUT_GAME_MODE_PIXEL_DEPTH      4
+#define GLUT_GAME_MODE_REFRESH_RATE     5
+#define GLUT_GAME_MODE_DISPLAY_CHANGED  6
+
+extern void APIENTRY glutGameModeString(const char *string);
+extern int APIENTRY glutEnterGameMode(void);
+extern void APIENTRY glutLeaveGameMode(void);
+extern int APIENTRY glutGameModeGet(GLenum mode);
+#endif
+
+#ifdef __cplusplus
+}
+
+#endif
+
+#ifdef GLUT_APIENTRY_DEFINED
+# undef GLUT_APIENTRY_DEFINED
+# undef APIENTRY
+#endif
+
+#ifdef GLUT_WINGDIAPI_DEFINED
+# undef GLUT_WINGDIAPI_DEFINED
+# undef WINGDIAPI
+#endif
+
+#endif                  /* __glut_h__ */
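As with the freeglut header above, here is a brief illustrative sketch (again, not shipped or built by this PR) of the classic GLUT callback and state-query API declared in glut.h: it registers display, reshape and keyboard callbacks and reads back GLUT_WINDOW_WIDTH, GLUT_WINDOW_HEIGHT and GLUT_ELAPSED_TIME through glutGet. The window title and the ESC-to-exit behaviour are arbitrary choices made for the sketch.

    /* Illustrative callback/state-query sketch using the declarations above. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <GL/glut.h>

    static void display(void)
    {
        glClear(GL_COLOR_BUFFER_BIT);
        glutSwapBuffers();
    }

    /* Reshape callback: GLUT passes the new window size. */
    static void reshape(int width, int height)
    {
        glViewport(0, 0, width, height);
        printf("window resized to %d x %d (glutGet reports %d x %d)\n",
               width, height,
               glutGet(GLUT_WINDOW_WIDTH), glutGet(GLUT_WINDOW_HEIGHT));
    }

    /* Keyboard callback: ESC (ASCII 27) exits, reporting elapsed time. */
    static void keyboard(unsigned char key, int x, int y)
    {
        (void)x; (void)y;
        if (key == 27) {
            printf("exiting after %d ms\n", glutGet(GLUT_ELAPSED_TIME));
            exit(0);
        }
    }

    int main(int argc, char **argv)
    {
        glutInit(&argc, argv);
        glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
        glutCreateWindow("glut.h sketch");
        glutDisplayFunc(display);
        glutReshapeFunc(reshape);
        glutKeyboardFunc(keyboard);
        glutMainLoop();
        return 0;
    }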

The diff is not shown because the file is too large.
+ 1121 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glxew.h


+ 0 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glxext.h


Too many files were changed in this diff, so some files are not shown.