
Added multi-GPU via cudaMemcpy and streams notebook and code

Anish Saxena 4 years ago
parent
commit
841afa04e0
91 changed files with 40500 additions and 227 deletions
  1. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/bin/x86_64/linux/release/p2pBandwidthLatencyTest
  2. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png
  3. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/domain_decomposition.png
  4. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/gpu_programming_process.png
  5. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/halo_exchange.png
  6. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/intra_node_topology_map.png
  7. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_host_staging.png
  8. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_p2p_overview.png
  9. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_serialized.png
  10. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpyasync_parallel.png
  11. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_cli_sample_output.png
  12. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_overview.png
  13. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_single_gpu_analysis.png
  14. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvidia_smi_p2p_gpu0.png
  15. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/nvidia_smi_topo_output.png
  16. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/open_terminal_session.png
  17. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png
  18. 569 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
  19. 243 0
      hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb
  20. 9 22
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/Makefile
  21. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy
  22. 462 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy.cu
  23. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_nvlink_report.qdrep
  24. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_nvlink_report.sqlite
  25. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_sys_report.qdrep
  26. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_sys_report.sqlite
  27. 450 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_streams.cu
  28. 22 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut.h
  29. 115 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_ext.h
  30. 547 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_std.h
  31. 14457 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glew.h
  32. 7125 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glext.h
  33. 597 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glut.h
  34. 1121 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glxew.h
  35. 805 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glxext.h
  36. 958 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/wglew.h
  37. 696 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/wglext.h
  38. 197 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Exceptions.h
  39. 155 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Image.h
  40. 80 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageAllocatorsCPU.h
  41. 1139 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageAllocatorsNPP.h
  42. 149 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageIO.h
  43. 171 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagePacked.h
  44. 121 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagesCPU.h
  45. 149 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagesNPP.h
  46. 126 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Pixel.h
  47. 168 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Signal.h
  48. 66 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalAllocatorsCPU.h
  49. 684 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalAllocatorsNPP.h
  50. 107 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalsCPU.h
  51. 113 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalsNPP.h
  52. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u.raw
  53. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u_Gray.raw
  54. 1 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Lena_512x512_8u_Gray.raw
  55. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB2_1024x683_8u.raw
  56. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_1280x720_8u.raw
  57. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_METAL_509x335_8u.raw
  58. 285 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Rocks_512x512_8u_Gray.raw
  59. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/lena_512x512_8u.raw
  60. 470 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/drvapi_error_string.h
  61. 160 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/dynlink_d3d11.h
  62. 151 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/exception.h
  63. 967 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_cuda.h
  64. 405 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_cuda_drvapi.h
  65. 166 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_cusolver.h
  66. 59 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_functions.h
  67. 267 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_gl.h
  68. 1001 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_image.h
  69. 1469 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_math.h
  70. 543 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_multiprocess.cpp
  71. 120 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_multiprocess.h
  72. 428 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_nvJPEG.hxx
  73. 368 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_string.h
  74. 465 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_timer.h
  75. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/freeglut.lib
  76. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/glew64.lib
  77. 200 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/nvrtc_helper.h
  78. 124 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/rendercheck_d3d11.cpp
  79. 52 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/rendercheck_d3d11.h
  80. 337 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile
  81. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest
  82. 695 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu
  83. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.o
  84. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi
  85. 72 134
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi.cu
  86. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi_report.qdrep
  87. BIN
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi_report.sqlite
  88. 13 0
      hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/temp
  89. 9 14
      hpc/multi_gpu_nways/labs/CFD/English/introduction.ipynb
  90. 0 57
      hpc/multi_gpu_nways/slurm-165592.out
  91. 72 0
      hpc/multi_gpu_nways/slurm-171483.out

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/bin/x86_64/linux/release/p2pBandwidthLatencyTest


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/domain_decomposition.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/gpu_programming_process.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/halo_exchange.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/intra_node_topology_map.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_host_staging.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_p2p_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpy_serialized.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/memcpyasync_parallel.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_cli_sample_output.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_overview.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nsys_single_gpu_analysis.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvidia_smi_p2p_gpu0.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/nvidia_smi_topo_output.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/open_terminal_session.png


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png


+ 569 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/memcpy/streams.ipynb

@@ -0,0 +1,569 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "dd0ae66a",
+   "metadata": {},
+   "source": [
+    "Before we begin, let's get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7d483e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e4ddba18",
+   "metadata": {},
+   "source": [
+    "# Learning Objectives\n",
+    "\n",
+    "In this tutorial, the goal is to:\n",
+    "* Parallelize the single-GPU code using CUDA Memcpy and streams\n",
+    "* Understand intra-node topology and underlying technologies like GPUDirect P2P and their implication on program performance\n",
+    "\n",
+    "# Multi-GPU Programming\n",
+    "\n",
+    "In this section we first cover the principle behind decomposing data among the GPUs, known as domain decomposition. Then, we understand and implement the baseline multi-GPU code using `cudaSetDevice` and `cudaMemcpy` functions. \n",
+    "\n",
+    "### Domain Decomposition\n",
+    "\n",
+    "Before we begin, we define two important terms:\n",
+    "\n",
+    "* **Latency:** The amount of time it takes to take a unit of data from point A to point B. For example, if 4B of data can be transferred from point A to B in 4 $\\mu$s, that is the latency of transfer.\n",
+    "* **Bandwidth:** The amount of data that can be transferred from point A to point B in a unit of time. For example, if the width of the bus is 64KiB and latency of transfer between point A and B is 4 $\\mu$s, the bandwidth is 64KiB * (1/4$\\mu$s) = 1.6 GiB/s.\n",
+    "\n",
+    "To parallelize our application to multi-GPUs, we first review the different methods of domain decomposition available to us for splitting the data among the GPUs, thereby distributing the work. Broadly, we can divide data into either stripes or tiles.\n",
+    "\n",
+    "* **Stripes**: They minimize the number of neighbours, require communication among less neighbours, and are optimal for latency bound communication.\n",
+    "\n",
+    "* **Tiles**: They minimize surface area/ volume ratio of the grid, require communicating less data, and are optimal for bandwidth bound communication.\n",
+    "\n",
+    "![domain_decomposition](../../images/domain_decomposition.png)\n",
+    "\n",
+    "When we divide the global grid between GPUs, only the boundaries of each GPU-local grid need to be communicated with the neighboring GPUs, as they need the updated grid-point values for the next iteration. Therefore, we use horizontal stripes (as C/ C++ are row-major) in our tutorials for domain decomposition, enabling data parallelism.\n",
+    "\n",
+    "### Halo Exchange\n",
+    "\n",
+    "We term the exchange of top and bottom rows after each iterations the \"halo exchange\". Review the image below and notice that we update the topmost and bottomost rows of the grid to implement the periodic boundary condition. Recall that the left and right columns of the grid constitute Dirichlet boundary conditions (that is, constant value).\n",
+    "\n",
+    "![halo_exchange](../../images/halo_exchange.png)\n",
+    "\n",
+    "## CUDA concepts: Part 1\n",
+    "\n",
+    "### Setting the GPU\n",
+    "\n",
+    "To verify that our system has multiple GPUs in each node, run the command below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c49697bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "62d045bd",
+   "metadata": {},
+   "source": [
+    "The command should output more than one GPU. Inside a program, the number of GPU in the node can be obtained using the `cudaGetDeviceCount(int *count)` function and to perform any task, like running a CUDA kernel, copy operation, etc. on a particular GPU, we use the `cudaSetDevice(int device)` function.\n",
+    "\n",
+    "### Copying between GPUs\n",
+    "\n",
+    "The `cudaMemcpy` function supports GPU to GPU copy using the `cudaMemcpyDeviceToDevice` flag and the source and destination memory addresses should reside in GPU devices. \n",
+    "\n",
+    "For example, if we want to copy 1000 floats from the array `arr_gpu_0` allocated on GPU 0 to the array `arr_gpu_1`, the function call is:\n",
+    "\n",
+    "```c\n",
+    "cudaMemcpy(arr_gpu_1, arr_gpu_0, 1000 * sizeof(float), cudaMemcpyDeviceToDevice);\n",
+    "```\n",
+    "\n",
+    "Recall that CUDA kernel calls made from the host are non-blocking (asynchronous) by default. That is, the control may return back to the host thread before the device kernel finishes execution. To perform the halo exchange, we need to perform copy operations between each GPU and its neighbours. However, for large copy sizes, `cudaMemcpy` is blocking with respect to the host. \n",
+    "\n",
+    "Thus, we cannot use the following code snippet:\n",
+    "\n",
+    "```c\n",
+    "for (int i = 0; i < 2; i++) {\n",
+    "    // Set current device\n",
+    "    cudaSetDevice(i);\n",
+    "    // Define row number of top and bottom neighbours, etc.\n",
+    "    TopNeighbour = ...; BotNeighbour = ...; // and so-on\n",
+    "    // Launch device kernel on GPU i\n",
+    "    jacobi_kernel<<<dim_grid, dim_block>>>(...);\n",
+    "    // Halo exchange\n",
+    "    cudaMemcpy(grid_rows[TopNeighbour], grid_rows[myTop], size, cudaMemcpyDeviceToDevice);\n",
+    "    cudaMemcpy(grid_rows[BotNeighbour], grid_rows[myBot], size, cudaMemcpyDeviceToDevice);\n",
+    "    // Norm check, swapping current and previous grid arrays, etc.\n",
+    "} // Serializes operations with respect to the host\n",
+    "```\n",
+    "\n",
+    "As this code results in serialized execution:\n",
+    "\n",
+    "![memcpy_serialized](../../images/memcpy_serialized.png)\n",
+    "\n",
+    "### Asynchronous operations\n",
+    "\n",
+    "Instead of `cudaMemcpy`, we can use the `cudaMemcpyAsync` function which is asynchronous with respect to the host. This allows the host to launch device kernels and copy operations concurrently, enabling parallel execution across GPUs. \n",
+    "\n",
+    "The correct code snippet is as follows:\n",
+    "\n",
+    "```c\n",
+    "for (int i = 0; i < 2; i++) {\n",
+    "    // Set current device\n",
+    "    cudaSetDevice(i);\n",
+    "    // Launch device kernel on GPU i\n",
+    "    jacobi_kernel<<<dim_grid, dim_block>>>(...);\n",
+    "}\n",
+    "for (int i = 0; i < 2; i++) {\n",
+    "    // Define row number of top and bottom neighbours, etc.\n",
+    "    TopNeighbour = ...; BotNeighbour = ...; // and so-on\n",
+    "    // Halo exchange, notice the use of Async function\n",
+    "    cudaMemcpyAsync(grid_rows[TopNeighbour], grid_rows[myTop], size, cudaMemcpyDeviceToDevice);\n",
+    "    cudaMemcpyAsync(grid_rows[BotNeighbour], grid_rows[myBot], size, cudaMemcpyDeviceToDevice);\n",
+    "    // Norm check, swapping current and previous grid arrays, etc.\n",
+    "} // Parallel execution across multiple GPUs\n",
+    "```\n",
+    "\n",
+    "And the execution time of the application is reduced:\n",
+    "\n",
+    "![memcpyasync_parallel](../../images/memcpyasync_parallel.png)\n",
+    "\n",
+    "## Implementation exercise: Part 1\n",
+    "\n",
+    "Now, let's parallelize our code across multiple GPUs by using `cudaSetDevice` and `cudaMemcpyAsync` operations. Open the [jacobi_memcpy.cu](../../source_code/memcpy/jacobi_memcpy.cu) file by using the `File` $\\rightarrow$ `Open...` option.\n",
+    "\n",
+    "Understand the flow of the program from within the `main` function. Review the following pre-Jacobi-computation steps:\n",
+    "\n",
+    "1. Computation of the memory chunk size to be allocated on each GPU stored in the `chunk_size` integer array.\n",
+    "2. Allocation of memory on each GPU: Notice the use of array pointers like `a_new`, `l2_norm_d`, `iy_start`, etc. that point to device arrays allocated on GPU pointed to by `dev_id` variable.\n",
+    "3. Initialization of Dirichlet boundary conditions on left and right boundaries.\n",
+    "4. Share of initial top and bottom local grid-point values between neighbours.\n",
+    "\n",
+    "\n",
+    "Now, within the iterative Jacobi loop (the `while` loop), implement the following marked as `TODO: Part 1-`:\n",
+    "\n",
+    "1. Set current GPU and call device kernel with correct device arrays in function arguments.\n",
+    "2. Asynchronously copy GPU-local L2 norm back to CPU and implement top and bottom halo exchanges.\n",
+    "3. Synchronize the devices at the end of each iteration using `cudaDeviceSynchronize` function.\n",
+    "\n",
+    "Review the topic on Asynchronous Operations above if in doubt. Recall the utility of using separate `for` loops for launching device kernels and initiating copy operations.\n",
+    "\n",
+    "After implementing these, let's compile the code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce6dc6ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/memcpy && make clean && make jacobi_memcpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "698ab130",
+   "metadata": {},
+   "source": [
+    "Ensure there are no compiler warnings or errors. Validate the implementation by running the binary:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50debc4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/memcpy && ./jacobi_memcpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e276f70",
+   "metadata": {},
+   "source": [
+    "The last couple of lines of the output will give the number and IDs of GPUs used, execution timings, speedup, and efficiency metrics. Review Metrics of Interest section in [single GPU overview](../single_gpu/single_gpu_overview.ipynb) tutorial for more information). We tested the code on a DGX-1 system with 8 Tesla V100 16GB GPUs, and we got the following output:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8. Using GPU ID: 0, 1, 2, 3, 4, 5, 6, 7, \n",
+    "16384x16384: 1 GPU:   5.0272 s, 8 GPUs:   1.1376 s, speedup:     4.42, efficiency:    55.24\n",
+    "```\n",
+    "\n",
+    "Notice that we got a speed-up of $4.42\\times$ using 8 GPUs and a corresponding efficiency of $55.24\\%$. The numbers will vary depending on number of available GPUs in your system, the communication topology, GPU type, etc.\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "Now, profile the execution with `nsys`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3187cdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/memcpy/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_memcpy_sys_report --force-overwrite true ./jacobi_memcpy -gpus 0,7"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c4ac727d",
+   "metadata": {},
+   "source": [
+    "In the profiler timeline, the first few seconds denote the single-GPU code running on one of the GPUs. This version is executed so we can compare the multi-GPU version with it and we have already analyzed it. Let's analyze the multi-GPU timeline:\n",
+    "\n",
+    "IMAGE LINK HERE\n",
+    "\n",
+    "NSYS DESCRIPTION HERE\n",
+    "\n",
+    "The solution for this exercise is present in `source_code/memcpy/solution` directory: [jacobi_memcpy.cu](../../source_code/memcpy/solution/jacobi_memcpy.cu)\n",
+    "\n",
+    "## CUDA concepts: Part 2\n",
+    "\n",
+    "### Host Staging of Copy Operations\n",
+    "\n",
+    "Using `cudaMemcpyAsync` instead of `cudaMemcpy` allows us to issue copy and compute operations on multiple GPUs concurrently. The path taken by the data in both the cases is denoted by the red arrow as follows:\n",
+    "\n",
+    "![memcpy_host_staging](../../images/memcpy_host_staging.png)\n",
+    "\n",
+    "That is, in the GPU-to-GPU memory copy, the data traverses from GPU 0 the PCIe bus to the CPU, where it is staged in a buffer before being copied to GPU 1. This is called \"host staging\" and it decreases the bandwidth while increasing the latency of the operation. If we eliminate host staging, we can usually improve the performance of our application.\n",
+    "\n",
+    "### Peer-to-Peer Memory Access\n",
+    "\n",
+    "P2P allows devices to address each other's memory from within device kernels and eliminates host staging by transferring data either through the PCIe switch or through NVLink as denoted by the red arrow below. \n",
+    "\n",
+    "![memcpy_p2p_overview](../../images/memcpy_p2p_overview.png)\n",
+    "\n",
+    "Peer-to-Peer (P2P) memory access requires GPUs to share a Unified Virtual Address Space (UVA). UVA means that a single address space is used for the host and all modern NVIDIA GPU devices (specifically, those with compute capibility of 2.0 or higher).\n",
+    "\n",
+    "This P2P memory access feature is supported between two devices if `cudaDeviceCanAccessPeer()` returns true for these two devices. P2P must be enabled between two devices by calling `cudaDeviceEnablePeerAccess()` as illustrated in the following code sample:\n",
+    "\n",
+    "```c\n",
+    "cudaSetDevice(currDevice);\n",
+    "int canAccessPeer = 0;\n",
+    "cudaDeviceCanAccessPeer(&canAccessPeer, currDevice, PeerDevice);\n",
+    "if (canAccessPeer) {\n",
+    "    cudaDeviceEnablePeerAccess(PeerDevice, 0);\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "Note that this enables a unidirectional P2P access where `currDevice` can perform memory access to `PeerDevice`. If we want `PeerDevice` to be able to access `currDevice` via P2P, then we need to use the code accordingly.\n",
+    "\n",
+    "First, let's check if P2P is supported between the GPUs:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f757d16c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi topo -p2p r"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7afbc209",
+   "metadata": {},
+   "source": [
+    "The `topo` sub-command requests information on the GPU communication topology, `-p2p` flag requests P2P status, and `r` asks whether P2P reads are supported. Change `r` to `w` to check whether writes are supported. We share our output on a DGX-1 system with 8 Tesla V100s, focusing on the capabilities of GPU 0:\n",
+    "\n",
+    "![nvidia_smi_p2p_gpu0](../../images/nvidia_smi_p2p_gpu0.png)\n",
+    "\n",
+    "This means GPU 0 can communicate via P2P with GPUs 1 through 4. For GPUs 5 through 7, it must use host staging.\n",
+    "\n",
+    "To check whether P2P via NVLink is supported, run the command below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1250c02c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi topo -p2p n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9d84934b",
+   "metadata": {},
+   "source": [
+    "In our DGX-1 system, the result is similar as before. Even if P2P via NVLink is not supported on your system, as long as `-p2p r` and `-p2p w` are supported between GPUs, P2P capability is available.\n",
+    "\n",
+    "## Implementation Exercise: Part 2\n",
+    "\n",
+    "Now, let us improve our program performance by enabling P2P access between GPUs, wherever possible. The `jacobi_memcpy.cu` code accepts a runtime argument `-p2p` which should enable P2P access between GPUs. \n",
+    "\n",
+    "Modify the code by searching for `TODO: Part 2` and enabling GPU `devices[dev_id]` to access peer GPUs `devices[top]` and `devices[bottom]`, whenever possible. \n",
+    "\n",
+    "Notice that the code snippet is within a `for` loop which sets and iterates over each GPU, which is why bidirectional P2P will be enabled. Take help from the code sample in the previous section.\n",
+    "\n",
+    "Now, let's compile the code again:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90e8da79",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/memcpy && make clean && make jacobi_memcpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd648c93",
+   "metadata": {},
+   "source": [
+    "Ensure there are no compiler warnings or errors. Validate the implementation by running the binary with P2P enabled:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed251978",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/memcpy && ./jacobi_memcpy -p2p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1acc2cc0",
+   "metadata": {},
+   "source": [
+    "The output we got on our DGX-1 system is:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 8. Using GPU ID: 0, 1, 2, 3, 4, 5, 6, 7, \n",
+    "16384x16384: 1 GPU:   4.4487 s, 8 GPUs:   0.8798 s, speedup:     5.06, efficiency:    63.21 \n",
+    "```\n",
+    "\n",
+    "Notice that the efficiency increased by about $8\\%$ to $63.21\\%$ compared to our baseline implementation. You can run the baseline again by removing the `-p2p` flag. Note that if P2P is not supported on your system, you will likely not experience any performance gain.\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "IMAGE LINK HERE\n",
+    "\n",
+    "NSYS DESCRIPTION HERE\n",
+    "\n",
+    "## Intra-Node Communication Topology\n",
+    "\n",
+    "Run the command below to display your node's GPU and NIC communication topology:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5be59a7a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi topo -m"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a81fa29e",
+   "metadata": {},
+   "source": [
+    "If the output is unclear, you can launch a Terminal session by clicking on `File` $\\rightarrow$ Open and following the steps as shown:\n",
+    "\n",
+    "![open_terminal_session](../../images/open_terminal_session.png)\n",
+    "\n",
+    "On our DGX-1 system, the output is as follows:\n",
+    "\n",
+    "![nvidia_smi_topo_output](../../images/nvidia_smi_topo_output.png)\n",
+    "\n",
+    "Focus one a particular row, say GPU 0. The output states that GPUs 1 through 4 are connected to it via NVLink (in addition to PCIe) and GPUs 5 through 7 are connected to it via PCIe as well as an \"SMP\" interconnect. We have a dual-socket system and the CPUs in these sockets are connected by an interconnect known as SMP interconnect.\n",
+    "\n",
+    "Thus, GPU 0 to GPU 5 communication happens via not just PCIe, but also over the inter-socket interconnect within the same node. Clearly, this is a longer path than say the one between GPU 0 and GPU 1, which are connected via NVLink directly. We will discuss the NIC to GPU connection in the inter-node section of this bootcamp.\n",
+    "\n",
+    "Even within the GPUs connected via NVLink, we see different annotations such as `NV1` and `NV2` that affect the communication bandwidth and hence the performance. In this section, we will explore the nuances associated with a diverse intra-node GPU communication topology like in the output above. Specifically, in our system, the communication topology is as follows:\n",
+    "\n",
+    "![dgx1_8x_tesla_v100_topo](../../images/dgx1_8x_tesla_v100_topo.png)\n",
+    "\n",
+    "Qualitatively, the bandwidth and latency vary with the topology as follows:\n",
+    "\n",
+    "![intra_node_topology_map](../../images/intra_node_topology_map.png)\n",
+    "\n",
+    "Host staging implies traversing through the CPU and the travel path taken is one of PHB, NODE, and SYS. In contrast, if the path taken is either NV1, NV2, or PIX, then P2P is available. PXB implies that the GPUs belong to different PCIe hubs and P2P is usually not supported in this case.\n",
+    "\n",
+    "A double NVLink connection provides twice the bandwidth compared to a single NVLink. \n",
+    "\n",
+    "For a pair of 2 GPUs, the peak bidirectional bandwidth are as follows:\n",
+    "* PCIe: Using PIX topology, 15.75GB/s for PCIe Gen 3.0 and 31.5GB/s for PCIe Gen 4.0.\n",
+    "* NVLink: Using NV# topology, 50GB/s per connection. So a double NVLink connection has 100GB/s peak bidirectional bandwidth.\n",
+    "\n",
+    "Let us understand what difference the underlying communication topology can make to the application performance in the following sub-section.\n",
+    "\n",
+    "**Note:** If your command output doesn't show any NVLink connection or if there's no difference in connection type (PIX, PXB, PHB, NODE, SYS, NV#) between any 2 pair of GPUs, then the communication bandwidth and latency will likely be the same between any pair and the following sub-sections will not display any performance difference.\n",
+    "\n",
+    "### Performance variation due to system topology\n",
+    "\n",
+    "So far, the code runs the multi-GPU version on all available GPUs in a node (8 in our case). We can supply the `-gpus` runtime flag to the binary to run our code on specific GPUs. If we want to run on only 2 GPUs, namely GPU 0 and GPU 3, we use the `-gpus 0,3` argument. \n",
+    "\n",
+    "Try to find the GPU pair with highest bandwidth available as per the table above and replace `0,3` with those GPUs, and then run the command below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ccd50a44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/memcpy && ./jacobi_memcpy -p2p -gpus 0,7"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "21c4eb06",
+   "metadata": {},
+   "source": [
+    "The efficiency would likely be higher than before due to less inter-GPU communication (each GPU does more wok instead). Our output is as follows:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 2. Using GPU ID: 0, 3, \n",
+    "16384x16384: 1 GPU:   4.4513 s, 2 GPUs:   2.2664 s, speedup:     1.96, efficiency:    98.20  \n",
+    "```\n",
+    "\n",
+    "Now, run the binary a pair of GPUs that have the lowest available bandwidth. In our case, we use GPU 0 and GPU 7. Our output is:\n",
+    "\n",
+    "```bash\n",
+    "Num GPUs: 2. Using GPU ID: 0, 7, \n",
+    "16384x16384: 1 GPU:   4.4529 s, 2 GPUs:   2.3454 s, speedup:     1.90, efficiency:    94.93  \n",
+    "```\n",
+    "\n",
+    "Now remove the `-p2p` flag and run the command again for GPUs 0 and 7. We didn't get any difference in performance. As you may recall, P2P is not possible between GPUs 0 and 7, so the underlying communication path doesn't change, resulting in same performance with and without the `-p2p` flag. The same can be confirmed by profiling the application and looking at the operations performed in the Nsight Systems timeline. \n",
+    "\n",
+    "![p2p_2_gpu_memcpy_nsys](../../images/p2p_2_gpu_memcpy_nsys.png)\n",
+    "\n",
+    "Try a few other GPU combinations and toggle P2P so see if the performance variation correlates with the table above. Also try reducing the grid size using `-nx` and `-ny` flags (to say 8192$\\times$8192) and see the effect on efficiency. \n",
+    "\n",
+    "### Benchmarking the system topology\n",
+    "\n",
+    "Our application is not very memory intensive. As is visible from the profiler output, $\\gt95\\%$ of the time in GPU is spent on computation. Therefore, to get a quantitative measure of latency and bandwidth impact due to topology, we run a micro-benchmark.\n",
+    "\n",
+    "**The p2pBandwidthLatencyTest micro-benchmark**\n",
+    "\n",
+    "p2pBandwidthLatencyTest is a part of [CUDA Samples GitHub repository](https://github.com/NVIDIA/cuda-samples) available to help CUDA developers. \n",
+    "\n",
+    "As the name suggests, this test measures the bandwidth and latency impact of P2P and underlying communication topology. Let's compile the benchmark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93fa162c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/p2pBandwidthLatencyTest/ && make clean && make"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "429bc0cf",
+   "metadata": {},
+   "source": [
+    "Now, let's run the benchmark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f607f88d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/p2pBandwidthLatencyTest/ && ./p2pBandwidthLatencyTest"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dacdaacc",
+   "metadata": {},
+   "source": [
+    "The first part of the benchmark gives device information and P2P access available from each GPU (similar to `nvidia-smi topo -m` command). Next, the benchmark measures the unidirectional and bidirectional bandwidth and latency with P2P disabled and enabled.\n",
+    "\n",
+    "We share partial results obtained in our DGX-1 system:\n",
+    "\n",
+    "```bash\n",
+    "Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n",
+    "   D\\D     0      1      2      3      4      5      6      7 \n",
+    "     0 783.95   9.56  14.43  14.46  14.47  14.24  14.51  14.43 \n",
+    "\n",
+    "Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n",
+    "   D\\D     0      1      2      3      4      5      6      7 \n",
+    "     0 784.87  48.49  48.49  96.85  96.90  14.25  14.54  14.49 \n",
+    "     \n",
+    "P2P=Disabled Latency Matrix (us)\n",
+    "   GPU     0      1      2      3      4      5      6      7 \n",
+    "     0   1.78  17.52  16.41  16.43  17.35  16.88  17.34  16.85 \n",
+    "     \n",
+    "P2P=Enabled Latency (P2P Writes) Matrix (us)\n",
+    "   GPU     0      1      2      3      4      5      6      7 \n",
+    "     0   1.76   1.62   1.61   2.01   2.02  18.44  19.15  19.34\n",
+    "```\n",
+    "\n",
+    "Our system is based on PCIe gen 3.0 with a peak maximum GPU-GPU PCIe banwidth of 15.75 GB/s. Let us analyze and understand these results:\n",
+    "\n",
+    "* GPU 0 and GPU 1/2: Connected by a single NVLink connection. By enabling P2P-\n",
+    "  - Bandwidth reaches close to the maximum peak of 50 GB/s.\n",
+    "  - Latency decreases by an order of magnitude.\n",
+    "* GPU 0 and GPU 3/4: Connected by a double NVLink connection. By enabling P2P-\n",
+    "  - Bandwidth reaches close to the maximum peak of 100 GB/s.\n",
+    "  - Latency decreases by an order of magnitude.\n",
+    "* GPU 0 and GPU 5/6/7: Connected by PCIe and SMP interconnect. By enabling P2P- \n",
+    "  - Bandwidth is unchanged.\n",
+    "  - Latency increases a marginally.\n",
+    "  \n",
+    "Correlate these results with the communication topology that can be displayed by usng `nvidia-smi topo -m` command and the qualtitative table in the previous section. They should be consistent with one another.\n",
+    "\n",
+    "In general, we should try to set the GPUs in an application such that a GPU can share data with its neighbours using a high-bandwidth, low-latency communication topology. Enabling P2P, when possible, usually improves the performance by eliminating host staging."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 243 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb

@@ -0,0 +1,243 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "39ad569e",
+   "metadata": {},
+   "source": [
+    "# The Application\n",
+    "\n",
+    "This section provides an overview of the scientific problem we focus on and the solver we employ. Then, we execute the single GPU version of the application program.\n",
+    "\n",
+    "### Laplace Equation\n",
+    "\n",
+    "Laplace Equation is a well-studied linear partial differential equation that governs steady state heat conduction, irrotational fluid flow, and many other phenomena. \n",
+    "\n",
+    "In this lab, we will consider the 2D Laplace Equation on a rectangle with Dirichlet boundary conditions on the left and right boundary and periodic boundary conditions on top and bottom boundary. We wish to solve the following equation:\n",
+    "\n",
+    "$\\Delta u(x,y) = 0\\;\\forall\\;(x,y)\\in\\Omega,\\delta\\Omega$\n",
+    "\n",
+    "### Jacobi Method\n",
+    "\n",
+    "The Jacobi method is an iterative algorithm to solve a linear system of strictly diagonally dominant equations. The governing Laplace equation is discretized and converted to a matrix amenable to Jacobi-method based solver.\n",
+    "\n",
+    "### The Code\n",
+    "\n",
+    "The GPU processing flow follows 3 key steps:\n",
+    "\n",
+    "1. Copy data from CPU to GPU\n",
+    "2. Launch GPU Kernel\n",
+    "3. Copy processed data back to CPU from GPU\n",
+    "\n",
+    "![gpu_programming_process](../../images/gpu_programming_process.png)\n",
+    "\n",
+    "Let's understand the single-GPU code first. The source code file is available here: [jacobi.cu](../../source_code/single_gpu/jacobi.cu).\n",
+    "\n",
+    "Alternatively, you can open the `File` menu and click on the `Open...` option which opens Jupyter's file explorer in a new tab. Then, navigate to `CFD/English/C/source_code/single_gpu/` directory in which you can view the `jacobi.cu` file. \n",
+    "\n",
+    "Similarly, have look at the [Makefile](../../source_code/single_gpu/Makefile). \n",
+    "\n",
+    "Refer to the `single_gpu(...)` function. The important steps at each iteration of the Jacobi Solver (that is, the `while` loop) are:\n",
+    "1. The norm is set to 0 using `cudaMemset`.\n",
+    "2. The device kernel `jacobi_kernel` is called to update the interier points.\n",
+    "3. The norm is copied back to the host using `cudaMemcpy` (DtoH), and\n",
+    "4. The periodic boundary conditions are re-applied for the next iteration using `cudaMemcpy` (DtoD).\n",
+    "\n",
+    "Note that we run the Jacobi solver for 1000 iterations over the grid.\n",
+    "\n",
+    "### Compilation and Execution\n",
+    "\n",
+    "Let's first get an overview of the CUDA driver version and the GPUs running on the server by executing the `nvidia-smi` command below. Highlight the cell below by clicking on it and then either hit `Ctrl+Enter` on the keyboard or click on the `Run` button on the toolbar above. The output will be visible below the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "abb46488",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f97f825b",
+   "metadata": {},
+   "source": [
+    "We will now compile the code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eac2daf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/single_gpu && make clean && make"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33345661",
+   "metadata": {},
+   "source": [
+    "Now, let us execute the program: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e234f430",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/single_gpu && ./jacobi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14bb863e",
+   "metadata": {},
+   "source": [
+    "The output reports the norm value every 100 iterations and the total execution time of the Jacobi Solver. The expected output is:\n",
+    "\n",
+    "```\n",
+    "Single GPU jacobi relaxation: 1000 iterations on 16384 x 16384 mesh\n",
+    "    0, 31.999022\n",
+    "  100, 0.897983\n",
+    "  200, 0.535684\n",
+    "  300, 0.395651\n",
+    "  400, 0.319039\n",
+    "  500, 0.269961\n",
+    "  600, 0.235509\n",
+    "  700, 0.209829\n",
+    "  800, 0.189854\n",
+    "  900, 0.173818\n",
+    "16384x16384: 1 GPU:   4.4512 s\n",
+    "```\n",
+    "\n",
+    "The execution time may differ depending on the GPU, but the norm value after every 100 iterations should be the same. The program accepts `-nx` and `-ny` flags to change the grid size (preferably a power of 2) and `-niter` flag to change the number of iterations.\n",
+    "\n",
+    "\n",
+    "# Profiling\n",
+    "\n",
+    "While the program in our labs gives the execution time in its output, it may not always be convinient to time the execution from within the program. Moreover, just timing the execution does not reveal the bottlenecks directly. For that purpose, we profile the program with NVIDIA's NSight Systems profiler's command-line interface (CLI), `nsys`. \n",
+    "\n",
+    "### NVIDIA Nsight Systems\n",
+    "\n",
+    "Nsight Systems profiler offers system-wide performance analysis in order to visualize application’s execution timeline and help identify optimization opportunities on a system with multiple CPUs and GPUs.\n",
+    "\n",
+    "#### Timeline\n",
+    "\n",
+    "![Nsight Systems timeline](../../images/nsys_overview.png)\n",
+    "\n",
+    "The highlighted portions are identified as follows:\n",
+    "* <span style=\"color:red\">Red</span>: The CPU tab provides thread-level core utilization data. \n",
+    "* <span style=\"color:blue\">Blue</span>: The CUDA HW tab displays GPU kernel and memory transfer activities and API calls.\n",
+    "* <span style=\"color:orange\">Orange</span>: The Threads tab gives a detailed view of each CPU thread's activity including from OS runtime libraries, MPI, NVTX, etc.\n",
+    "\n",
+    "#### `nsys` CLI\n",
+    "\n",
+    "We will profile the application using `nsys` CLI. Here's a typical `nsys` command to profile a program:\n",
+    "\n",
+    "`nsys profile --trace=cuda,nvtx --stats=true -o jacobi_report --force-overwrite true ./jacobi`\n",
+    "\n",
+    "The `--trace` flag specifies that we want to trace CUDA and NVTX APIs (in addition to baseline tracing), `--stats` specifies that we want to generate a statistics summary after profiling, and `-o` allows us to name the report file (which will include the `.qdrep` extension). The `--force-overwrite` flag allows us to overwrite an existing report (of the same name).\n",
+    "\n",
+    "Note that we can always use the `nsys --help` to know more about these and other available options.\n",
+    "\n",
+    "### Viewing the Report\n",
+    "\n",
+    "One can view the profiling report by using Nsight Systems GUI. Note that CUDA toolkit and the GUI application of the same version as CLI are required. Follow these steps:\n",
+    "* Open Nsight Systems GUI application.\n",
+    "* Click on _file $\\rightarrow$ open_.\n",
+    "* Browse and select the `.qdrep` file.\n",
+    "\n",
+    "Alternatively, we can enable the `--stats` flag to display profiling data on the terminal (refer to the image below).\n",
+    "\n",
+    "![nsys cli sample output](../../images/nsys_cli_sample_output.png)\n",
+    "\n",
+    "### NVIDIA Tools Extension (NVTX)\n",
+    "\n",
+    "NVTX is C-based API for annotating events in applications. It is useful for profiling both specific events and large code blocks. We will routinely make use of NVTX APIs to instrument our application for `nsys` profiler. It helps `nsys` in collecting relevant information and improves the application timeline's readability. \n",
+    "\n",
+    "To use NVTX, follow these steps:\n",
+    "* `#include <nvToolsExt.h>` in the code file\n",
+    "* Insert `nvtxRangePush(\"myCodeBlock\");` just before the code block begins and `nvtxRangePop();` just after it ends.\n",
+    "\n",
+    "Now, go back to the [jacobi.cu](../../source_code/single_gpu/jacobi.cu) source code file and correlate the \"Jacobi solve\" annotated event visible on both the `nsys` CLI statistics and the GUI-based timeline to its use in the source code.\n",
+    "\n",
+    "### Improving performance\n",
+    "\n",
+    "Any code snippet can be taken up for optimizations. However, it is important to realize that our current code is limited to a single GPU. Usually a very powerful first optimization is to parallelize the code, which in our case means running it on multiple GPUs. Thus, we generally follow the cyclical process:\n",
+    "\n",
+    "* **Analyze** the code using profilers to identify bottlenecks and hotspots.\n",
+    "* **Parallelize** the routines where most of the time in the code is spent.\n",
+    "* **Optimize** the parallel code by analyzing first for opportunities, applying optimizations, verifying our gains, and repeating the process.\n",
+    "\n",
+    "### Metrics of Interest\n",
+    "\n",
+    "To quantify the performance gain, we denote the single-GPU execution time as $T_s$ and multi-GPU execution time for $P$ GPUs as $T_p$. Using this, we obtain the figures-of-merit:\n",
+    "* Speedup $S = T_s/T_p$ (optimal is $P$), and \n",
+    "* Efficiency $E = S/P$ (optimal is $1$). \n",
+    "\n",
+    "### Analyzing the code\n",
+    "\n",
+    "Let's profile the single-GPU code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a9a8109",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/single_gpu/ && nsys profile --trace=cuda,nvtx --stats=true -o jacobi_report --force-overwrite true ./jacobi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6db3c3c7",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "Now, download the report and view it via the GUI. This is the analysis step. Right click on the NVTX tab and select the Events View.\n",
+    "\n",
+    "![nsys single_gpu_analysis](../../images/nsys_single_gpu_analysis.png)\n",
+    "\n",
+    "Clearly, we need to parallelize the \"Jacobi Solve\" routine, which is essentially the iterative Jacobi solver loop. Click on the link to continue to the next lab where we parallelize the code using cudaMemcpy and CUDA streams:\n",
+    "\n",
+    "# [Multi-GPU: CUDA Streams](../memcpy/streams.ipynb)\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 9 - 22
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/Makefile

@@ -1,32 +1,19 @@
 # Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 NVCC=nvcc
-GENCODE_SM30	:= -gencode arch=compute_30,code=sm_30
-GENCODE_SM35	:= -gencode arch=compute_35,code=sm_35
-GENCODE_SM37	:= -gencode arch=compute_37,code=sm_37
-GENCODE_SM50	:= -gencode arch=compute_50,code=sm_50
-GENCODE_SM52	:= -gencode arch=compute_52,code=sm_52
-GENCODE_SM60    := -gencode arch=compute_60,code=sm_60
 GENCODE_SM70    := -gencode arch=compute_70,code=sm_70
 GENCODE_SM80    := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
 GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
-ifdef DISABLE_CUB
-        NVCC_FLAGS = -Xptxas --optimize-float-atomics
-else
-        NVCC_FLAGS = -DHAVE_CUB
-endif
 NVCC_FLAGS += -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14
-jacobi: Makefile jacobi.cu
-	$(NVCC) $(NVCC_FLAGS) jacobi.cu -o jacobi
 
-.PHONY.: clean
-clean:
-	rm -f jacobi jacobi.qdrep
+jacobi_memcpy: jacobi_memcpy.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_memcpy.cu -o jacobi_memcpy
+
+jacobi_streams: jacobi_streams.cu
+	$(NVCC) $(NVCC_FLAGS) jacobi_streams.cu -o jacobi_streams
 
-sanitize: jacobi
-	compute-sanitizer ./jacobi -niter 10
+all: jacobi_memcpy jacobi_streams
 
-run: jacobi
-	./jacobi
+.PHONY: clean
+clean:
+	rm -f jacobi_memcpy jacobi_streams *.qdrep *.sqlite
 
-profile: jacobi
-	nsys profile --trace=cuda,nvtx -o jacobi ./jacobi -niter 10

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy


+ 462 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy.cu

@@ -0,0 +1,462 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if (thread_index % (2 * stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+int get_parsed_vals(char** begin, char** end, int* devices,
+                    const std::string& arg, const int default_val) {
+    int numGPUs = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        numGPUs = 0;
+        std::string dev_ids(*itr);
+        std::string::size_type currpos = 0, nextpos = 0;
+        do {
+            nextpos = dev_ids.find_first_of(",", currpos);
+            devices[numGPUs] = std::stoi(dev_ids.substr(currpos, nextpos - currpos));
+            numGPUs++;
+            currpos = nextpos + 1;
+        } while (nextpos != std::string::npos);
+    } else {
+        for (int i = 0; i < numGPUs; i++) {
+            devices[i] = i;
+        }
+    }
+    return numGPUs;
+}
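+
+// Example (hypothetical invocation): "./jacobi_memcpy -gpus 0,2,3" fills
+// devices = {0, 2, 3} and returns numGPUs = 3; without "-gpus", all
+// available devices are used in natural order.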
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+    
+    // Get GPU mapping from runtime arguments
+    int available_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&available_devices));
+    int devices[MAX_NUM_DEVICES];
+    int num_devices = get_parsed_vals(argv, argv + argc, devices, "-gpus", available_devices);
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    // Compute chunk size and allocate memory on GPUs
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        CUDA_RT_CALL(cudaFree(0));
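+        // Note: cudaFree(0) forces lazy CUDA context creation on this device up
+        // front, so initialization cost is not attributed to later timed code.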
+
+        if (0 == dev_id) {
+            // Allocate memory on host and record single-GPU timings
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // The ny - 2 interior rows are distributed amongst the num_devices GPUs
+        // such that each GPU gets either (ny - 2) / num_devices or
+        // (ny - 2) / num_devices + 1 rows. This balances the load when
+        // (ny - 2) % num_devices != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // The number of GPUs that get the smaller chunk follows from solving:
+        // num_ranks_low * chunk_size_low + (num_devices - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);
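+        // Worked example (illustrative values): with ny - 2 = 10 rows and
+        // num_devices = 3, chunk_size_low = 3 and num_ranks_low = 3*3 + 3 - 10 = 2,
+        // so GPUs 0 and 1 compute 3 rows each and GPU 2 computes 4 (2*3 + 4 = 10).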
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        // Allocate memory on GPU
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            // TODO: Part 2- Check whether GPU "devices[dev_id]" can access peer "devices[top]"
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, devices[dev_id], devices[top]));
+            if (canAccessPeer) {
+                // TODO: Part 2- Enable peer access from GPU "devices[dev_id]" to "devices[top]"
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devices[top], 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                // TODO: Part 2- Check and enable peer access from GPU "devices[dev_id]" to
+                // "devices[bottom]", whenever possible
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, 
+					devices[dev_id], devices[bottom]));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devices[bottom], 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    // Share initial top and bottom local grid-point values between neighbours
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
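+    // Note: with the ring ordering above (the "top" neighbour of device 0 is the
+    // last device), these copies also apply the periodic boundary condition in y.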
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        // Launch device kernel on each GPU
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            // TODO: Part 1- Set current GPU to be "devices[dev_id]"
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+            CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float)));
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+
+            // TODO: Part 1- Call the Jacobi kernel with "dim_grid" blocks in the grid and
+            // "dim_block" threads per block. The "dev_id" variable indexes the memory
+            // allocated for the current GPU.
+            jacobi_kernel<<<dim_grid, dim_block>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+        }
+        // Launch async memory copy operations for halo exchange and for copying
+        // the local-grid L2 norm from each GPU back to the host
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            // TODO: Part 1- Set current GPU
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+
+            // TODO: Part 1- Copy GPU-local L2 norm "l2_norm_d" back to CPU "l2_norm_h"
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                                         cudaMemcpyDeviceToHost));
+
+            // TODO: Part 1- Implement halo exchange with top neighbour "top"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                         cudaMemcpyDeviceToDevice));
+
+            // TODO: Part 1- Implement halo exchange with bottom neighbour "bottom"
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice));
+        }
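+        // Note: when peer access is enabled (-p2p), these device-to-device copies
+        // can travel directly over NVLink/PCIe; without it, the runtime generally
+        // stages them through host memory.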
+        l2_norm = 0.0;
+        // Synchronize devices and compute global L2 norm
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            // TODO: Part 1- Set current GPU and call cudaDeviceSynchronize()
+            CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
+            CUDA_RT_CALL(cudaDeviceSynchronize());
+
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    // Copy computed grid back to host from each GPU
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    // Compare against single GPU execution for correctness
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d. Using GPU ID: ", num_devices);
+	for (int i = 0; i < num_devices; i++) {
+            printf("%d, ", devices[i]);
+	}
+        printf(
+	    "\n%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_nvlink_report.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_nvlink_report.sqlite


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_sys_report.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_memcpy_sys_report.sqlite


+ 450 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/memcpy/jacobi_streams.cu

@@ -0,0 +1,450 @@
+/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <omp.h>
+#include <nvToolsExt.h>
+
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+
+constexpr int MAX_NUM_DEVICES = 32;
+
+constexpr float tol = 1.0e-8;
+
+const float PI = 2.0 * std::asin(1.0);
+
+__global__ void initialize_boundaries(float*  a_new, float*  a, const float pi, const int offset, 
+                    const int nx, const int my_ny, const int ny) {
+    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        a[iy * nx + 0] = y0;
+        a[iy * nx + (nx - 1)] = y0;
+        a_new[iy * nx + 0] = y0;
+        a_new[iy * nx + (nx - 1)] = y0;
+    }
+}
+
+__global__ void jacobi_kernel(float*  a_new, const float*  a, float*  l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
+    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
+    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
+
+    if (iy < iy_end && ix < (nx - 1)) {
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
+        a_new[iy * nx + ix] = new_val;
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
+    }
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index % (2 * stride)) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
+    }
+}
+
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
+    char** itr = std::find(begin, end, arg);
+    if (itr != end && ++itr != end) {
+        std::istringstream inbuf(*itr);
+        inbuf >> argval;
+    }
+    return argval;
+}
+
+bool get_arg(char** begin, char** end, const std::string& arg) {
+    char** itr = std::find(begin, end, arg);
+    if (itr != end) {
+        return true;
+    }
+    return false;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
+int main(int argc, char* argv[]) {
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
+    double runtime_serial = 0.0;
+
+    cudaStream_t compute_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_top_stream[MAX_NUM_DEVICES];
+    cudaStream_t push_bottom_stream[MAX_NUM_DEVICES];
+    cudaEvent_t compute_done[MAX_NUM_DEVICES];
+    cudaEvent_t push_top_done[2][MAX_NUM_DEVICES];
+    cudaEvent_t push_bottom_done[2][MAX_NUM_DEVICES];
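+    // Note: the push_*_done events are double-buffered (indexed by iteration
+    // parity) so that each iteration can wait on the neighbours' halo pushes
+    // from the previous iteration while recording its own.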
+
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];
+
+    int iy_start[MAX_NUM_DEVICES];
+    int iy_end[MAX_NUM_DEVICES];
+
+    int chunk_size[MAX_NUM_DEVICES];
+
+    int num_devices = 0;
+    CUDA_RT_CALL(cudaGetDeviceCount(&num_devices));
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaFree(0));
+
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
+        }
+
+        // The ny - 2 interior rows are distributed amongst the num_devices GPUs
+        // such that each GPU gets either (ny - 2) / num_devices or
+        // (ny - 2) / num_devices + 1 rows. This balances the load when
+        // (ny - 2) % num_devices != 0
+        int chunk_size_low = (ny - 2) / num_devices;
+        int chunk_size_high = chunk_size_low + 1;
+
+        // The number of GPUs that get the smaller chunk follows from solving:
+        // num_ranks_low * chunk_size_low + (num_devices - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);
+
+        if (dev_id < num_ranks_low)
+            chunk_size[dev_id] = chunk_size_low;
+        else
+            chunk_size[dev_id] = chunk_size_high;
+
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+
+        // Calculate local domain boundaries
+        int iy_start_global;  // My start index in the global array
+        if (dev_id < num_ranks_low) {
+            iy_start_global = dev_id * chunk_size_low + 1;
+        } else {
+            iy_start_global =
+                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
+        }
+
+        iy_start[dev_id] = 1;
+        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];
+
+        // Set Dirichlet boundary conditions on left and right border
+        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
+            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+
+        CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
+        CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
+        CUDA_RT_CALL(cudaEventCreateWithFlags(compute_done + dev_id, cudaEventDisableTiming));
+        CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[0] + dev_id, cudaEventDisableTiming));
+        CUDA_RT_CALL(
+            cudaEventCreateWithFlags(push_bottom_done[0] + dev_id, cudaEventDisableTiming));
+        CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[1] + dev_id, cudaEventDisableTiming));
+        CUDA_RT_CALL(
+            cudaEventCreateWithFlags(push_bottom_done[1] + dev_id, cudaEventDisableTiming));
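+        // Note: cudaEventDisableTiming is used because these events only order
+        // work between streams; omitting timestamps reduces event overhead.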
+
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
+
+        if (p2p == true) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            int canAccessPeer = 0;
+            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
+            if (canAccessPeer) {
+                CUDA_RT_CALL(cudaDeviceEnablePeerAccess(top, 0));
+            }
+            const int bottom = (dev_id + 1) % num_devices;
+            if (top != bottom) {
+                canAccessPeer = 0;
+                CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, bottom));
+                if (canAccessPeer) {
+                    CUDA_RT_CALL(cudaDeviceEnablePeerAccess(bottom, 0));
+                }
+            }
+        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+        const int bottom = (dev_id + 1) % num_devices;
+        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
+                     a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                     nx * sizeof(float), cudaMemcpyDeviceToDevice));
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi solve");
+    while (l2_norm > tol && iter < iter_max) {
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));
+
+            CUDA_RT_CALL(
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));
+
+            CUDA_RT_CALL(
+                cudaStreamWaitEvent(compute_stream[dev_id], push_top_done[(iter % 2)][bottom], 0));
+            CUDA_RT_CALL(
+                cudaStreamWaitEvent(compute_stream[dev_id], push_bottom_done[(iter % 2)][top], 0));
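+            // Note: the compute stream must not start this iteration's stencil
+            // until both neighbours' halo pushes from the previous iteration
+            // (recorded under index iter % 2) have completed.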
+
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+
+            jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
+                    a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
+                    nx);
+            CUDA_RT_CALL(cudaGetLastError());
+            CUDA_RT_CALL(cudaEventRecord(compute_done[dev_id], compute_stream[dev_id]));
+
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                     cudaMemcpyDeviceToHost, compute_stream[dev_id]));
+
+            // Halo exchange with the top and bottom neighbours; the ring
+            // wrap-around also applies the periodic boundary conditions
+            CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream[dev_id], compute_done[dev_id], 0));
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                         cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
+            CUDA_RT_CALL(
+                cudaEventRecord(push_top_done[((iter + 1) % 2)][dev_id], push_top_stream[dev_id]));
+
+            CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream[dev_id], compute_done[dev_id], 0));
+            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice,
+                                         push_bottom_stream[dev_id]));
+            CUDA_RT_CALL(cudaEventRecord(push_bottom_done[((iter + 1) % 2)][dev_id],
+                                         push_bottom_stream[dev_id]));
+        }
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
+            l2_norm += *(l2_norm_h[dev_id]);
+        }
+
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            std::swap(a_new[dev_id], a[dev_id]);
+        }
+    }
+
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+    }
+
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    int offset = nx;
+    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+        CUDA_RT_CALL(
+            cudaMemcpy(a_h + offset, a[dev_id] + nx,
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
+                       cudaMemcpyDeviceToHost));
+        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
+    }
+
+    bool result_correct = true;
+    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
+        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
+            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
+                fprintf(stderr,
+                        "ERROR: a[%d * %d + %d] = %f does not match %f "
+                        "(reference)\n",
+                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
+                result_correct = false;
+            }
+        }
+    }
+
+    if (result_correct) {
+        printf("Num GPUs: %d.\n", num_devices);
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
+    }
+
+    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
+        CUDA_RT_CALL(cudaSetDevice(dev_id));
+        CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[1][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_top_done[1][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_bottom_done[0][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(push_top_done[0][dev_id]));
+        CUDA_RT_CALL(cudaEventDestroy(compute_done[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(push_top_stream[dev_id]));
+        CUDA_RT_CALL(cudaStreamDestroy(compute_stream[dev_id]));
+
+        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
+        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
+
+        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
+        CUDA_RT_CALL(cudaFree(a[dev_id]));
+        if (0 == dev_id) {
+            CUDA_RT_CALL(cudaFreeHost(a_h));
+            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
+        }
+    }
+
+    return result_correct ? 0 : 1;
+}
+
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
+
+    float* l2_norm_d;
+    float* l2_norm_h;
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
+
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
+
+    // Set Dirichlet boundary conditions on left and right border
+    nvtxRangePush("Init boundaries");
+    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
+    CUDA_RT_CALL(cudaGetLastError());
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
+
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
+
+    CUDA_RT_CALL(cudaDeviceSynchronize());
+
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
+
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+    int iter = 0;
+    float l2_norm = 1.0;
+
+    double start = omp_get_wtime();
+    nvtxRangePush("Jacobi Solve");
+    while (l2_norm > tol && iter < iter_max) {
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
+
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
+
+        // Apply periodic boundary conditions
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
+                                     cudaMemcpyDeviceToDevice));
+
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
+    }
+    nvtxRangePop();
+    double stop = omp_get_wtime();
+
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
+
+    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
+    CUDA_RT_CALL(cudaFree(l2_norm_d));
+
+    CUDA_RT_CALL(cudaFree(a_new));
+    CUDA_RT_CALL(cudaFree(a));
+    return (stop - start);
+}
+

+ 22 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut.h

@@ -0,0 +1,22 @@
+#ifndef  __FREEGLUT_H__
+#define  __FREEGLUT_H__
+
+/*
+ * freeglut.h
+ *
+ * The freeglut library include file
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "freeglut_std.h"
+#include "freeglut_ext.h"
+
+/*** END OF FILE ***/
+
+#endif /* __FREEGLUT_H__ */

+ 115 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_ext.h

@@ -0,0 +1,115 @@
+#ifndef  __FREEGLUT_EXT_H__
+#define  __FREEGLUT_EXT_H__
+
+/*
+ * freeglut_ext.h
+ *
+ * The non-GLUT-compatible extensions to the freeglut library include file
+ *
+ * Copyright (c) 1999-2000 Pawel W. Olszta. All Rights Reserved.
+ * Written by Pawel W. Olszta, <olszta@sourceforge.net>
+ * Creation date: Thu Dec 2 1999
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * GLUT API Extension macro definitions -- behaviour when the user clicks on an "x" to close a window
+ */
+#define GLUT_ACTION_EXIT                         0
+#define GLUT_ACTION_GLUTMAINLOOP_RETURNS         1
+#define GLUT_ACTION_CONTINUE_EXECUTION           2
+
+/*
+ * Create a new rendering context when the user opens a new window?
+ */
+#define GLUT_CREATE_NEW_CONTEXT                  0
+#define GLUT_USE_CURRENT_CONTEXT                 1
+
+/*
+ * GLUT API Extension macro definitions -- the glutGet parameters
+ */
+#define  GLUT_ACTION_ON_WINDOW_CLOSE        0x01F9
+
+#define  GLUT_WINDOW_BORDER_WIDTH           0x01FA
+#define  GLUT_WINDOW_HEADER_HEIGHT          0x01FB
+
+#define  GLUT_VERSION                       0x01FC
+
+#define  GLUT_RENDERING_CONTEXT             0x01FD
+
+/*
+ * Process loop function, see freeglut_main.c
+ */
+FGAPI void    FGAPIENTRY glutMainLoopEvent(void);
+FGAPI void    FGAPIENTRY glutLeaveMainLoop(void);
+
+/*
+ * Window-specific callback functions, see freeglut_callbacks.c
+ */
+FGAPI void    FGAPIENTRY glutMouseWheelFunc(void (* callback)(int, int, int, int));
+FGAPI void    FGAPIENTRY glutCloseFunc(void (* callback)(void));
+FGAPI void    FGAPIENTRY glutWMCloseFunc(void (* callback)(void));
+/* A. Donev: Also a destruction callback for menus */
+FGAPI void    FGAPIENTRY glutMenuDestroyFunc(void (* callback)(void));
+
+/*
+ * State setting and retrieval functions, see freeglut_state.c
+ */
+FGAPI void    FGAPIENTRY glutSetOption(GLenum option_flag, int value) ;
+/* A.Donev: User-data manipulation */
+FGAPI void   *FGAPIENTRY glutGetWindowData(void);
+FGAPI void    FGAPIENTRY glutSetWindowData(void *data);
+FGAPI void   *FGAPIENTRY glutGetMenuData(void);
+FGAPI void    FGAPIENTRY glutSetMenuData(void *data);
+
+/*
+ * Font stuff, see freeglut_font.c
+ */
+FGAPI int     FGAPIENTRY glutBitmapHeight(void *font);
+FGAPI GLfloat FGAPIENTRY glutStrokeHeight(void *font);
+FGAPI void    FGAPIENTRY glutBitmapString(void *font, const unsigned char *string);
+FGAPI void    FGAPIENTRY glutStrokeString(void *font, const unsigned char *string);
+
+/*
+ * Geometry functions, see freeglut_geometry.c
+ */
+FGAPI void    FGAPIENTRY glutWireRhombicDodecahedron(void);
+FGAPI void    FGAPIENTRY glutSolidRhombicDodecahedron(void);
+FGAPI void    FGAPIENTRY glutWireSierpinskiSponge(int num_levels, GLdouble offset[3], GLdouble scale) ;
+FGAPI void    FGAPIENTRY glutSolidSierpinskiSponge(int num_levels, GLdouble offset[3], GLdouble scale) ;
+FGAPI void    FGAPIENTRY glutWireCylinder(GLdouble radius, GLdouble height, GLint slices, GLint stacks);
+FGAPI void    FGAPIENTRY glutSolidCylinder(GLdouble radius, GLdouble height, GLint slices, GLint stacks);
+
+/*
+ * Extension functions, see freeglut_ext.c
+ */
+FGAPI void *FGAPIENTRY glutGetProcAddress(const char *procName);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+/*** END OF FILE ***/
+
+#endif /* __FREEGLUT_EXT_H__ */

+ 547 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/freeglut_std.h

@@ -0,0 +1,547 @@
+#ifndef  __FREEGLUT_STD_H__
+#define  __FREEGLUT_STD_H__
+
+/*
+ * freeglut_std.h
+ *
+ * The GLUT-compatible part of the freeglut library include file
+ *
+ * Copyright (c) 1999-2000 Pawel W. Olszta. All Rights Reserved.
+ * Written by Pawel W. Olszta, <olszta@sourceforge.net>
+ * Creation date: Thu Dec 2 1999
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * PAWEL W. OLSZTA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Under windows, we have to differentiate between static and dynamic libraries
+ */
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#    include <windows.h>
+#    include <windowsx.h>
+#    include <mmsystem.h>
+#    define WINDOWS
+#ifdef FREEGLUT_STATIC
+#    define FGAPI
+#    define FGAPIENTRY
+
+#    pragma comment (lib, "freeglut_static.lib")    /* link with Win32 static freeglut lib */
+
+#else
+
+#        if defined(FREEGLUT_EXPORTS)
+#                define FGAPI __declspec(dllexport)
+/* #                define FGAPI */
+#        else
+#                define FGAPI __declspec(dllimport)
+#   pragma comment (lib, "freeglut.lib")    /* link with Win32 freeglut lib */
+#        endif
+#        define FGAPIENTRY __stdcall
+
+#endif
+
+#pragma comment (lib, "winmm.lib")       /* link with Windows MultiMedia lib */
+#pragma comment (lib, "user32.lib") /* link with Windows user lib */
+#pragma comment (lib, "gdi32.lib") /* link with Windows GDI lib */
+#pragma comment (lib, "opengl32.lib")    /* link with Microsoft OpenGL lib */
+#pragma comment (lib, "glu32.lib")       /* link with OpenGL Utility lib */
+
+
+#else
+#        define FGAPI
+#        define FGAPIENTRY
+#endif
+
+/*
+ * The freeglut and GLUT API versions
+ */
+#define  FREEGLUT             1
+#define  GLUT_API_VERSION     4
+#define  FREEGLUT_VERSION_2_0 1
+
+/*
+ * Always include OpenGL and GLU headers
+ */
+#include <GL/gl.h>
+#include <GL/glu.h>
+
+/*
+ * GLUT API macro definitions -- the special key codes:
+ */
+#define  GLUT_KEY_F1                        0x0001
+#define  GLUT_KEY_F2                        0x0002
+#define  GLUT_KEY_F3                        0x0003
+#define  GLUT_KEY_F4                        0x0004
+#define  GLUT_KEY_F5                        0x0005
+#define  GLUT_KEY_F6                        0x0006
+#define  GLUT_KEY_F7                        0x0007
+#define  GLUT_KEY_F8                        0x0008
+#define  GLUT_KEY_F9                        0x0009
+#define  GLUT_KEY_F10                       0x000A
+#define  GLUT_KEY_F11                       0x000B
+#define  GLUT_KEY_F12                       0x000C
+#define  GLUT_KEY_LEFT                      0x0064
+#define  GLUT_KEY_UP                        0x0065
+#define  GLUT_KEY_RIGHT                     0x0066
+#define  GLUT_KEY_DOWN                      0x0067
+#define  GLUT_KEY_PAGE_UP                   0x0068
+#define  GLUT_KEY_PAGE_DOWN                 0x0069
+#define  GLUT_KEY_HOME                      0x006A
+#define  GLUT_KEY_END                       0x006B
+#define  GLUT_KEY_INSERT                    0x006C
+
+/*
+ * GLUT API macro definitions -- mouse state definitions
+ */
+#define  GLUT_LEFT_BUTTON                   0x0000
+#define  GLUT_MIDDLE_BUTTON                 0x0001
+#define  GLUT_RIGHT_BUTTON                  0x0002
+#define  GLUT_DOWN                          0x0000
+#define  GLUT_UP                            0x0001
+#define  GLUT_LEFT                          0x0000
+#define  GLUT_ENTERED                       0x0001
+
+/*
+ * GLUT API macro definitions -- the display mode definitions
+ */
+#define  GLUT_RGB                           0x0000
+#define  GLUT_RGBA                          0x0000
+#define  GLUT_INDEX                         0x0001
+#define  GLUT_SINGLE                        0x0000
+#define  GLUT_DOUBLE                        0x0002
+#define  GLUT_ACCUM                         0x0004
+#define  GLUT_ALPHA                         0x0008
+#define  GLUT_DEPTH                         0x0010
+#define  GLUT_STENCIL                       0x0020
+#define  GLUT_MULTISAMPLE                   0x0080
+#define  GLUT_STEREO                        0x0100
+#define  GLUT_LUMINANCE                     0x0200
+
+/*
+ * GLUT API macro definitions -- windows and menu related definitions
+ */
+#define  GLUT_MENU_NOT_IN_USE               0x0000
+#define  GLUT_MENU_IN_USE                   0x0001
+#define  GLUT_NOT_VISIBLE                   0x0000
+#define  GLUT_VISIBLE                       0x0001
+#define  GLUT_HIDDEN                        0x0000
+#define  GLUT_FULLY_RETAINED                0x0001
+#define  GLUT_PARTIALLY_RETAINED            0x0002
+#define  GLUT_FULLY_COVERED                 0x0003
+
+/*
+ * GLUT API macro definitions -- fonts definitions
+ *
+ * Steve Baker suggested to make it binary compatible with GLUT:
+ */
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#   define  GLUT_STROKE_ROMAN               ((void *)0x0000)
+#   define  GLUT_STROKE_MONO_ROMAN          ((void *)0x0001)
+#   define  GLUT_BITMAP_9_BY_15             ((void *)0x0002)
+#   define  GLUT_BITMAP_8_BY_13             ((void *)0x0003)
+#   define  GLUT_BITMAP_TIMES_ROMAN_10      ((void *)0x0004)
+#   define  GLUT_BITMAP_TIMES_ROMAN_24      ((void *)0x0005)
+#   define  GLUT_BITMAP_HELVETICA_10        ((void *)0x0006)
+#   define  GLUT_BITMAP_HELVETICA_12        ((void *)0x0007)
+#   define  GLUT_BITMAP_HELVETICA_18        ((void *)0x0008)
+#else
+/*
+ * I don't really know if it's a good idea... But here it goes:
+ */
+extern void *glutStrokeRoman;
+extern void *glutStrokeMonoRoman;
+extern void *glutBitmap9By15;
+extern void *glutBitmap8By13;
+extern void *glutBitmapTimesRoman10;
+extern void *glutBitmapTimesRoman24;
+extern void *glutBitmapHelvetica10;
+extern void *glutBitmapHelvetica12;
+extern void *glutBitmapHelvetica18;
+
+/*
+ * Those pointers will be used by following definitions:
+ */
+#   define  GLUT_STROKE_ROMAN               ((void *) &glutStrokeRoman)
+#   define  GLUT_STROKE_MONO_ROMAN          ((void *) &glutStrokeMonoRoman)
+#   define  GLUT_BITMAP_9_BY_15             ((void *) &glutBitmap9By15)
+#   define  GLUT_BITMAP_8_BY_13             ((void *) &glutBitmap8By13)
+#   define  GLUT_BITMAP_TIMES_ROMAN_10      ((void *) &glutBitmapTimesRoman10)
+#   define  GLUT_BITMAP_TIMES_ROMAN_24      ((void *) &glutBitmapTimesRoman24)
+#   define  GLUT_BITMAP_HELVETICA_10        ((void *) &glutBitmapHelvetica10)
+#   define  GLUT_BITMAP_HELVETICA_12        ((void *) &glutBitmapHelvetica12)
+#   define  GLUT_BITMAP_HELVETICA_18        ((void *) &glutBitmapHelvetica18)
+#endif
+
+/*
+ * GLUT API macro definitions -- the glutGet parameters
+ */
+#define  GLUT_WINDOW_X                      0x0064
+#define  GLUT_WINDOW_Y                      0x0065
+#define  GLUT_WINDOW_WIDTH                  0x0066
+#define  GLUT_WINDOW_HEIGHT                 0x0067
+#define  GLUT_WINDOW_BUFFER_SIZE            0x0068
+#define  GLUT_WINDOW_STENCIL_SIZE           0x0069
+#define  GLUT_WINDOW_DEPTH_SIZE             0x006A
+#define  GLUT_WINDOW_RED_SIZE               0x006B
+#define  GLUT_WINDOW_GREEN_SIZE             0x006C
+#define  GLUT_WINDOW_BLUE_SIZE              0x006D
+#define  GLUT_WINDOW_ALPHA_SIZE             0x006E
+#define  GLUT_WINDOW_ACCUM_RED_SIZE         0x006F
+#define  GLUT_WINDOW_ACCUM_GREEN_SIZE       0x0070
+#define  GLUT_WINDOW_ACCUM_BLUE_SIZE        0x0071
+#define  GLUT_WINDOW_ACCUM_ALPHA_SIZE       0x0072
+#define  GLUT_WINDOW_DOUBLEBUFFER           0x0073
+#define  GLUT_WINDOW_RGBA                   0x0074
+#define  GLUT_WINDOW_PARENT                 0x0075
+#define  GLUT_WINDOW_NUM_CHILDREN           0x0076
+#define  GLUT_WINDOW_COLORMAP_SIZE          0x0077
+#define  GLUT_WINDOW_NUM_SAMPLES            0x0078
+#define  GLUT_WINDOW_STEREO                 0x0079
+#define  GLUT_WINDOW_CURSOR                 0x007A
+
+#define  GLUT_SCREEN_WIDTH                  0x00C8
+#define  GLUT_SCREEN_HEIGHT                 0x00C9
+#define  GLUT_SCREEN_WIDTH_MM               0x00CA
+#define  GLUT_SCREEN_HEIGHT_MM              0x00CB
+#define  GLUT_MENU_NUM_ITEMS                0x012C
+#define  GLUT_DISPLAY_MODE_POSSIBLE         0x0190
+#define  GLUT_INIT_WINDOW_X                 0x01F4
+#define  GLUT_INIT_WINDOW_Y                 0x01F5
+#define  GLUT_INIT_WINDOW_WIDTH             0x01F6
+#define  GLUT_INIT_WINDOW_HEIGHT            0x01F7
+#define  GLUT_INIT_DISPLAY_MODE             0x01F8
+#define  GLUT_ELAPSED_TIME                  0x02BC
+#define  GLUT_WINDOW_FORMAT_ID              0x007B
+#define  GLUT_INIT_STATE                    0x007C
+
+/*
+ * GLUT API macro definitions -- the glutDeviceGet parameters
+ */
+#define  GLUT_HAS_KEYBOARD                  0x0258
+#define  GLUT_HAS_MOUSE                     0x0259
+#define  GLUT_HAS_SPACEBALL                 0x025A
+#define  GLUT_HAS_DIAL_AND_BUTTON_BOX       0x025B
+#define  GLUT_HAS_TABLET                    0x025C
+#define  GLUT_NUM_MOUSE_BUTTONS             0x025D
+#define  GLUT_NUM_SPACEBALL_BUTTONS         0x025E
+#define  GLUT_NUM_BUTTON_BOX_BUTTONS        0x025F
+#define  GLUT_NUM_DIALS                     0x0260
+#define  GLUT_NUM_TABLET_BUTTONS            0x0261
+#define  GLUT_DEVICE_IGNORE_KEY_REPEAT      0x0262
+#define  GLUT_DEVICE_KEY_REPEAT             0x0263
+#define  GLUT_HAS_JOYSTICK                  0x0264
+#define  GLUT_OWNS_JOYSTICK                 0x0265
+#define  GLUT_JOYSTICK_BUTTONS              0x0266
+#define  GLUT_JOYSTICK_AXES                 0x0267
+#define  GLUT_JOYSTICK_POLL_RATE            0x0268
+
+/*
+ * GLUT API macro definitions -- the glutLayerGet parameters
+ */
+#define  GLUT_OVERLAY_POSSIBLE              0x0320
+#define  GLUT_LAYER_IN_USE                  0x0321
+#define  GLUT_HAS_OVERLAY                   0x0322
+#define  GLUT_TRANSPARENT_INDEX             0x0323
+#define  GLUT_NORMAL_DAMAGED                0x0324
+#define  GLUT_OVERLAY_DAMAGED               0x0325
+
+/*
+ * GLUT API macro definitions -- the glutVideoResizeGet parameters
+ */
+#define  GLUT_VIDEO_RESIZE_POSSIBLE         0x0384
+#define  GLUT_VIDEO_RESIZE_IN_USE           0x0385
+#define  GLUT_VIDEO_RESIZE_X_DELTA          0x0386
+#define  GLUT_VIDEO_RESIZE_Y_DELTA          0x0387
+#define  GLUT_VIDEO_RESIZE_WIDTH_DELTA      0x0388
+#define  GLUT_VIDEO_RESIZE_HEIGHT_DELTA     0x0389
+#define  GLUT_VIDEO_RESIZE_X                0x038A
+#define  GLUT_VIDEO_RESIZE_Y                0x038B
+#define  GLUT_VIDEO_RESIZE_WIDTH            0x038C
+#define  GLUT_VIDEO_RESIZE_HEIGHT           0x038D
+
+/*
+ * GLUT API macro definitions -- the glutUseLayer parameters
+ */
+#define  GLUT_NORMAL                        0x0000
+#define  GLUT_OVERLAY                       0x0001
+
+/*
+ * GLUT API macro definitions -- the glutGetModifiers parameters
+ */
+#define  GLUT_ACTIVE_SHIFT                  0x0001
+#define  GLUT_ACTIVE_CTRL                   0x0002
+#define  GLUT_ACTIVE_ALT                    0x0004
+
+/*
+ * GLUT API macro definitions -- the glutSetCursor parameters
+ */
+#define  GLUT_CURSOR_RIGHT_ARROW            0x0000
+#define  GLUT_CURSOR_LEFT_ARROW             0x0001
+#define  GLUT_CURSOR_INFO                   0x0002
+#define  GLUT_CURSOR_DESTROY                0x0003
+#define  GLUT_CURSOR_HELP                   0x0004
+#define  GLUT_CURSOR_CYCLE                  0x0005
+#define  GLUT_CURSOR_SPRAY                  0x0006
+#define  GLUT_CURSOR_WAIT                   0x0007
+#define  GLUT_CURSOR_TEXT                   0x0008
+#define  GLUT_CURSOR_CROSSHAIR              0x0009
+#define  GLUT_CURSOR_UP_DOWN                0x000A
+#define  GLUT_CURSOR_LEFT_RIGHT             0x000B
+#define  GLUT_CURSOR_TOP_SIDE               0x000C
+#define  GLUT_CURSOR_BOTTOM_SIDE            0x000D
+#define  GLUT_CURSOR_LEFT_SIDE              0x000E
+#define  GLUT_CURSOR_RIGHT_SIDE             0x000F
+#define  GLUT_CURSOR_TOP_LEFT_CORNER        0x0010
+#define  GLUT_CURSOR_TOP_RIGHT_CORNER       0x0011
+#define  GLUT_CURSOR_BOTTOM_RIGHT_CORNER    0x0012
+#define  GLUT_CURSOR_BOTTOM_LEFT_CORNER     0x0013
+#define  GLUT_CURSOR_INHERIT                0x0064
+#define  GLUT_CURSOR_NONE                   0x0065
+#define  GLUT_CURSOR_FULL_CROSSHAIR         0x0066
+
+/*
+ * GLUT API macro definitions -- RGB color component specification definitions
+ */
+#define  GLUT_RED                           0x0000
+#define  GLUT_GREEN                         0x0001
+#define  GLUT_BLUE                          0x0002
+
+/*
+ * GLUT API macro definitions -- additional keyboard and joystick definitions
+ */
+#define  GLUT_KEY_REPEAT_OFF                0x0000
+#define  GLUT_KEY_REPEAT_ON                 0x0001
+#define  GLUT_KEY_REPEAT_DEFAULT            0x0002
+
+#define  GLUT_JOYSTICK_BUTTON_A             0x0001
+#define  GLUT_JOYSTICK_BUTTON_B             0x0002
+#define  GLUT_JOYSTICK_BUTTON_C             0x0004
+#define  GLUT_JOYSTICK_BUTTON_D             0x0008
+
+/*
+ * GLUT API macro definitions -- game mode definitions
+ */
+#define  GLUT_GAME_MODE_ACTIVE              0x0000
+#define  GLUT_GAME_MODE_POSSIBLE            0x0001
+#define  GLUT_GAME_MODE_WIDTH               0x0002
+#define  GLUT_GAME_MODE_HEIGHT              0x0003
+#define  GLUT_GAME_MODE_PIXEL_DEPTH         0x0004
+#define  GLUT_GAME_MODE_REFRESH_RATE        0x0005
+#define  GLUT_GAME_MODE_DISPLAY_CHANGED     0x0006
+
+/*
+ * Initialization functions, see fglut_init.c
+ */
+FGAPI void    FGAPIENTRY glutInit(int *pargc, char **argv);
+FGAPI void    FGAPIENTRY glutInitWindowPosition(int x, int y);
+FGAPI void    FGAPIENTRY glutInitWindowSize(int width, int height);
+FGAPI void    FGAPIENTRY glutInitDisplayMode(unsigned int displayMode);
+FGAPI void    FGAPIENTRY glutInitDisplayString(const char *displayMode);
+
+/*
+ * Process loop function, see freeglut_main.c
+ */
+FGAPI void    FGAPIENTRY glutMainLoop(void);
+
+/*
+ * Window management functions, see freeglut_window.c
+ */
+FGAPI int     FGAPIENTRY glutCreateWindow(const char *title);
+FGAPI int     FGAPIENTRY glutCreateSubWindow(int window, int x, int y, int width, int height);
+FGAPI void    FGAPIENTRY glutDestroyWindow(int window);
+FGAPI void    FGAPIENTRY glutSetWindow(int window);
+FGAPI int     FGAPIENTRY glutGetWindow(void);
+FGAPI void    FGAPIENTRY glutSetWindowTitle(const char *title);
+FGAPI void    FGAPIENTRY glutSetIconTitle(const char *title);
+FGAPI void    FGAPIENTRY glutReshapeWindow(int width, int height);
+FGAPI void    FGAPIENTRY glutPositionWindow(int x, int y);
+FGAPI void    FGAPIENTRY glutShowWindow(void);
+FGAPI void    FGAPIENTRY glutHideWindow(void);
+FGAPI void    FGAPIENTRY glutIconifyWindow(void);
+FGAPI void    FGAPIENTRY glutPushWindow(void);
+FGAPI void    FGAPIENTRY glutPopWindow(void);
+FGAPI void    FGAPIENTRY glutFullScreen(void);
+
+/*
+ * Display-connected functions, see freeglut_display.c
+ */
+FGAPI void    FGAPIENTRY glutPostWindowRedisplay(int window);
+FGAPI void    FGAPIENTRY glutPostRedisplay(void);
+FGAPI void    FGAPIENTRY glutSwapBuffers(void);
+
+/*
+ * Mouse cursor functions, see freeglut_cursor.c
+ */
+FGAPI void    FGAPIENTRY glutWarpPointer(int x, int y);
+FGAPI void    FGAPIENTRY glutSetCursor(int cursor);
+
+/*
+ * Overlay stuff, see freeglut_overlay.c
+ */
+FGAPI void    FGAPIENTRY glutEstablishOverlay(void);
+FGAPI void    FGAPIENTRY glutRemoveOverlay(void);
+FGAPI void    FGAPIENTRY glutUseLayer(GLenum layer);
+FGAPI void    FGAPIENTRY glutPostOverlayRedisplay(void);
+FGAPI void    FGAPIENTRY glutPostWindowOverlayRedisplay(int window);
+FGAPI void    FGAPIENTRY glutShowOverlay(void);
+FGAPI void    FGAPIENTRY glutHideOverlay(void);
+
+/*
+ * Menu stuff, see freeglut_menu.c
+ */
+FGAPI int     FGAPIENTRY glutCreateMenu(void (* callback)(int menu));
+FGAPI void    FGAPIENTRY glutDestroyMenu(int menu);
+FGAPI int     FGAPIENTRY glutGetMenu(void);
+FGAPI void    FGAPIENTRY glutSetMenu(int menu);
+FGAPI void    FGAPIENTRY glutAddMenuEntry(const char *label, int value);
+FGAPI void    FGAPIENTRY glutAddSubMenu(const char *label, int subMenu);
+FGAPI void    FGAPIENTRY glutChangeToMenuEntry(int item, const char *label, int value);
+FGAPI void    FGAPIENTRY glutChangeToSubMenu(int item, const char *label, int value);
+FGAPI void    FGAPIENTRY glutRemoveMenuItem(int item);
+FGAPI void    FGAPIENTRY glutAttachMenu(int button);
+FGAPI void    FGAPIENTRY glutDetachMenu(int button);
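
The menu sub-API pairs a creation call with a per-entry integer that is handed back to the callback. A hedged sketch of typical usage (the `menu_handler` and `attach_demo_menu` names, the labels and values, and quitting via `exit()` are all illustrative choices):

```c
#include <stdlib.h>
#include <GL/freeglut.h>

static void menu_handler(int value)      /* receives the value from glutAddMenuEntry */
{
    if (value == 2)
        exit(0);                         /* "Quit" entry */
    glutPostRedisplay();                 /* "Redraw" entry */
}

static void attach_demo_menu(void)       /* call after glutCreateWindow() */
{
    glutCreateMenu(menu_handler);
    glutAddMenuEntry("Redraw", 1);
    glutAddMenuEntry("Quit",   2);
    glutAttachMenu(GLUT_RIGHT_BUTTON);   /* menu pops up on right click */
}
```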
+
+/*
+ * Global callback functions, see freeglut_callbacks.c
+ */
+FGAPI void    FGAPIENTRY glutTimerFunc(unsigned int time, void (* callback)(int), int value);
+FGAPI void    FGAPIENTRY glutIdleFunc(void (* callback)(void));
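
Note that GLUT timers are one-shot: the callback fires once, roughly `time` milliseconds after registration, so periodic work has to re-arm the timer from inside the callback. A minimal sketch (the 16 ms period is an arbitrary ~60 Hz choice):

```c
#include <GL/freeglut.h>

static void tick(int value)
{
    glutPostRedisplay();                 /* schedule a redraw each tick */
    glutTimerFunc(16, tick, value);      /* re-arm: GLUT timers fire only once */
}

/* Before entering glutMainLoop():  glutTimerFunc(16, tick, 0); */
```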
+
+/*
+ * Window-specific callback functions, see freeglut_callbacks.c
+ */
+FGAPI void    FGAPIENTRY glutKeyboardFunc(void (* callback)(unsigned char, int, int));
+FGAPI void    FGAPIENTRY glutSpecialFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutReshapeFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutVisibilityFunc(void (* callback)(int));
+FGAPI void    FGAPIENTRY glutDisplayFunc(void (* callback)(void));
+FGAPI void    FGAPIENTRY glutMouseFunc(void (* callback)(int, int, int, int));
+FGAPI void    FGAPIENTRY glutMotionFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutPassiveMotionFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutEntryFunc(void (* callback)(int));
+
+FGAPI void    FGAPIENTRY glutKeyboardUpFunc(void (* callback)(unsigned char, int, int));
+FGAPI void    FGAPIENTRY glutSpecialUpFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutJoystickFunc(void (* callback)(unsigned int, int, int, int), int pollInterval);
+FGAPI void    FGAPIENTRY glutMenuStateFunc(void (* callback)(int));
+FGAPI void    FGAPIENTRY glutMenuStatusFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutOverlayDisplayFunc(void (* callback)(void));
+FGAPI void    FGAPIENTRY glutWindowStatusFunc(void (* callback)(int));
+
+FGAPI void    FGAPIENTRY glutSpaceballMotionFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutSpaceballRotateFunc(void (* callback)(int, int, int));
+FGAPI void    FGAPIENTRY glutSpaceballButtonFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutButtonBoxFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutDialsFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutTabletMotionFunc(void (* callback)(int, int));
+FGAPI void    FGAPIENTRY glutTabletButtonFunc(void (* callback)(int, int, int, int));
+
+/*
+ * State setting and retrieval functions, see freeglut_state.c
+ */
+FGAPI int     FGAPIENTRY glutGet(GLenum query);
+FGAPI int     FGAPIENTRY glutDeviceGet(GLenum query);
+FGAPI int     FGAPIENTRY glutGetModifiers(void);
+FGAPI int     FGAPIENTRY glutLayerGet(GLenum query);
+
+/*
+ * Font stuff, see freeglut_font.c
+ */
+FGAPI void    FGAPIENTRY glutBitmapCharacter(void *font, int character);
+FGAPI int     FGAPIENTRY glutBitmapWidth(void *font, int character);
+FGAPI void    FGAPIENTRY glutStrokeCharacter(void *font, int character);
+FGAPI int     FGAPIENTRY glutStrokeWidth(void *font, int character);
+FGAPI int     FGAPIENTRY glutBitmapLength(void *font, const unsigned char *string);
+FGAPI int     FGAPIENTRY glutStrokeLength(void *font, const unsigned char *string);
+
+/*
+ * Geometry functions, see freeglut_geometry.c
+ */
+FGAPI void    FGAPIENTRY glutWireCube(GLdouble size);
+FGAPI void    FGAPIENTRY glutSolidCube(GLdouble size);
+FGAPI void    FGAPIENTRY glutWireSphere(GLdouble radius, GLint slices, GLint stacks);
+FGAPI void    FGAPIENTRY glutSolidSphere(GLdouble radius, GLint slices, GLint stacks);
+FGAPI void    FGAPIENTRY glutWireCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
+FGAPI void    FGAPIENTRY glutSolidCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
+
+FGAPI void    FGAPIENTRY glutWireTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
+FGAPI void    FGAPIENTRY glutSolidTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
+FGAPI void    FGAPIENTRY glutWireDodecahedron(void);
+FGAPI void    FGAPIENTRY glutSolidDodecahedron(void);
+FGAPI void    FGAPIENTRY glutWireOctahedron(void);
+FGAPI void    FGAPIENTRY glutSolidOctahedron(void);
+FGAPI void    FGAPIENTRY glutWireTetrahedron(void);
+FGAPI void    FGAPIENTRY glutSolidTetrahedron(void);
+FGAPI void    FGAPIENTRY glutWireIcosahedron(void);
+FGAPI void    FGAPIENTRY glutSolidIcosahedron(void);
+
+/*
+ * Teapot rendering functions, found in freeglut_teapot.c
+ */
+FGAPI void    FGAPIENTRY glutWireTeapot(GLdouble size);
+FGAPI void    FGAPIENTRY glutSolidTeapot(GLdouble size);
+
+/*
+ * Game mode functions, see freeglut_gamemode.c
+ */
+FGAPI void    FGAPIENTRY glutGameModeString(const char *string);
+FGAPI int     FGAPIENTRY glutEnterGameMode(void);
+FGAPI void    FGAPIENTRY glutLeaveGameMode(void);
+FGAPI int     FGAPIENTRY glutGameModeGet(GLenum query);
+
+/*
+ * Video resize functions, see freeglut_videoresize.c
+ */
+FGAPI int     FGAPIENTRY glutVideoResizeGet(GLenum query);
+FGAPI void    FGAPIENTRY glutSetupVideoResizing(void);
+FGAPI void    FGAPIENTRY glutStopVideoResizing(void);
+FGAPI void    FGAPIENTRY glutVideoResize(int x, int y, int width, int height);
+FGAPI void    FGAPIENTRY glutVideoPan(int x, int y, int width, int height);
+
+/*
+ * Colormap functions, see freeglut_misc.c
+ */
+FGAPI void    FGAPIENTRY glutSetColor(int color, GLfloat red, GLfloat green, GLfloat blue);
+FGAPI GLfloat FGAPIENTRY glutGetColor(int color, int component);
+FGAPI void    FGAPIENTRY glutCopyColormap(int window);
+
+/*
+ * Misc keyboard and joystick functions, see freeglut_misc.c
+ */
+FGAPI void    FGAPIENTRY glutIgnoreKeyRepeat(int ignore);
+FGAPI void    FGAPIENTRY glutSetKeyRepeat(int repeatMode);    /* DEPRECATED 11/4/02 - Do not use */
+FGAPI void    FGAPIENTRY glutForceJoystickFunc(void);
+
+/*
+ * Misc functions, see freeglut_misc.c
+ */
+FGAPI int     FGAPIENTRY glutExtensionSupported(const char *extension);
+FGAPI void    FGAPIENTRY glutReportErrors(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*** END OF FILE ***/
+
+#endif /* __FREEGLUT_STD_H__ */
+

File diff suppressed because it is too large
+ 14457 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glew.h


File diff suppressed because it is too large
+ 7125 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glext.h


+ 597 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glut.h

@@ -0,0 +1,597 @@
+#ifndef __glut_h__
+#define __glut_h__
+
+/* Copyright (c) Mark J. Kilgard, 1994, 1995, 1996, 1998. */
+
+/* This program is freely distributable without licensing fees and is
+   provided without guarantee or warranty, express or implied. This
+   program is -not- in the public domain. */
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+
+/* GLUT 3.7 now tries to avoid including <windows.h>
+   to avoid name space pollution, but Win32's <GL/gl.h>
+   needs APIENTRY and WINGDIAPI defined properly. */
+# if 0
+#  define  WIN32_LEAN_AND_MEAN
+#  include <windows.h>
+# else
+/* XXX This is from Win32's <windef.h> */
+#  ifndef APIENTRY
+#   define GLUT_APIENTRY_DEFINED
+#   if (_MSC_VER >= 800) || defined(_STDCALL_SUPPORTED)
+#    define APIENTRY    __stdcall
+#   else
+#    define APIENTRY
+#   endif
+#  endif
+/* XXX This is from Win32's <winnt.h> */
+#  ifndef CALLBACK
+#   if (defined(_M_MRX000) || defined(_M_IX86) || defined(_M_ALPHA) || defined(_M_PPC)) && !defined(MIDL_PASS)
+#    define CALLBACK __stdcall
+#   else
+#    define CALLBACK
+#   endif
+#  endif
+/* XXX This is from Win32's <wingdi.h> and <winnt.h> */
+#  ifndef WINGDIAPI
+#   define GLUT_WINGDIAPI_DEFINED
+#   define WINGDIAPI __declspec(dllimport)
+#  endif
+/* XXX This is from Win32's <ctype.h> */
+#  ifndef _WCHAR_T_DEFINED
+typedef unsigned short wchar_t;
+#   define _WCHAR_T_DEFINED
+#  endif
+# endif
+
+#pragma comment (lib, "winmm.lib")     /* link with Windows MultiMedia lib */
+#pragma comment (lib, "opengl32.lib")  /* link with Microsoft OpenGL lib */
+#pragma comment (lib, "glu32.lib")     /* link with OpenGL Utility lib */
+#pragma message("Note: including lib: glut32.lib\n")
+#pragma comment (lib, "glut32.lib")    /* link with Win32 GLUT lib */
+
+#pragma warning (disable:4244)  /* Disable bogus conversion warnings. */
+#pragma warning (disable:4305)  /* VC++ 5.0 version of above warning. */
+
+#endif
+
+#include <GL/gl.h>
+#include <GL/glu.h>
+
+/* Define APIENTRY and CALLBACK to nothing if we aren't on Win32. */
+#if !defined(WIN32)
+#define APIENTRY
+#define GLUT_APIENTRY_DEFINED
+#define CALLBACK
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ GLUT API revision history:
+
+ GLUT_API_VERSION is updated to reflect incompatible GLUT
+ API changes (interface changes, semantic changes, deletions,
+ or additions).
+
+ GLUT_API_VERSION=1  First public release of GLUT.  11/29/94
+
+ GLUT_API_VERSION=2  Added support for the OpenGL/GLX multisampling
+ extension.  Supports new input devices like tablet, dial and button
+ box, and Spaceball.  Easy to query OpenGL extensions.
+
+ GLUT_API_VERSION=3  glutMenuStatus added.
+
+ GLUT_API_VERSION=4  glutInitDisplayString, glutWarpPointer,
+ glutBitmapLength, glutStrokeLength, glutWindowStatusFunc, dynamic
+ video resize subAPI, glutPostWindowRedisplay, glutKeyboardUpFunc,
+ glutSpecialUpFunc, glutIgnoreKeyRepeat, glutSetKeyRepeat,
+ glutJoystickFunc, glutForceJoystickFunc (NOT FINALIZED!).
+**/
+#ifndef GLUT_API_VERSION  /* allow this to be overridden */
+#define GLUT_API_VERSION        3
+#endif
+
+/**
+ GLUT implementation revision history:
+
+ GLUT_XLIB_IMPLEMENTATION is updated to reflect both GLUT
+ API revisions and implementation revisions (i.e., bug fixes).
+
+ GLUT_XLIB_IMPLEMENTATION=1  mjk's first public release of
+ GLUT Xlib-based implementation.  11/29/94
+
+ GLUT_XLIB_IMPLEMENTATION=2  mjk's second public release of
+ GLUT Xlib-based implementation providing GLUT version 2
+ interfaces.
+
+ GLUT_XLIB_IMPLEMENTATION=3  mjk's GLUT 2.2 images. 4/17/95
+
+ GLUT_XLIB_IMPLEMENTATION=4  mjk's GLUT 2.3 images. 6/?/95
+
+ GLUT_XLIB_IMPLEMENTATION=5  mjk's GLUT 3.0 images. 10/?/95
+
+ GLUT_XLIB_IMPLEMENTATION=7  mjk's GLUT 3.1+ with glutWarpPointer.  7/24/96
+
+ GLUT_XLIB_IMPLEMENTATION=8  mjk's GLUT 3.1+ with glutWarpPointer
+ and video resize.  1/3/97
+
+ GLUT_XLIB_IMPLEMENTATION=9 mjk's GLUT 3.4 release with early GLUT 4 routines.
+
+ GLUT_XLIB_IMPLEMENTATION=11 Mesa 2.5's GLUT 3.6 release.
+
+ GLUT_XLIB_IMPLEMENTATION=12 mjk's GLUT 3.6 release with early GLUT 4 routines + signal handling.
+
+ GLUT_XLIB_IMPLEMENTATION=13 mjk's GLUT 3.7 release with GameGLUT support.
+**/
+#ifndef GLUT_XLIB_IMPLEMENTATION  /* Allow this to be overridden. */
+#define GLUT_XLIB_IMPLEMENTATION    13
+#endif
+
+/* Display mode bit masks. */
+#define GLUT_RGB            0
+#define GLUT_RGBA           GLUT_RGB
+#define GLUT_INDEX          1
+#define GLUT_SINGLE         0
+#define GLUT_DOUBLE         2
+#define GLUT_ACCUM          4
+#define GLUT_ALPHA          8
+#define GLUT_DEPTH          16
+#define GLUT_STENCIL            32
+#if (GLUT_API_VERSION >= 2)
+#define GLUT_MULTISAMPLE        128
+#define GLUT_STEREO         256
+#endif
+#if (GLUT_API_VERSION >= 3)
+#define GLUT_LUMINANCE          512
+#endif
+
+/* Mouse buttons. */
+#define GLUT_LEFT_BUTTON        0
+#define GLUT_MIDDLE_BUTTON      1
+#define GLUT_RIGHT_BUTTON       2
+
+/* Mouse button state. */
+#define GLUT_DOWN           0
+#define GLUT_UP             1
+
+#if (GLUT_API_VERSION >= 2)
+/* function keys */
+#define GLUT_KEY_F1         1
+#define GLUT_KEY_F2         2
+#define GLUT_KEY_F3         3
+#define GLUT_KEY_F4         4
+#define GLUT_KEY_F5         5
+#define GLUT_KEY_F6         6
+#define GLUT_KEY_F7         7
+#define GLUT_KEY_F8         8
+#define GLUT_KEY_F9         9
+#define GLUT_KEY_F10            10
+#define GLUT_KEY_F11            11
+#define GLUT_KEY_F12            12
+/* directional keys */
+#define GLUT_KEY_LEFT           100
+#define GLUT_KEY_UP         101
+#define GLUT_KEY_RIGHT          102
+#define GLUT_KEY_DOWN           103
+#define GLUT_KEY_PAGE_UP        104
+#define GLUT_KEY_PAGE_DOWN      105
+#define GLUT_KEY_HOME           106
+#define GLUT_KEY_END            107
+#define GLUT_KEY_INSERT         108
+#endif
+
+/* Entry/exit state. */
+#define GLUT_LEFT           0
+#define GLUT_ENTERED            1
+
+/* Menu usage state. */
+#define GLUT_MENU_NOT_IN_USE        0
+#define GLUT_MENU_IN_USE        1
+
+/* Visibility state. */
+#define GLUT_NOT_VISIBLE        0
+#define GLUT_VISIBLE            1
+
+/* Window status state. */
+#define GLUT_HIDDEN         0
+#define GLUT_FULLY_RETAINED     1
+#define GLUT_PARTIALLY_RETAINED     2
+#define GLUT_FULLY_COVERED      3
+
+/* Color index component selection values. */
+#define GLUT_RED            0
+#define GLUT_GREEN          1
+#define GLUT_BLUE           2
+
+/* Layers for use. */
+#define GLUT_NORMAL         0
+#define GLUT_OVERLAY            1
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+/* Stroke font constants (use these in GLUT program). */
+#define GLUT_STROKE_ROMAN       ((void*)0)
+#define GLUT_STROKE_MONO_ROMAN      ((void*)1)
+
+/* Bitmap font constants (use these in GLUT program). */
+#define GLUT_BITMAP_9_BY_15     ((void*)2)
+#define GLUT_BITMAP_8_BY_13     ((void*)3)
+#define GLUT_BITMAP_TIMES_ROMAN_10  ((void*)4)
+#define GLUT_BITMAP_TIMES_ROMAN_24  ((void*)5)
+#if (GLUT_API_VERSION >= 3)
+#define GLUT_BITMAP_HELVETICA_10    ((void*)6)
+#define GLUT_BITMAP_HELVETICA_12    ((void*)7)
+#define GLUT_BITMAP_HELVETICA_18    ((void*)8)
+#endif
+#else
+/* Stroke font opaque addresses (use constants instead in source code). */
+extern void *glutStrokeRoman;
+extern void *glutStrokeMonoRoman;
+
+/* Stroke font constants (use these in GLUT program). */
+#define GLUT_STROKE_ROMAN       (&glutStrokeRoman)
+#define GLUT_STROKE_MONO_ROMAN      (&glutStrokeMonoRoman)
+
+/* Bitmap font opaque addresses (use constants instead in source code). */
+extern void *glutBitmap9By15;
+extern void *glutBitmap8By13;
+extern void *glutBitmapTimesRoman10;
+extern void *glutBitmapTimesRoman24;
+extern void *glutBitmapHelvetica10;
+extern void *glutBitmapHelvetica12;
+extern void *glutBitmapHelvetica18;
+
+/* Bitmap font constants (use these in GLUT program). */
+#define GLUT_BITMAP_9_BY_15     (&glutBitmap9By15)
+#define GLUT_BITMAP_8_BY_13     (&glutBitmap8By13)
+#define GLUT_BITMAP_TIMES_ROMAN_10  (&glutBitmapTimesRoman10)
+#define GLUT_BITMAP_TIMES_ROMAN_24  (&glutBitmapTimesRoman24)
+#if (GLUT_API_VERSION >= 3)
+#define GLUT_BITMAP_HELVETICA_10    (&glutBitmapHelvetica10)
+#define GLUT_BITMAP_HELVETICA_12    (&glutBitmapHelvetica12)
+#define GLUT_BITMAP_HELVETICA_18    (&glutBitmapHelvetica18)
+#endif
+#endif
+
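
Whichever branch is taken, the same font constants feed the font sub-API declared further below. A small sketch that draws a string at a fixed raster position (the `draw_label` helper, position, and font choice are illustrative):

```c
#include <GL/glut.h>

/* Draws s using an 18-point Helvetica bitmap font; the raster position
   advances automatically after each glutBitmapCharacter call. */
static void draw_label(const char *s)
{
    glRasterPos2f(-0.9f, 0.9f);      /* position in the current coordinate system */
    while (*s)
        glutBitmapCharacter(GLUT_BITMAP_HELVETICA_18, *s++);
}
```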
+/* glutGet parameters. */
+#define GLUT_WINDOW_X           100
+#define GLUT_WINDOW_Y           101
+#define GLUT_WINDOW_WIDTH       102
+#define GLUT_WINDOW_HEIGHT      103
+#define GLUT_WINDOW_BUFFER_SIZE     104
+#define GLUT_WINDOW_STENCIL_SIZE    105
+#define GLUT_WINDOW_DEPTH_SIZE      106
+#define GLUT_WINDOW_RED_SIZE        107
+#define GLUT_WINDOW_GREEN_SIZE      108
+#define GLUT_WINDOW_BLUE_SIZE       109
+#define GLUT_WINDOW_ALPHA_SIZE      110
+#define GLUT_WINDOW_ACCUM_RED_SIZE  111
+#define GLUT_WINDOW_ACCUM_GREEN_SIZE    112
+#define GLUT_WINDOW_ACCUM_BLUE_SIZE 113
+#define GLUT_WINDOW_ACCUM_ALPHA_SIZE    114
+#define GLUT_WINDOW_DOUBLEBUFFER    115
+#define GLUT_WINDOW_RGBA        116
+#define GLUT_WINDOW_PARENT      117
+#define GLUT_WINDOW_NUM_CHILDREN    118
+#define GLUT_WINDOW_COLORMAP_SIZE   119
+#if (GLUT_API_VERSION >= 2)
+#define GLUT_WINDOW_NUM_SAMPLES     120
+#define GLUT_WINDOW_STEREO      121
+#endif
+#if (GLUT_API_VERSION >= 3)
+#define GLUT_WINDOW_CURSOR      122
+#endif
+#define GLUT_SCREEN_WIDTH       200
+#define GLUT_SCREEN_HEIGHT      201
+#define GLUT_SCREEN_WIDTH_MM        202
+#define GLUT_SCREEN_HEIGHT_MM       203
+#define GLUT_MENU_NUM_ITEMS     300
+#define GLUT_DISPLAY_MODE_POSSIBLE  400
+#define GLUT_INIT_WINDOW_X      500
+#define GLUT_INIT_WINDOW_Y      501
+#define GLUT_INIT_WINDOW_WIDTH      502
+#define GLUT_INIT_WINDOW_HEIGHT     503
+#define GLUT_INIT_DISPLAY_MODE      504
+#if (GLUT_API_VERSION >= 2)
+#define GLUT_ELAPSED_TIME       700
+#endif
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
+#define GLUT_WINDOW_FORMAT_ID       123
+#endif
+
+#if (GLUT_API_VERSION >= 2)
+/* glutDeviceGet parameters. */
+#define GLUT_HAS_KEYBOARD       600
+#define GLUT_HAS_MOUSE          601
+#define GLUT_HAS_SPACEBALL      602
+#define GLUT_HAS_DIAL_AND_BUTTON_BOX    603
+#define GLUT_HAS_TABLET         604
+#define GLUT_NUM_MOUSE_BUTTONS      605
+#define GLUT_NUM_SPACEBALL_BUTTONS  606
+#define GLUT_NUM_BUTTON_BOX_BUTTONS 607
+#define GLUT_NUM_DIALS          608
+#define GLUT_NUM_TABLET_BUTTONS     609
+#endif
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
+#define GLUT_DEVICE_IGNORE_KEY_REPEAT   610
+#define GLUT_DEVICE_KEY_REPEAT          611
+#define GLUT_HAS_JOYSTICK       612
+#define GLUT_OWNS_JOYSTICK      613
+#define GLUT_JOYSTICK_BUTTONS       614
+#define GLUT_JOYSTICK_AXES      615
+#define GLUT_JOYSTICK_POLL_RATE     616
+#endif
+
+#if (GLUT_API_VERSION >= 3)
+/* glutLayerGet parameters. */
+#define GLUT_OVERLAY_POSSIBLE           800
+#define GLUT_LAYER_IN_USE       801
+#define GLUT_HAS_OVERLAY        802
+#define GLUT_TRANSPARENT_INDEX      803
+#define GLUT_NORMAL_DAMAGED     804
+#define GLUT_OVERLAY_DAMAGED        805
+
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+/* glutVideoResizeGet parameters. */
+#define GLUT_VIDEO_RESIZE_POSSIBLE  900
+#define GLUT_VIDEO_RESIZE_IN_USE    901
+#define GLUT_VIDEO_RESIZE_X_DELTA   902
+#define GLUT_VIDEO_RESIZE_Y_DELTA   903
+#define GLUT_VIDEO_RESIZE_WIDTH_DELTA   904
+#define GLUT_VIDEO_RESIZE_HEIGHT_DELTA  905
+#define GLUT_VIDEO_RESIZE_X     906
+#define GLUT_VIDEO_RESIZE_Y     907
+#define GLUT_VIDEO_RESIZE_WIDTH     908
+#define GLUT_VIDEO_RESIZE_HEIGHT    909
+#endif
+
+/* glutUseLayer parameters. */
+#define GLUT_NORMAL         0
+#define GLUT_OVERLAY            1
+
+/* glutGetModifiers return mask. */
+#define GLUT_ACTIVE_SHIFT               1
+#define GLUT_ACTIVE_CTRL                2
+#define GLUT_ACTIVE_ALT                 4
+
+/* glutSetCursor parameters. */
+/* Basic arrows. */
+#define GLUT_CURSOR_RIGHT_ARROW     0
+#define GLUT_CURSOR_LEFT_ARROW      1
+/* Symbolic cursor shapes. */
+#define GLUT_CURSOR_INFO        2
+#define GLUT_CURSOR_DESTROY     3
+#define GLUT_CURSOR_HELP        4
+#define GLUT_CURSOR_CYCLE       5
+#define GLUT_CURSOR_SPRAY       6
+#define GLUT_CURSOR_WAIT        7
+#define GLUT_CURSOR_TEXT        8
+#define GLUT_CURSOR_CROSSHAIR       9
+/* Directional cursors. */
+#define GLUT_CURSOR_UP_DOWN     10
+#define GLUT_CURSOR_LEFT_RIGHT      11
+/* Sizing cursors. */
+#define GLUT_CURSOR_TOP_SIDE        12
+#define GLUT_CURSOR_BOTTOM_SIDE     13
+#define GLUT_CURSOR_LEFT_SIDE       14
+#define GLUT_CURSOR_RIGHT_SIDE      15
+#define GLUT_CURSOR_TOP_LEFT_CORNER 16
+#define GLUT_CURSOR_TOP_RIGHT_CORNER    17
+#define GLUT_CURSOR_BOTTOM_RIGHT_CORNER 18
+#define GLUT_CURSOR_BOTTOM_LEFT_CORNER  19
+/* Inherit from parent window. */
+#define GLUT_CURSOR_INHERIT     100
+/* Blank cursor. */
+#define GLUT_CURSOR_NONE        101
+/* Fullscreen crosshair (if available). */
+#define GLUT_CURSOR_FULL_CROSSHAIR  102
+#endif
+
+/* GLUT initialization sub-API. */
+extern void APIENTRY glutInit(int *argcp, char **argv);
+extern void APIENTRY glutInitDisplayMode(unsigned int mode);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+extern void APIENTRY glutInitDisplayString(const char *string);
+#endif
+extern void APIENTRY glutInitWindowPosition(int x, int y);
+extern void APIENTRY glutInitWindowSize(int width, int height);
+extern void APIENTRY glutMainLoop(void);
+
+/* GLUT window sub-API. */
+extern int APIENTRY glutCreateWindow(const char *title);
+extern int APIENTRY glutCreateSubWindow(int win, int x, int y, int width, int height);
+extern void APIENTRY glutDestroyWindow(int win);
+extern void APIENTRY glutPostRedisplay(void);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11)
+extern void APIENTRY glutPostWindowRedisplay(int win);
+#endif
+extern void APIENTRY glutSwapBuffers(void);
+extern int APIENTRY glutGetWindow(void);
+extern void APIENTRY glutSetWindow(int win);
+extern void APIENTRY glutSetWindowTitle(const char *title);
+extern void APIENTRY glutSetIconTitle(const char *title);
+extern void APIENTRY glutPositionWindow(int x, int y);
+extern void APIENTRY glutReshapeWindow(int width, int height);
+extern void APIENTRY glutPopWindow(void);
+extern void APIENTRY glutPushWindow(void);
+extern void APIENTRY glutIconifyWindow(void);
+extern void APIENTRY glutShowWindow(void);
+extern void APIENTRY glutHideWindow(void);
+#if (GLUT_API_VERSION >= 3)
+extern void APIENTRY glutFullScreen(void);
+extern void APIENTRY glutSetCursor(int cursor);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+extern void APIENTRY glutWarpPointer(int x, int y);
+#endif
+
+/* GLUT overlay sub-API. */
+extern void APIENTRY glutEstablishOverlay(void);
+extern void APIENTRY glutRemoveOverlay(void);
+extern void APIENTRY glutUseLayer(GLenum layer);
+extern void APIENTRY glutPostOverlayRedisplay(void);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11)
+extern void APIENTRY glutPostWindowOverlayRedisplay(int win);
+#endif
+extern void APIENTRY glutShowOverlay(void);
+extern void APIENTRY glutHideOverlay(void);
+#endif
+
+/* GLUT menu sub-API. */
+extern int APIENTRY glutCreateMenu(void (*)(int));
+extern void APIENTRY glutDestroyMenu(int menu);
+extern int APIENTRY glutGetMenu(void);
+extern void APIENTRY glutSetMenu(int menu);
+extern void APIENTRY glutAddMenuEntry(const char *label, int value);
+extern void APIENTRY glutAddSubMenu(const char *label, int submenu);
+extern void APIENTRY glutChangeToMenuEntry(int item, const char *label, int value);
+extern void APIENTRY glutChangeToSubMenu(int item, const char *label, int submenu);
+extern void APIENTRY glutRemoveMenuItem(int item);
+extern void APIENTRY glutAttachMenu(int button);
+extern void APIENTRY glutDetachMenu(int button);
+
+/* GLUT window callback sub-API. */
+extern void APIENTRY glutDisplayFunc(void (*func)(void));
+extern void APIENTRY glutReshapeFunc(void (*func)(int width, int height));
+extern void APIENTRY glutKeyboardFunc(void (*func)(unsigned char key, int x, int y));
+extern void APIENTRY glutMouseFunc(void (*func)(int button, int state, int x, int y));
+extern void APIENTRY glutMotionFunc(void (*func)(int x, int y));
+extern void APIENTRY glutPassiveMotionFunc(void (*func)(int x, int y));
+extern void APIENTRY glutEntryFunc(void (*func)(int state));
+extern void APIENTRY glutVisibilityFunc(void (*func)(int state));
+extern void APIENTRY glutIdleFunc(void (*func)(void));
+extern void APIENTRY glutTimerFunc(unsigned int millis, void (*func)(int value), int value);
+extern void APIENTRY glutMenuStateFunc(void (*func)(int state));
+#if (GLUT_API_VERSION >= 2)
+extern void APIENTRY glutSpecialFunc(void (*func)(int key, int x, int y));
+extern void APIENTRY glutSpaceballMotionFunc(void (*func)(int x, int y, int z));
+extern void APIENTRY glutSpaceballRotateFunc(void (*func)(int x, int y, int z));
+extern void APIENTRY glutSpaceballButtonFunc(void (*func)(int button, int state));
+extern void APIENTRY glutButtonBoxFunc(void (*func)(int button, int state));
+extern void APIENTRY glutDialsFunc(void (*func)(int dial, int value));
+extern void APIENTRY glutTabletMotionFunc(void (*func)(int x, int y));
+extern void APIENTRY glutTabletButtonFunc(void (*func)(int button, int state, int x, int y));
+#if (GLUT_API_VERSION >= 3)
+extern void APIENTRY glutMenuStatusFunc(void (*func)(int status, int x, int y));
+extern void APIENTRY glutOverlayDisplayFunc(void (*func)(void));
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+extern void APIENTRY glutWindowStatusFunc(void (*func)(int state));
+#endif
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
+extern void APIENTRY glutKeyboardUpFunc(void (*func)(unsigned char key, int x, int y));
+extern void APIENTRY glutSpecialUpFunc(void (*func)(int key, int x, int y));
+extern void APIENTRY glutJoystickFunc(void (*func)(unsigned int buttonMask, int x, int y, int z), int pollInterval);
+#endif
+#endif
+#endif
+
+/* GLUT color index sub-API. */
+extern void APIENTRY glutSetColor(int, GLfloat red, GLfloat green, GLfloat blue);
+extern GLfloat APIENTRY glutGetColor(int ndx, int component);
+extern void APIENTRY glutCopyColormap(int win);
+
+/* GLUT state retrieval sub-API. */
+extern int APIENTRY glutGet(GLenum type);
+extern int APIENTRY glutDeviceGet(GLenum type);
+#if (GLUT_API_VERSION >= 2)
+/* GLUT extension support sub-API */
+extern int APIENTRY glutExtensionSupported(const char *name);
+#endif
+#if (GLUT_API_VERSION >= 3)
+extern int APIENTRY glutGetModifiers(void);
+extern int APIENTRY glutLayerGet(GLenum type);
+#endif
+
+/* GLUT font sub-API */
+extern void APIENTRY glutBitmapCharacter(void *font, int character);
+extern int APIENTRY glutBitmapWidth(void *font, int character);
+extern void APIENTRY glutStrokeCharacter(void *font, int character);
+extern int APIENTRY glutStrokeWidth(void *font, int character);
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+extern int APIENTRY glutBitmapLength(void *font, const unsigned char *string);
+extern int APIENTRY glutStrokeLength(void *font, const unsigned char *string);
+#endif
+
+/* GLUT pre-built models sub-API */
+extern void APIENTRY glutWireSphere(GLdouble radius, GLint slices, GLint stacks);
+extern void APIENTRY glutSolidSphere(GLdouble radius, GLint slices, GLint stacks);
+extern void APIENTRY glutWireCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
+extern void APIENTRY glutSolidCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
+extern void APIENTRY glutWireCube(GLdouble size);
+extern void APIENTRY glutSolidCube(GLdouble size);
+extern void APIENTRY glutWireTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
+extern void APIENTRY glutSolidTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
+extern void APIENTRY glutWireDodecahedron(void);
+extern void APIENTRY glutSolidDodecahedron(void);
+extern void APIENTRY glutWireTeapot(GLdouble size);
+extern void APIENTRY glutSolidTeapot(GLdouble size);
+extern void APIENTRY glutWireOctahedron(void);
+extern void APIENTRY glutSolidOctahedron(void);
+extern void APIENTRY glutWireTetrahedron(void);
+extern void APIENTRY glutSolidTetrahedron(void);
+extern void APIENTRY glutWireIcosahedron(void);
+extern void APIENTRY glutSolidIcosahedron(void);
+
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
+/* GLUT video resize sub-API. */
+extern int APIENTRY glutVideoResizeGet(GLenum param);
+extern void APIENTRY glutSetupVideoResizing(void);
+extern void APIENTRY glutStopVideoResizing(void);
+extern void APIENTRY glutVideoResize(int x, int y, int width, int height);
+extern void APIENTRY glutVideoPan(int x, int y, int width, int height);
+
+/* GLUT debugging sub-API. */
+extern void APIENTRY glutReportErrors(void);
+#endif
+
+#if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
+/* GLUT device control sub-API. */
+/* glutSetKeyRepeat modes. */
+#define GLUT_KEY_REPEAT_OFF     0
+#define GLUT_KEY_REPEAT_ON      1
+#define GLUT_KEY_REPEAT_DEFAULT     2
+
+/* Joystick button masks. */
+#define GLUT_JOYSTICK_BUTTON_A      1
+#define GLUT_JOYSTICK_BUTTON_B      2
+#define GLUT_JOYSTICK_BUTTON_C      4
+#define GLUT_JOYSTICK_BUTTON_D      8
+
+extern void APIENTRY glutIgnoreKeyRepeat(int ignore);
+extern void APIENTRY glutSetKeyRepeat(int repeatMode);
+extern void APIENTRY glutForceJoystickFunc(void);
+
+/* GLUT game mode sub-API. */
+/* glutGameModeGet. */
+#define GLUT_GAME_MODE_ACTIVE           0
+#define GLUT_GAME_MODE_POSSIBLE         1
+#define GLUT_GAME_MODE_WIDTH            2
+#define GLUT_GAME_MODE_HEIGHT           3
+#define GLUT_GAME_MODE_PIXEL_DEPTH      4
+#define GLUT_GAME_MODE_REFRESH_RATE     5
+#define GLUT_GAME_MODE_DISPLAY_CHANGED  6
+
+extern void APIENTRY glutGameModeString(const char *string);
+extern int APIENTRY glutEnterGameMode(void);
+extern void APIENTRY glutLeaveGameMode(void);
+extern int APIENTRY glutGameModeGet(GLenum mode);
+#endif
+
+#ifdef __cplusplus
+}
+
+#endif
+
+#ifdef GLUT_APIENTRY_DEFINED
+# undef GLUT_APIENTRY_DEFINED
+# undef APIENTRY
+#endif
+
+#ifdef GLUT_WINGDIAPI_DEFINED
+# undef GLUT_WINGDIAPI_DEFINED
+# undef WINGDIAPI
+#endif
+
+#endif                  /* __glut_h__ */

File diff suppressed because it is too large
+ 1121 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glxew.h


+ 805 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/glxext.h

@@ -0,0 +1,805 @@
+#ifndef __glxext_h_
+#define __glxext_h_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** License Applicability. Except to the extent portions of this file are
+** made subject to an alternative license as permitted in the SGI Free
+** Software License B, Version 1.1 (the "License"), the contents of this
+** file are subject only to the provisions of the License. You may not use
+** this file except in compliance with the License. You may obtain a copy
+** of the License at Silicon Graphics, Inc., attn: Legal Services, 1600
+** Amphitheatre Parkway, Mountain View, CA 94043-1351, or at:
+**
+** http://oss.sgi.com/projects/FreeB
+**
+** Note that, as provided in the License, the Software is distributed on an
+** "AS IS" basis, with ALL EXPRESS AND IMPLIED WARRANTIES AND CONDITIONS
+** DISCLAIMED, INCLUDING, WITHOUT LIMITATION, ANY IMPLIED WARRANTIES AND
+** CONDITIONS OF MERCHANTABILITY, SATISFACTORY QUALITY, FITNESS FOR A
+** PARTICULAR PURPOSE, AND NON-INFRINGEMENT.
+**
+** Original Code. The Original Code is: OpenGL Sample Implementation,
+** Version 1.2.1, released January 26, 2000, developed by Silicon Graphics,
+** Inc. The Original Code is Copyright (c) 1991-2004 Silicon Graphics, Inc.
+** Copyright in any portions created by third parties is as indicated
+** elsewhere herein. All Rights Reserved.
+**
+** Additional Notice Provisions: This software was created using the
+** OpenGL(R) version 1.2.1 Sample Implementation published by SGI, but has
+** not been independently verified as being compliant with the OpenGL(R)
+** version 1.2.1 Specification.
+*/
+
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(APIENTRY) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__)
+#define WIN32_LEAN_AND_MEAN 1
+#include <windows.h>
+#endif
+
+#ifndef APIENTRY
+#define APIENTRY
+#endif
+#ifndef APIENTRYP
+#define APIENTRYP APIENTRY *
+#endif
+#ifndef GLAPI
+#define GLAPI extern
+#endif
+
+/*************************************************************/
+
+/* Header file version number, required by OpenGL ABI for Linux */
+/* glxext.h last updated 2005/01/20 */
+/* Current version at http://oss.sgi.com/projects/ogl-sample/registry/ */
+#define GLX_GLXEXT_VERSION 10
+
+#ifndef GLX_ARB_get_proc_address
+#endif
+
+#ifndef GLX_ARB_multisample
+#define GLX_SAMPLE_BUFFERS_ARB             100000
+#define GLX_SAMPLES_ARB                    100001
+#endif
+
+#ifndef GLX_ARB_fbconfig_float
+#define GLX_RGBA_FLOAT_TYPE_ARB            0x20B9
+#define GLX_RGBA_FLOAT_BIT_ARB             0x00000004
+#endif
+
+#ifndef GLX_SGIS_multisample
+#define GLX_SAMPLE_BUFFERS_SGIS            100000
+#define GLX_SAMPLES_SGIS                   100001
+#endif
+
+#ifndef GLX_EXT_visual_info
+#define GLX_X_VISUAL_TYPE_EXT              0x22
+#define GLX_TRANSPARENT_TYPE_EXT           0x23
+#define GLX_TRANSPARENT_INDEX_VALUE_EXT    0x24
+#define GLX_TRANSPARENT_RED_VALUE_EXT      0x25
+#define GLX_TRANSPARENT_GREEN_VALUE_EXT    0x26
+#define GLX_TRANSPARENT_BLUE_VALUE_EXT     0x27
+#define GLX_TRANSPARENT_ALPHA_VALUE_EXT    0x28
+#define GLX_NONE_EXT                       0x8000
+#define GLX_TRUE_COLOR_EXT                 0x8002
+#define GLX_DIRECT_COLOR_EXT               0x8003
+#define GLX_PSEUDO_COLOR_EXT               0x8004
+#define GLX_STATIC_COLOR_EXT               0x8005
+#define GLX_GRAY_SCALE_EXT                 0x8006
+#define GLX_STATIC_GRAY_EXT                0x8007
+#define GLX_TRANSPARENT_RGB_EXT            0x8008
+#define GLX_TRANSPARENT_INDEX_EXT          0x8009
+#endif
+
+#ifndef GLX_SGI_swap_control
+#endif
+
+#ifndef GLX_SGI_video_sync
+#endif
+
+#ifndef GLX_SGI_make_current_read
+#endif
+
+#ifndef GLX_SGIX_video_source
+#endif
+
+#ifndef GLX_EXT_visual_rating
+#define GLX_VISUAL_CAVEAT_EXT              0x20
+#define GLX_SLOW_VISUAL_EXT                0x8001
+#define GLX_NON_CONFORMANT_VISUAL_EXT      0x800D
+/* reuse GLX_NONE_EXT */
+#endif
+
+#ifndef GLX_EXT_import_context
+#define GLX_SHARE_CONTEXT_EXT              0x800A
+#define GLX_VISUAL_ID_EXT                  0x800B
+#define GLX_SCREEN_EXT                     0x800C
+#endif
+
+#ifndef GLX_SGIX_fbconfig
+#define GLX_WINDOW_BIT_SGIX                0x00000001
+#define GLX_PIXMAP_BIT_SGIX                0x00000002
+#define GLX_RGBA_BIT_SGIX                  0x00000001
+#define GLX_COLOR_INDEX_BIT_SGIX           0x00000002
+#define GLX_DRAWABLE_TYPE_SGIX             0x8010
+#define GLX_RENDER_TYPE_SGIX               0x8011
+#define GLX_X_RENDERABLE_SGIX              0x8012
+#define GLX_FBCONFIG_ID_SGIX               0x8013
+#define GLX_RGBA_TYPE_SGIX                 0x8014
+#define GLX_COLOR_INDEX_TYPE_SGIX          0x8015
+/* reuse GLX_SCREEN_EXT */
+#endif
+
+#ifndef GLX_SGIX_pbuffer
+#define GLX_PBUFFER_BIT_SGIX               0x00000004
+#define GLX_BUFFER_CLOBBER_MASK_SGIX       0x08000000
+#define GLX_FRONT_LEFT_BUFFER_BIT_SGIX     0x00000001
+#define GLX_FRONT_RIGHT_BUFFER_BIT_SGIX    0x00000002
+#define GLX_BACK_LEFT_BUFFER_BIT_SGIX      0x00000004
+#define GLX_BACK_RIGHT_BUFFER_BIT_SGIX     0x00000008
+#define GLX_AUX_BUFFERS_BIT_SGIX           0x00000010
+#define GLX_DEPTH_BUFFER_BIT_SGIX          0x00000020
+#define GLX_STENCIL_BUFFER_BIT_SGIX        0x00000040
+#define GLX_ACCUM_BUFFER_BIT_SGIX          0x00000080
+#define GLX_SAMPLE_BUFFERS_BIT_SGIX        0x00000100
+#define GLX_MAX_PBUFFER_WIDTH_SGIX         0x8016
+#define GLX_MAX_PBUFFER_HEIGHT_SGIX        0x8017
+#define GLX_MAX_PBUFFER_PIXELS_SGIX        0x8018
+#define GLX_OPTIMAL_PBUFFER_WIDTH_SGIX     0x8019
+#define GLX_OPTIMAL_PBUFFER_HEIGHT_SGIX    0x801A
+#define GLX_PRESERVED_CONTENTS_SGIX        0x801B
+#define GLX_LARGEST_PBUFFER_SGIX           0x801C
+#define GLX_WIDTH_SGIX                     0x801D
+#define GLX_HEIGHT_SGIX                    0x801E
+#define GLX_EVENT_MASK_SGIX                0x801F
+#define GLX_DAMAGED_SGIX                   0x8020
+#define GLX_SAVED_SGIX                     0x8021
+#define GLX_WINDOW_SGIX                    0x8022
+#define GLX_PBUFFER_SGIX                   0x8023
+#endif
+
+#ifndef GLX_SGI_cushion
+#endif
+
+#ifndef GLX_SGIX_video_resize
+#define GLX_SYNC_FRAME_SGIX                0x00000000
+#define GLX_SYNC_SWAP_SGIX                 0x00000001
+#endif
+
+#ifndef GLX_SGIX_dmbuffer
+#define GLX_DIGITAL_MEDIA_PBUFFER_SGIX     0x8024
+#endif
+
+#ifndef GLX_SGIX_swap_group
+#endif
+
+#ifndef GLX_SGIX_swap_barrier
+#endif
+
+#ifndef GLX_SGIS_blended_overlay
+#define GLX_BLENDED_RGBA_SGIS              0x8025
+#endif
+
+#ifndef GLX_SGIS_shared_multisample
+#define GLX_MULTISAMPLE_SUB_RECT_WIDTH_SGIS 0x8026
+#define GLX_MULTISAMPLE_SUB_RECT_HEIGHT_SGIS 0x8027
+#endif
+
+#ifndef GLX_SUN_get_transparent_index
+#endif
+
+#ifndef GLX_3DFX_multisample
+#define GLX_SAMPLE_BUFFERS_3DFX            0x8050
+#define GLX_SAMPLES_3DFX                   0x8051
+#endif
+
+#ifndef GLX_MESA_copy_sub_buffer
+#endif
+
+#ifndef GLX_MESA_pixmap_colormap
+#endif
+
+#ifndef GLX_MESA_release_buffers
+#endif
+
+#ifndef GLX_MESA_set_3dfx_mode
+#define GLX_3DFX_WINDOW_MODE_MESA          0x1
+#define GLX_3DFX_FULLSCREEN_MODE_MESA      0x2
+#endif
+
+#ifndef GLX_SGIX_visual_select_group
+#define GLX_VISUAL_SELECT_GROUP_SGIX       0x8028
+#endif
+
+#ifndef GLX_OML_swap_method
+#define GLX_SWAP_METHOD_OML                0x8060
+#define GLX_SWAP_EXCHANGE_OML              0x8061
+#define GLX_SWAP_COPY_OML                  0x8062
+#define GLX_SWAP_UNDEFINED_OML             0x8063
+#endif
+
+#ifndef GLX_OML_sync_control
+#endif
+
+#ifndef GLX_NV_float_buffer
+#define GLX_FLOAT_COMPONENTS_NV            0x20B0
+#endif
+
+#ifndef GLX_SGIX_hyperpipe
+#define GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX 80
+#define GLX_BAD_HYPERPIPE_CONFIG_SGIX      91
+#define GLX_BAD_HYPERPIPE_SGIX             92
+#define GLX_HYPERPIPE_DISPLAY_PIPE_SGIX    0x00000001
+#define GLX_HYPERPIPE_RENDER_PIPE_SGIX     0x00000002
+#define GLX_PIPE_RECT_SGIX                 0x00000001
+#define GLX_PIPE_RECT_LIMITS_SGIX          0x00000002
+#define GLX_HYPERPIPE_STEREO_SGIX          0x00000003
+#define GLX_HYPERPIPE_PIXEL_AVERAGE_SGIX   0x00000004
+#define GLX_HYPERPIPE_ID_SGIX              0x8030
+#endif
+
+#ifndef GLX_MESA_agp_offset
+#endif
+
+
+/*************************************************************/
+
+#ifndef GLX_ARB_get_proc_address
+/*
+ * The Linux OpenGL ABI specifies that glXGetProcAddressARB should be
+ * in glx.h, so the related defines have moved there as well.
+ */
+#endif
+
+#ifndef GLX_SGIX_video_source
+typedef XID GLXVideoSourceSGIX;
+#endif
+
+#ifndef GLX_SGIX_fbconfig
+typedef XID GLXFBConfigIDSGIX;
+typedef struct __GLXFBConfigRec *GLXFBConfigSGIX;
+#endif
+
+#ifndef GLX_SGIX_pbuffer
+typedef XID GLXPbufferSGIX;
+typedef struct
+{
+    int type;
+    unsigned long serial;     /* # of last request processed by server */
+    Bool send_event;          /* true if this came from a SendEvent request */
+    Display *display;         /* display the event was read from */
+    GLXDrawable drawable;     /* ID of the Drawable */
+    int event_type;       /* GLX_DAMAGED_SGIX or GLX_SAVED_SGIX */
+    int draw_type;        /* GLX_WINDOW_SGIX or GLX_PBUFFER_SGIX */
+    unsigned int mask;    /* mask indicating which buffers are affected */
+    int x, y;
+    int width, height;
+    int count;        /* if nonzero, at least this many more */
+} GLXBufferClobberEventSGIX;
+#endif
+
+#ifndef GLX_NV_swap_group
+#endif
+
+#ifndef GLX_NV_video_out
+/*
+ * GLXVideoDeviceNV is an opaque handle to a video device (part of the
+ * GLX_NV_video_out extension).
+ */
+typedef unsigned int GLXVideoDeviceNV;
+
+/* glXBindVideoImageNV iVideoBuffer values (NV_video_out) */
+#define GLX_VIDEO_OUT_COLOR_NV           0x20C3
+#define GLX_VIDEO_OUT_ALPHA_NV           0x20C4
+#define GLX_VIDEO_OUT_DEPTH_NV           0x20C5
+#define GLX_VIDEO_OUT_COLOR_AND_ALPHA_NV 0x20C6
+#define GLX_VIDEO_OUT_COLOR_AND_DEPTH_NV 0x20C7
+
+/* glXSendPbufferToVideoNV iBufferType values (NV_video_out) */
+#define GLX_VIDEO_OUT_FRAME_NV           0x20C8
+#define GLX_VIDEO_OUT_FIELD_1_NV         0x20C9
+#define GLX_VIDEO_OUT_FIELD_2_NV         0x20CA
+#endif
+
+#ifndef GLX_EXT_texture_from_pixmap
+/* New glXGetFBConfigAttrib <attrib_list> tokens */
+#define GLX_BIND_TO_TEXTURE_RGB_EXT      0x20D0
+#define GLX_BIND_TO_TEXTURE_RGBA_EXT     0x20D1
+#define GLX_BIND_TO_MIPMAP_TEXTURE_EXT   0x20D2
+#define GLX_BIND_TO_TEXTURE_TARGETS_EXT  0x20D3
+#define GLX_Y_INVERTED_EXT               0x20D4
+
+/* New glXCreatePixmap attributes and glXQueryDrawable attributes */
+#define GLX_TEXTURE_FORMAT_EXT           0x20D5
+#define GLX_TEXTURE_TARGET_EXT           0x20D6
+#define GLX_MIPMAP_TEXTURE_EXT           0x20D7
+
+/* Values for GLX_TEXTURE_FORMAT_EXT */
+#define GLX_TEXTURE_FORMAT_NONE_EXT      0x20D8
+#define GLX_TEXTURE_FORMAT_RGB_EXT       0x20D9
+#define GLX_TEXTURE_FORMAT_RGBA_EXT      0x20DA
+
+/* Bits for GLX_BIND_TO_TEXTURE_TARGETS_EXT mask */
+#define GLX_TEXTURE_1D_BIT_EXT           0x00000001
+#define GLX_TEXTURE_2D_BIT_EXT           0x00000002
+#define GLX_TEXTURE_RECTANGLE_BIT_EXT    0x00000004
+
+/* Values for GLX_TEXTURE_TARGET_EXT */
+#define GLX_TEXTURE_1D_EXT               0x20DB
+#define GLX_TEXTURE_2D_EXT               0x20DC
+#define GLX_TEXTURE_RECTANGLE_EXT        0x20DD
+
+/*
+ * Values for the buffer parameter of glXBindTexImageEXT and
+ * glXReleaseTexImageEXT
+ */
+#define GLX_FRONT_LEFT_EXT               0x20DE
+#define GLX_FRONT_RIGHT_EXT              0x20DF
+#define GLX_BACK_LEFT_EXT                0x20E0
+#define GLX_BACK_RIGHT_EXT               0x20E1
+#define GLX_FRONT_EXT                    GLX_FRONT_LEFT_EXT
+#define GLX_BACK_EXT                     GLX_BACK_LEFT_EXT
+#define GLX_AUX0_EXT                     0x20E2
+#define GLX_AUX1_EXT                     0x20E3
+#define GLX_AUX2_EXT                     0x20E4
+#define GLX_AUX3_EXT                     0x20E5
+#define GLX_AUX4_EXT                     0x20E6
+#define GLX_AUX5_EXT                     0x20E7
+#define GLX_AUX6_EXT                     0x20E8
+#define GLX_AUX7_EXT                     0x20E9
+#define GLX_AUX8_EXT                     0x20EA
+#define GLX_AUX9_EXT                     0x20EB
+
+#endif
+
+/* Define int32_t and int64_t types for UST/MSC */
+/* (as used in the GLX_OML_sync_control extension). */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#include <inttypes.h>
+#elif defined( __VMS ) || defined(__FreeBSD__)
+#include <inttypes.h>
+#elif (defined(__sun__) && defined(__svr4__)) || (defined(__sun) && defined(__SVR4))
+#include <inttypes.h>
+#elif defined(__SCO__) || defined(__USLC__) || defined(__linux__)
+#include <stdint.h>
+#elif defined(__UNIXOS2__) || defined(__SOL64__)
+typedef long int int32_t;
+typedef long long int int64_t;
+#else
+#error "int32_t and int64_t are undefined!"
+#endif
+
+#ifndef GLX_ARB_get_proc_address
+/* Moved to glx.h */
+#endif
+
+#ifndef GLX_ARB_multisample
+#define GLX_ARB_multisample 1
+#endif
+
+#ifndef GLX_ARB_fbconfig_float
+#define GLX_ARB_fbconfig_float 1
+#endif
+
+#ifndef GLX_SGIS_multisample
+#define GLX_SGIS_multisample 1
+#endif
+
+#ifndef GLX_EXT_visual_info
+#define GLX_EXT_visual_info 1
+#endif
+
+#ifndef GLX_SGI_swap_control
+#define GLX_SGI_swap_control 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern int glXSwapIntervalSGI(int);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef int (* PFNGLXSWAPINTERVALSGIPROC)(int interval);
+#endif
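
Each extension block in this header follows the same pattern: enum tokens, prototypes guarded by GLX_GLXEXT_PROTOTYPES, and a `PFN...PROC` typedef for run-time resolution. A hedged sketch of the usual loading idiom for the swap-control entry point above (the `enable_vsync` wrapper is a made-up helper; `glXQueryExtensionsString` and `glXGetProcAddressARB` are standard GLX calls):

```c
#include <GL/glx.h>
#include <string.h>

/* Returns nonzero if vsync was enabled via GLX_SGI_swap_control. */
int enable_vsync(Display *dpy, int screen)
{
    const char *exts = glXQueryExtensionsString(dpy, screen);
    PFNGLXSWAPINTERVALSGIPROC swap_interval;

    if (!exts || !strstr(exts, "GLX_SGI_swap_control"))
        return 0;                        /* extension not advertised */
    swap_interval = (PFNGLXSWAPINTERVALSGIPROC)
        glXGetProcAddressARB((const GLubyte *)"glXSwapIntervalSGI");
    return swap_interval != NULL && swap_interval(1) == 0;  /* 0 means success */
}
```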
+
+#ifndef GLX_SGI_video_sync
+#define GLX_SGI_video_sync 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern int glXGetVideoSyncSGI(unsigned int *);
+extern int glXWaitVideoSyncSGI(int, int, unsigned int *);
+extern int glXGetRefreshRateSGI(unsigned int *);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef int (* PFNGLXGETVIDEOSYNCSGIPROC)(unsigned int *count);
+typedef int (* PFNGLXWAITVIDEOSYNCSGIPROC)(int divisor, int remainder, unsigned int *count);
+typedef int (* PFNGLXGETREFRESHRATESGIPROC)(unsigned int *);
+#endif
+
+#ifndef GLX_SGI_make_current_read
+#define GLX_SGI_make_current_read 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern Bool glXMakeCurrentReadSGI(Display *, GLXDrawable, GLXDrawable, GLXContext);
+extern GLXDrawable glXGetCurrentReadDrawableSGI(void);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef Bool(* PFNGLXMAKECURRENTREADSGIPROC)(Display *dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx);
+typedef GLXDrawable(* PFNGLXGETCURRENTREADDRAWABLESGIPROC)(void);
+#endif
+
+#ifndef GLX_SGIX_video_source
+#define GLX_SGIX_video_source 1
+#ifdef _VL_H
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern GLXVideoSourceSGIX glXCreateGLXVideoSourceSGIX(Display *, int, VLServer, VLPath, int, VLNode);
+extern void glXDestroyGLXVideoSourceSGIX(Display *, GLXVideoSourceSGIX);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef GLXVideoSourceSGIX(* PFNGLXCREATEGLXVIDEOSOURCESGIXPROC)(Display *display, int screen, VLServer server, VLPath path, int nodeClass, VLNode drainNode);
+typedef void (* PFNGLXDESTROYGLXVIDEOSOURCESGIXPROC)(Display *dpy, GLXVideoSourceSGIX glxvideosource);
+#endif /* _VL_H */
+#endif
+
+#ifndef GLX_EXT_visual_rating
+#define GLX_EXT_visual_rating 1
+#endif
+
+#ifndef GLX_EXT_import_context
+#define GLX_EXT_import_context 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern Display *glXGetCurrentDisplayEXT(void);
+extern int glXQueryContextInfoEXT(Display *, GLXContext, int, int *);
+extern GLXContextID glXGetContextIDEXT(const GLXContext);
+extern GLXContext glXImportContextEXT(Display *, GLXContextID);
+extern void glXFreeContextEXT(Display *, GLXContext);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef Display *(* PFNGLXGETCURRENTDISPLAYEXTPROC)(void);
+typedef int (* PFNGLXQUERYCONTEXTINFOEXTPROC)(Display *dpy, GLXContext context, int attribute, int *value);
+typedef GLXContextID(* PFNGLXGETCONTEXTIDEXTPROC)(const GLXContext context);
+typedef GLXContext(* PFNGLXIMPORTCONTEXTEXTPROC)(Display *dpy, GLXContextID contextID);
+typedef void (* PFNGLXFREECONTEXTEXTPROC)(Display *dpy, GLXContext context);
+#endif
+
+#ifndef GLX_SGIX_fbconfig
+#define GLX_SGIX_fbconfig 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern int glXGetFBConfigAttribSGIX(Display *, GLXFBConfigSGIX, int, int *);
+extern GLXFBConfigSGIX *glXChooseFBConfigSGIX(Display *, int, int *, int *);
+extern GLXPixmap glXCreateGLXPixmapWithConfigSGIX(Display *, GLXFBConfigSGIX, Pixmap);
+extern GLXContext glXCreateContextWithConfigSGIX(Display *, GLXFBConfigSGIX, int, GLXContext, Bool);
+extern XVisualInfo *glXGetVisualFromFBConfigSGIX(Display *, GLXFBConfigSGIX);
+extern GLXFBConfigSGIX glXGetFBConfigFromVisualSGIX(Display *, XVisualInfo *);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef int (* PFNGLXGETFBCONFIGATTRIBSGIXPROC)(Display *dpy, GLXFBConfigSGIX config, int attribute, int *value);
+typedef GLXFBConfigSGIX *(* PFNGLXCHOOSEFBCONFIGSGIXPROC)(Display *dpy, int screen, int *attrib_list, int *nelements);
+typedef GLXPixmap(* PFNGLXCREATEGLXPIXMAPWITHCONFIGSGIXPROC)(Display *dpy, GLXFBConfigSGIX config, Pixmap pixmap);
+typedef GLXContext(* PFNGLXCREATECONTEXTWITHCONFIGSGIXPROC)(Display *dpy, GLXFBConfigSGIX config, int render_type, GLXContext share_list, Bool direct);
+typedef XVisualInfo *(* PFNGLXGETVISUALFROMFBCONFIGSGIXPROC)(Display *dpy, GLXFBConfigSGIX config);
+typedef GLXFBConfigSGIX(* PFNGLXGETFBCONFIGFROMVISUALSGIXPROC)(Display *dpy, XVisualInfo *vis);
+#endif
+
+#ifndef GLX_SGIX_pbuffer
+#define GLX_SGIX_pbuffer 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern GLXPbufferSGIX glXCreateGLXPbufferSGIX(Display *, GLXFBConfigSGIX, unsigned int, unsigned int, int *);
+extern void glXDestroyGLXPbufferSGIX(Display *, GLXPbufferSGIX);
+extern int glXQueryGLXPbufferSGIX(Display *, GLXPbufferSGIX, int, unsigned int *);
+extern void glXSelectEventSGIX(Display *, GLXDrawable, unsigned long);
+extern void glXGetSelectedEventSGIX(Display *, GLXDrawable, unsigned long *);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef GLXPbufferSGIX(* PFNGLXCREATEGLXPBUFFERSGIXPROC)(Display *dpy, GLXFBConfigSGIX config, unsigned int width, unsigned int height, int *attrib_list);
+typedef void (* PFNGLXDESTROYGLXPBUFFERSGIXPROC)(Display *dpy, GLXPbufferSGIX pbuf);
+typedef int (* PFNGLXQUERYGLXPBUFFERSGIXPROC)(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value);
+typedef void (* PFNGLXSELECTEVENTSGIXPROC)(Display *dpy, GLXDrawable drawable, unsigned long mask);
+typedef void (* PFNGLXGETSELECTEDEVENTSGIXPROC)(Display *dpy, GLXDrawable drawable, unsigned long *mask);
+#endif
+
+#ifndef GLX_SGI_cushion
+#define GLX_SGI_cushion 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern void glXCushionSGI(Display *, Window, float);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef void (* PFNGLXCUSHIONSGIPROC)(Display *dpy, Window window, float cushion);
+#endif
+
+#ifndef GLX_SGIX_video_resize
+#define GLX_SGIX_video_resize 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern int glXBindChannelToWindowSGIX(Display *, int, int, Window);
+extern int glXChannelRectSGIX(Display *, int, int, int, int, int, int);
+extern int glXQueryChannelRectSGIX(Display *, int, int, int *, int *, int *, int *);
+extern int glXQueryChannelDeltasSGIX(Display *, int, int, int *, int *, int *, int *);
+extern int glXChannelRectSyncSGIX(Display *, int, int, GLenum);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef int (* PFNGLXBINDCHANNELTOWINDOWSGIXPROC)(Display *display, int screen, int channel, Window window);
+typedef int (* PFNGLXCHANNELRECTSGIXPROC)(Display *display, int screen, int channel, int x, int y, int w, int h);
+typedef int (* PFNGLXQUERYCHANNELRECTSGIXPROC)(Display *display, int screen, int channel, int *dx, int *dy, int *dw, int *dh);
+typedef int (* PFNGLXQUERYCHANNELDELTASSGIXPROC)(Display *display, int screen, int channel, int *x, int *y, int *w, int *h);
+typedef int (* PFNGLXCHANNELRECTSYNCSGIXPROC)(Display *display, int screen, int channel, GLenum synctype);
+#endif
+
+#ifndef GLX_SGIX_dmbuffer
+#define GLX_SGIX_dmbuffer 1
+#ifdef _DM_BUFFER_H_
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern Bool glXAssociateDMPbufferSGIX(Display *, GLXPbufferSGIX, DMparams *, DMbuffer);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef Bool(* PFNGLXASSOCIATEDMPBUFFERSGIXPROC)(Display *dpy, GLXPbufferSGIX pbuffer, DMparams *params, DMbuffer dmbuffer);
+#endif /* _DM_BUFFER_H_ */
+#endif
+
+#ifndef GLX_SGIX_swap_group
+#define GLX_SGIX_swap_group 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern void glXJoinSwapGroupSGIX(Display *, GLXDrawable, GLXDrawable);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef void (* PFNGLXJOINSWAPGROUPSGIXPROC)(Display *dpy, GLXDrawable drawable, GLXDrawable member);
+#endif
+
+#ifndef GLX_SGIX_swap_barrier
+#define GLX_SGIX_swap_barrier 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern void glXBindSwapBarrierSGIX(Display *, GLXDrawable, int);
+extern Bool glXQueryMaxSwapBarriersSGIX(Display *, int, int *);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef void (* PFNGLXBINDSWAPBARRIERSGIXPROC)(Display *dpy, GLXDrawable drawable, int barrier);
+typedef Bool(* PFNGLXQUERYMAXSWAPBARRIERSSGIXPROC)(Display *dpy, int screen, int *max);
+#endif
+
+#ifndef GLX_SUN_get_transparent_index
+#define GLX_SUN_get_transparent_index 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern Status glXGetTransparentIndexSUN(Display *, Window, Window, long *);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef Status(* PFNGLXGETTRANSPARENTINDEXSUNPROC)(Display *dpy, Window overlay, Window underlay, long *pTransparentIndex);
+#endif
+
+#ifndef GLX_MESA_copy_sub_buffer
+#define GLX_MESA_copy_sub_buffer 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern void glXCopySubBufferMESA(Display *, GLXDrawable, int, int, int, int);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef void (* PFNGLXCOPYSUBBUFFERMESAPROC)(Display *dpy, GLXDrawable drawable, int x, int y, int width, int height);
+#endif
+
+#ifndef GLX_MESA_pixmap_colormap
+#define GLX_MESA_pixmap_colormap 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern GLXPixmap glXCreateGLXPixmapMESA(Display *, XVisualInfo *, Pixmap, Colormap);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef GLXPixmap(* PFNGLXCREATEGLXPIXMAPMESAPROC)(Display *dpy, XVisualInfo *visual, Pixmap pixmap, Colormap cmap);
+#endif
+
+#ifndef GLX_MESA_release_buffers
+#define GLX_MESA_release_buffers 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern Bool glXReleaseBuffersMESA(Display *, GLXDrawable);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef Bool(* PFNGLXRELEASEBUFFERSMESAPROC)(Display *dpy, GLXDrawable drawable);
+#endif
+
+#ifndef GLX_MESA_set_3dfx_mode
+#define GLX_MESA_set_3dfx_mode 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern Bool glXSet3DfxModeMESA(int);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef Bool(* PFNGLXSET3DFXMODEMESAPROC)(int mode);
+#endif
+
+#ifndef GLX_SGIX_visual_select_group
+#define GLX_SGIX_visual_select_group 1
+#endif
+
+#ifndef GLX_OML_swap_method
+#define GLX_OML_swap_method 1
+#endif
+
+#ifndef GLX_OML_sync_control
+#define GLX_OML_sync_control 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern Bool glXGetSyncValuesOML(Display *, GLXDrawable, int64_t *, int64_t *, int64_t *);
+extern Bool glXGetMscRateOML(Display *, GLXDrawable, int32_t *, int32_t *);
+extern int64_t glXSwapBuffersMscOML(Display *, GLXDrawable, int64_t, int64_t, int64_t);
+extern Bool glXWaitForMscOML(Display *, GLXDrawable, int64_t, int64_t, int64_t, int64_t *, int64_t *, int64_t *);
+extern Bool glXWaitForSbcOML(Display *, GLXDrawable, int64_t, int64_t *, int64_t *, int64_t *);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef Bool(* PFNGLXGETSYNCVALUESOMLPROC)(Display *dpy, GLXDrawable drawable, int64_t *ust, int64_t *msc, int64_t *sbc);
+typedef Bool(* PFNGLXGETMSCRATEOMLPROC)(Display *dpy, GLXDrawable drawable, int32_t *numerator, int32_t *denominator);
+typedef int64_t (* PFNGLXSWAPBUFFERSMSCOMLPROC)(Display *dpy, GLXDrawable drawable, int64_t target_msc, int64_t divisor, int64_t remainder);
+typedef Bool(* PFNGLXWAITFORMSCOMLPROC)(Display *dpy, GLXDrawable drawable, int64_t target_msc, int64_t divisor, int64_t remainder, int64_t *ust, int64_t *msc, int64_t *sbc);
+typedef Bool(* PFNGLXWAITFORSBCOMLPROC)(Display *dpy, GLXDrawable drawable, int64_t target_sbc, int64_t *ust, int64_t *msc, int64_t *sbc);
+#endif
+
+#ifndef GLX_NV_float_buffer
+#define GLX_NV_float_buffer 1
+#endif
+
+#ifndef GLX_SGIX_hyperpipe
+#define GLX_SGIX_hyperpipe 1
+
+typedef struct
+{
+    char    pipeName[GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX];
+    int     networkId;
+} GLXHyperpipeNetworkSGIX;
+
+typedef struct
+{
+    char    pipeName[GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX];
+    int     channel;
+    unsigned int participationType;
+    int     timeSlice;
+} GLXHyperpipeConfigSGIX;
+
+typedef struct
+{
+    char pipeName[GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX];
+    int srcXOrigin, srcYOrigin, srcWidth, srcHeight;
+    int destXOrigin, destYOrigin, destWidth, destHeight;
+} GLXPipeRect;
+
+typedef struct
+{
+    char pipeName[GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX];
+    int XOrigin, YOrigin, maxHeight, maxWidth;
+} GLXPipeRectLimits;
+
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern GLXHyperpipeNetworkSGIX *glXQueryHyperpipeNetworkSGIX(Display *, int *);
+extern int glXHyperpipeConfigSGIX(Display *, int, int, GLXHyperpipeConfigSGIX *, int *);
+extern GLXHyperpipeConfigSGIX *glXQueryHyperpipeConfigSGIX(Display *, int, int *);
+extern int glXDestroyHyperpipeConfigSGIX(Display *, int);
+extern int glXBindHyperpipeSGIX(Display *, int);
+extern int glXQueryHyperpipeBestAttribSGIX(Display *, int, int, int, void *, void *);
+extern int glXHyperpipeAttribSGIX(Display *, int, int, int, void *);
+extern int glXQueryHyperpipeAttribSGIX(Display *, int, int, int, void *);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef GLXHyperpipeNetworkSGIX *(* PFNGLXQUERYHYPERPIPENETWORKSGIXPROC)(Display *dpy, int *npipes);
+typedef int (* PFNGLXHYPERPIPECONFIGSGIXPROC)(Display *dpy, int networkId, int npipes, GLXHyperpipeConfigSGIX *cfg, int *hpId);
+typedef GLXHyperpipeConfigSGIX *(* PFNGLXQUERYHYPERPIPECONFIGSGIXPROC)(Display *dpy, int hpId, int *npipes);
+typedef int (* PFNGLXDESTROYHYPERPIPECONFIGSGIXPROC)(Display *dpy, int hpId);
+typedef int (* PFNGLXBINDHYPERPIPESGIXPROC)(Display *dpy, int hpId);
+typedef int (* PFNGLXQUERYHYPERPIPEBESTATTRIBSGIXPROC)(Display *dpy, int timeSlice, int attrib, int size, void *attribList, void *returnAttribList);
+typedef int (* PFNGLXHYPERPIPEATTRIBSGIXPROC)(Display *dpy, int timeSlice, int attrib, int size, void *attribList);
+typedef int (* PFNGLXQUERYHYPERPIPEATTRIBSGIXPROC)(Display *dpy, int timeSlice, int attrib, int size, void *returnAttribList);
+#endif
+
+#ifndef GLX_MESA_agp_offset
+#define GLX_MESA_agp_offset 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern unsigned int glXGetAGPOffsetMESA(const void *);
+#endif /* GLX_GLXEXT_PROTOTYPES */
+typedef unsigned int (* PFNGLXGETAGPOFFSETMESAPROC)(const void *pointer);
+#endif
+
+/*
+ * GLX_NV_vertex_array_range is not a real extension name...
+ */
+#ifndef GLX_NV_vertex_array_range
+#define GLX_NV_vertex_array_range 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern void *glXAllocateMemoryNV(GLsizei size, GLfloat readfreq,
+                                 GLfloat writefreq, GLfloat priority);
+
+extern void glXFreeMemoryNV(GLvoid *pointer);
+#endif
+typedef void *(* PFNGLXALLOCATEMEMORYNVPROC)(GLsizei size,
+                                             GLfloat readfreq,
+                                             GLfloat writefreq,
+                                             GLfloat priority);
+
+typedef void (* PFNGLXFREEMEMORYNVPROC)(GLvoid *pointer);
+#endif
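
Per the NV_vertex_array_range spec, the readfreq/writefreq/priority arguments are usage hints in [0, 1], with a priority near 1.0 requesting video memory. A sketch under those assumptions (the `alloc_fast_vertex_buffer` helper and the 1 MiB size are illustrative, and error handling is abbreviated):

```c
#include <stddef.h>
#include <GL/glx.h>   /* plus this glxext.h for the PFN... typedefs */

/* Tries to allocate a write-mostly vertex buffer in video memory. */
static void *alloc_fast_vertex_buffer(size_t size)
{
    PFNGLXALLOCATEMEMORYNVPROC alloc_nv = (PFNGLXALLOCATEMEMORYNVPROC)
        glXGetProcAddressARB((const GLubyte *)"glXAllocateMemoryNV");
    if (!alloc_nv)
        return NULL;   /* driver does not expose the entry point */
    /* readfreq 0, writefreq 0, priority 1.0f: prefer video memory */
    return alloc_nv((GLsizei)size, 0.0f, 0.0f, 1.0f);
}
```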
+
+#ifndef GLX_NV_swap_group
+#define GLX_NV_swap_group 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern Bool glXJoinSwapGroupNV(Display *dpy, GLXDrawable drawable,
+                               GLuint group);
+
+extern Bool glXBindSwapBarrierNV(Display *dpy, GLuint group, GLuint barrier);
+
+extern Bool glXQuerySwapGroupNV(Display *dpy, GLXDrawable drawable,
+                                GLuint *group, GLuint *barrier);
+
+extern Bool glXQueryMaxSwapGroupsNV(Display *dpy, int screen,
+                                    GLuint *maxGroups, GLuint *maxBarriers);
+
+extern Bool glXQueryFrameCountNV(Display *dpy, int screen, GLuint *count);
+
+extern Bool glXResetFrameCountNV(Display *dpy, int screen);
+#endif
+typedef Bool(* PFNGLXJOINSWAPGROUPNVPROC)(Display *dpy,
+                                          GLXDrawable drawable,
+                                          GLuint group);
+
+typedef Bool(* PFNGLXBINDSWAPBARRIERNVPROC)(Display *dpy,
+                                            GLuint group,
+                                            GLuint barrier);
+
+typedef Bool(* PFNGLXQUERYSWAPGROUPNVPROC)(Display *dpy,
+                                           GLXDrawable drawable,
+                                           GLuint *group,
+                                           GLuint *barrier);
+
+typedef Bool(* PFNGLXQUERYMAXSWAPGROUPSNVPROC)(Display *dpy,
+                                               int screen,
+                                               GLuint *maxGroups,
+                                               GLuint *maxBarriers);
+
+typedef Bool(* PFNGLXQUERYFRAMECOUNTNVPROC)(Display *dpy,
+                                            int screen,
+                                            GLuint *count);
+
+typedef Bool(* PFNGLXRESETFRAMECOUNTNVPROC)(Display *dpy, int screen);
+#endif
+
+#ifndef GLX_NV_video_out
+#define GLX_NV_video_out 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern int glXGetVideoDeviceNV(Display *dpy, int screen, int numVideoDevices,
+                               GLXVideoDeviceNV *pVideoDevice);
+
+extern int glXReleaseVideoDeviceNV(Display *dpy, int screen,
+                                   GLXVideoDeviceNV VideoDevice);
+
+extern int glXBindVideoImageNV(Display *dpy, GLXVideoDeviceNV VideoDevice,
+                               GLXPbuffer pbuf, int iVideoBuffer);
+
+extern int glXReleaseVideoImageNV(Display *dpy, GLXPbuffer pbuf);
+
+extern int glXSendPbufferToVideoNV(Display *dpy, GLXPbuffer pbuf,
+                                   int iBufferType,
+                                   unsigned long *pulCounterPbuffer,
+                                   GLboolean bBlock);
+
+extern int glXGetVideoInfoNV(Display *dpy, int screen,
+                             GLXVideoDeviceNV VideoDevice,
+                             unsigned long *pulCounterOutputVideo,
+                             unsigned long *pulCounterOutputPbuffer);
+#endif
+typedef int (* PFNGLXGETVIDEODEVICENVPROC)(Display *dpy,
+                                           int screen,
+                                           int numVideoDevices,
+                                           GLXVideoDeviceNV *pVideoDevice);
+
+typedef int (* PFNGLXRELEASEVIDEODEVICENVPROC)(Display *dpy,
+                                               int screen,
+                                               GLXVideoDeviceNV VideoDevice);
+
+typedef int (* PFNGLXBINDVIDEOIMAGENVPROC)(Display *dpy,
+                                           GLXVideoDeviceNV VideoDevice,
+                                           GLXPbuffer pbuf,
+                                           int iVideoBuffer);
+
+typedef int (* PFNGLXRELEASEVIDEOIMAGENVPROC)(Display *dpy,
+                                              GLXPbuffer pbuf);
+
+typedef int (* PFNGLXSENDPBUFFERTOVIDEONVPROC)(Display *dpy,
+                                               GLXPbuffer pbuf,
+                                               int iBufferType,
+                                               unsigned long *pulCounterPbuffer,
+                                               GLboolean bBlock);
+
+typedef int (* PFNGLXGETVIDEOINFONVPROC)(Display *dpy, int screen,
+                                         GLXVideoDeviceNV VideoDevice,
+                                         unsigned long *pulCounterOutputVideo,
+                                         unsigned long *pulCounterOutputPbuffer);
+#endif
+
+#ifndef GLX_EXT_texture_from_pixmap
+#define GLX_EXT_texture_from_pixmap 1
+#ifdef GLX_GLXEXT_PROTOTYPES
+extern void glXBindTexImageEXT(Display *dpy, GLXDrawable drawable,
+                               int buffer, const int *attrib_list);
+extern void glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable,
+                                  int buffer);
+#endif
+typedef void (* PFNGLXBINDTEXIMAGEEXTPROC)(Display *dpy,
+                                           GLXDrawable drawable,
+                                           int buffer,
+                                           const int *attrib_list);
+typedef void (* PFNGLXRELEASETEXIMAGEEXTPROC)(Display *dpy,
+                                              GLXDrawable drawable,
+                                              int buffer);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
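The PFNGLX... typedefs in this header exist so that applications can resolve extension entry points at run time instead of linking against them directly. A minimal sketch of that pattern, using the GLX_NV_swap_group typedef above as the example (it assumes the extension string has already been checked via glXQueryExtensionsString; error handling is trimmed):

    #include <GL/glx.h>
    #include <GL/glxext.h>   /* provides the PFNGLX... typedefs */

    static PFNGLXJOINSWAPGROUPNVPROC pglXJoinSwapGroupNV;

    /* Resolve the entry point; returns nonzero on success. */
    static int loadJoinSwapGroupNV(void)
    {
        pglXJoinSwapGroupNV = (PFNGLXJOINSWAPGROUPNVPROC)
            glXGetProcAddressARB((const GLubyte *)"glXJoinSwapGroupNV");
        return pglXJoinSwapGroupNV != NULL;
    }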

+ 958 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/wglew.h

@@ -0,0 +1,958 @@
+/*
+** The OpenGL Extension Wrangler Library
+** Copyright (C) 2002-2006, Milan Ikits <milan ikits[]ieee org>
+** Copyright (C) 2002-2006, Marcelo E. Magallon <mmagallo[]debian org>
+** Copyright (C) 2002, Lev Povalahev
+** All rights reserved.
+**
+** Redistribution and use in source and binary forms, with or without
+** modification, are permitted provided that the following conditions are met:
+**
+** * Redistributions of source code must retain the above copyright notice,
+**   this list of conditions and the following disclaimer.
+** * Redistributions in binary form must reproduce the above copyright notice,
+**   this list of conditions and the following disclaimer in the documentation
+**   and/or other materials provided with the distribution.
+** * The name of the author may be used to endorse or promote products
+**   derived from this software without specific prior written permission.
+**
+** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+** THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __wglew_h__
+#define __wglew_h__
+#define __WGLEW_H__
+
+#ifdef __wglext_h_
+#error wglext.h included before wglew.h
+#endif
+
+#define __wglext_h_
+
+#if !defined(APIENTRY) && !defined(__CYGWIN__)
+#  ifndef WIN32_LEAN_AND_MEAN
+#    define WIN32_LEAN_AND_MEAN 1
+#  endif
+#include <windows.h>
+#endif
+
+/*
+ * GLEW_STATIC needs to be set when using the static version.
+ * GLEW_BUILD is set when building the DLL version.
+ */
+#ifdef GLEW_STATIC
+#  define GLEWAPI extern
+#else
+#  ifdef GLEW_BUILD
+#    define GLEWAPI extern __declspec(dllexport)
+#  else
+#    define GLEWAPI extern __declspec(dllimport)
+#  endif
+#endif
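+/*
+ * Usage sketch: an application linking the static GLEW library defines
+ * GLEW_STATIC before including this header (or passes -DGLEW_STATIC to
+ * the compiler); GLEW_BUILD is defined only while building the DLL itself:
+ *
+ *   #define GLEW_STATIC
+ *   #include <GL/wglew.h>
+ */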
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* -------------------------- WGL_3DFX_multisample ------------------------- */
+
+#ifndef WGL_3DFX_multisample
+#define WGL_3DFX_multisample 1
+
+#define WGL_SAMPLE_BUFFERS_3DFX 0x2060
+#define WGL_SAMPLES_3DFX 0x2061
+
+#define WGLEW_3DFX_multisample WGLEW_GET_VAR(__WGLEW_3DFX_multisample)
+
+#endif /* WGL_3DFX_multisample */
+
+/* ------------------------- WGL_3DL_stereo_control ------------------------ */
+
+#ifndef WGL_3DL_stereo_control
+#define WGL_3DL_stereo_control 1
+
+#define WGL_STEREO_EMITTER_ENABLE_3DL 0x2055
+#define WGL_STEREO_EMITTER_DISABLE_3DL 0x2056
+#define WGL_STEREO_POLARITY_NORMAL_3DL 0x2057
+#define WGL_STEREO_POLARITY_INVERT_3DL 0x2058
+
+typedef BOOL (WINAPI *PFNWGLSETSTEREOEMITTERSTATE3DLPROC)(HDC hDC, UINT uState);
+
+#define wglSetStereoEmitterState3DL WGLEW_GET_FUN(__wglewSetStereoEmitterState3DL)
+
+#define WGLEW_3DL_stereo_control WGLEW_GET_VAR(__WGLEW_3DL_stereo_control)
+
+#endif /* WGL_3DL_stereo_control */
+
+/* ------------------------- WGL_ARB_buffer_region ------------------------- */
+
+#ifndef WGL_ARB_buffer_region
+#define WGL_ARB_buffer_region 1
+
+#define WGL_FRONT_COLOR_BUFFER_BIT_ARB 0x00000001
+#define WGL_BACK_COLOR_BUFFER_BIT_ARB 0x00000002
+#define WGL_DEPTH_BUFFER_BIT_ARB 0x00000004
+#define WGL_STENCIL_BUFFER_BIT_ARB 0x00000008
+
+typedef HANDLE(WINAPI *PFNWGLCREATEBUFFERREGIONARBPROC)(HDC hDC, int iLayerPlane, UINT uType);
+typedef VOID (WINAPI *PFNWGLDELETEBUFFERREGIONARBPROC)(HANDLE hRegion);
+typedef BOOL (WINAPI *PFNWGLRESTOREBUFFERREGIONARBPROC)(HANDLE hRegion, int x, int y, int width, int height, int xSrc, int ySrc);
+typedef BOOL (WINAPI *PFNWGLSAVEBUFFERREGIONARBPROC)(HANDLE hRegion, int x, int y, int width, int height);
+
+#define wglCreateBufferRegionARB WGLEW_GET_FUN(__wglewCreateBufferRegionARB)
+#define wglDeleteBufferRegionARB WGLEW_GET_FUN(__wglewDeleteBufferRegionARB)
+#define wglRestoreBufferRegionARB WGLEW_GET_FUN(__wglewRestoreBufferRegionARB)
+#define wglSaveBufferRegionARB WGLEW_GET_FUN(__wglewSaveBufferRegionARB)
+
+#define WGLEW_ARB_buffer_region WGLEW_GET_VAR(__WGLEW_ARB_buffer_region)
+
+#endif /* WGL_ARB_buffer_region */
+
+/* ----------------------- WGL_ARB_extensions_string ----------------------- */
+
+#ifndef WGL_ARB_extensions_string
+#define WGL_ARB_extensions_string 1
+
+typedef const char *(WINAPI *PFNWGLGETEXTENSIONSSTRINGARBPROC)(HDC hdc);
+
+#define wglGetExtensionsStringARB WGLEW_GET_FUN(__wglewGetExtensionsStringARB)
+
+#define WGLEW_ARB_extensions_string WGLEW_GET_VAR(__WGLEW_ARB_extensions_string)
+
+#endif /* WGL_ARB_extensions_string */
+
+/* ----------------------- WGL_ARB_make_current_read ----------------------- */
+
+#ifndef WGL_ARB_make_current_read
+#define WGL_ARB_make_current_read 1
+
+typedef HDC(WINAPI *PFNWGLGETCURRENTREADDCARBPROC)(VOID);
+typedef BOOL (WINAPI *PFNWGLMAKECONTEXTCURRENTARBPROC)(HDC hDrawDC, HDC hReadDC, HGLRC hglrc);
+
+#define wglGetCurrentReadDCARB WGLEW_GET_FUN(__wglewGetCurrentReadDCARB)
+#define wglMakeContextCurrentARB WGLEW_GET_FUN(__wglewMakeContextCurrentARB)
+
+#define WGLEW_ARB_make_current_read WGLEW_GET_VAR(__WGLEW_ARB_make_current_read)
+
+#endif /* WGL_ARB_make_current_read */
+
+/* -------------------------- WGL_ARB_multisample -------------------------- */
+
+#ifndef WGL_ARB_multisample
+#define WGL_ARB_multisample 1
+
+#define WGL_SAMPLE_BUFFERS_ARB 0x2041
+#define WGL_SAMPLES_ARB 0x2042
+
+#define WGLEW_ARB_multisample WGLEW_GET_VAR(__WGLEW_ARB_multisample)
+
+#endif /* WGL_ARB_multisample */
+
+/* ---------------------------- WGL_ARB_pbuffer ---------------------------- */
+
+#ifndef WGL_ARB_pbuffer
+#define WGL_ARB_pbuffer 1
+
+#define WGL_DRAW_TO_PBUFFER_ARB 0x202D
+#define WGL_MAX_PBUFFER_PIXELS_ARB 0x202E
+#define WGL_MAX_PBUFFER_WIDTH_ARB 0x202F
+#define WGL_MAX_PBUFFER_HEIGHT_ARB 0x2030
+#define WGL_PBUFFER_LARGEST_ARB 0x2033
+#define WGL_PBUFFER_WIDTH_ARB 0x2034
+#define WGL_PBUFFER_HEIGHT_ARB 0x2035
+#define WGL_PBUFFER_LOST_ARB 0x2036
+
+DECLARE_HANDLE(HPBUFFERARB);
+
+typedef HPBUFFERARB(WINAPI *PFNWGLCREATEPBUFFERARBPROC)(HDC hDC, int iPixelFormat, int iWidth, int iHeight, const int *piAttribList);
+typedef BOOL (WINAPI *PFNWGLDESTROYPBUFFERARBPROC)(HPBUFFERARB hPbuffer);
+typedef HDC(WINAPI *PFNWGLGETPBUFFERDCARBPROC)(HPBUFFERARB hPbuffer);
+typedef BOOL (WINAPI *PFNWGLQUERYPBUFFERARBPROC)(HPBUFFERARB hPbuffer, int iAttribute, int *piValue);
+typedef int (WINAPI *PFNWGLRELEASEPBUFFERDCARBPROC)(HPBUFFERARB hPbuffer, HDC hDC);
+
+#define wglCreatePbufferARB WGLEW_GET_FUN(__wglewCreatePbufferARB)
+#define wglDestroyPbufferARB WGLEW_GET_FUN(__wglewDestroyPbufferARB)
+#define wglGetPbufferDCARB WGLEW_GET_FUN(__wglewGetPbufferDCARB)
+#define wglQueryPbufferARB WGLEW_GET_FUN(__wglewQueryPbufferARB)
+#define wglReleasePbufferDCARB WGLEW_GET_FUN(__wglewReleasePbufferDCARB)
+
+#define WGLEW_ARB_pbuffer WGLEW_GET_VAR(__WGLEW_ARB_pbuffer)
+
+#endif /* WGL_ARB_pbuffer */
+
+/* -------------------------- WGL_ARB_pixel_format ------------------------- */
+
+#ifndef WGL_ARB_pixel_format
+#define WGL_ARB_pixel_format 1
+
+#define WGL_NUMBER_PIXEL_FORMATS_ARB 0x2000
+#define WGL_DRAW_TO_WINDOW_ARB 0x2001
+#define WGL_DRAW_TO_BITMAP_ARB 0x2002
+#define WGL_ACCELERATION_ARB 0x2003
+#define WGL_NEED_PALETTE_ARB 0x2004
+#define WGL_NEED_SYSTEM_PALETTE_ARB 0x2005
+#define WGL_SWAP_LAYER_BUFFERS_ARB 0x2006
+#define WGL_SWAP_METHOD_ARB 0x2007
+#define WGL_NUMBER_OVERLAYS_ARB 0x2008
+#define WGL_NUMBER_UNDERLAYS_ARB 0x2009
+#define WGL_TRANSPARENT_ARB 0x200A
+#define WGL_SHARE_DEPTH_ARB 0x200C
+#define WGL_SHARE_STENCIL_ARB 0x200D
+#define WGL_SHARE_ACCUM_ARB 0x200E
+#define WGL_SUPPORT_GDI_ARB 0x200F
+#define WGL_SUPPORT_OPENGL_ARB 0x2010
+#define WGL_DOUBLE_BUFFER_ARB 0x2011
+#define WGL_STEREO_ARB 0x2012
+#define WGL_PIXEL_TYPE_ARB 0x2013
+#define WGL_COLOR_BITS_ARB 0x2014
+#define WGL_RED_BITS_ARB 0x2015
+#define WGL_RED_SHIFT_ARB 0x2016
+#define WGL_GREEN_BITS_ARB 0x2017
+#define WGL_GREEN_SHIFT_ARB 0x2018
+#define WGL_BLUE_BITS_ARB 0x2019
+#define WGL_BLUE_SHIFT_ARB 0x201A
+#define WGL_ALPHA_BITS_ARB 0x201B
+#define WGL_ALPHA_SHIFT_ARB 0x201C
+#define WGL_ACCUM_BITS_ARB 0x201D
+#define WGL_ACCUM_RED_BITS_ARB 0x201E
+#define WGL_ACCUM_GREEN_BITS_ARB 0x201F
+#define WGL_ACCUM_BLUE_BITS_ARB 0x2020
+#define WGL_ACCUM_ALPHA_BITS_ARB 0x2021
+#define WGL_DEPTH_BITS_ARB 0x2022
+#define WGL_STENCIL_BITS_ARB 0x2023
+#define WGL_AUX_BUFFERS_ARB 0x2024
+#define WGL_NO_ACCELERATION_ARB 0x2025
+#define WGL_GENERIC_ACCELERATION_ARB 0x2026
+#define WGL_FULL_ACCELERATION_ARB 0x2027
+#define WGL_SWAP_EXCHANGE_ARB 0x2028
+#define WGL_SWAP_COPY_ARB 0x2029
+#define WGL_SWAP_UNDEFINED_ARB 0x202A
+#define WGL_TYPE_RGBA_ARB 0x202B
+#define WGL_TYPE_COLORINDEX_ARB 0x202C
+#define WGL_TRANSPARENT_RED_VALUE_ARB 0x2037
+#define WGL_TRANSPARENT_GREEN_VALUE_ARB 0x2038
+#define WGL_TRANSPARENT_BLUE_VALUE_ARB 0x2039
+#define WGL_TRANSPARENT_ALPHA_VALUE_ARB 0x203A
+#define WGL_TRANSPARENT_INDEX_VALUE_ARB 0x203B
+
+typedef BOOL (WINAPI *PFNWGLCHOOSEPIXELFORMATARBPROC)(HDC hdc, const int *piAttribIList, const FLOAT *pfAttribFList, UINT nMaxFormats, int *piFormats, UINT *nNumFormats);
+typedef BOOL (WINAPI *PFNWGLGETPIXELFORMATATTRIBFVARBPROC)(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, const int *piAttributes, FLOAT *pfValues);
+typedef BOOL (WINAPI *PFNWGLGETPIXELFORMATATTRIBIVARBPROC)(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, const int *piAttributes, int *piValues);
+
+#define wglChoosePixelFormatARB WGLEW_GET_FUN(__wglewChoosePixelFormatARB)
+#define wglGetPixelFormatAttribfvARB WGLEW_GET_FUN(__wglewGetPixelFormatAttribfvARB)
+#define wglGetPixelFormatAttribivARB WGLEW_GET_FUN(__wglewGetPixelFormatAttribivARB)
+
+#define WGLEW_ARB_pixel_format WGLEW_GET_VAR(__WGLEW_ARB_pixel_format)
+
+#endif /* WGL_ARB_pixel_format */
+
+/* ----------------------- WGL_ARB_pixel_format_float ---------------------- */
+
+#ifndef WGL_ARB_pixel_format_float
+#define WGL_ARB_pixel_format_float 1
+
+#define WGL_TYPE_RGBA_FLOAT_ARB 0x21A0
+
+#define WGLEW_ARB_pixel_format_float WGLEW_GET_VAR(__WGLEW_ARB_pixel_format_float)
+
+#endif /* WGL_ARB_pixel_format_float */
+
+/* ------------------------- WGL_ARB_render_texture ------------------------ */
+
+#ifndef WGL_ARB_render_texture
+#define WGL_ARB_render_texture 1
+
+#define WGL_BIND_TO_TEXTURE_RGB_ARB 0x2070
+#define WGL_BIND_TO_TEXTURE_RGBA_ARB 0x2071
+#define WGL_TEXTURE_FORMAT_ARB 0x2072
+#define WGL_TEXTURE_TARGET_ARB 0x2073
+#define WGL_MIPMAP_TEXTURE_ARB 0x2074
+#define WGL_TEXTURE_RGB_ARB 0x2075
+#define WGL_TEXTURE_RGBA_ARB 0x2076
+#define WGL_NO_TEXTURE_ARB 0x2077
+#define WGL_TEXTURE_CUBE_MAP_ARB 0x2078
+#define WGL_TEXTURE_1D_ARB 0x2079
+#define WGL_TEXTURE_2D_ARB 0x207A
+#define WGL_MIPMAP_LEVEL_ARB 0x207B
+#define WGL_CUBE_MAP_FACE_ARB 0x207C
+#define WGL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB 0x207D
+#define WGL_TEXTURE_CUBE_MAP_NEGATIVE_X_ARB 0x207E
+#define WGL_TEXTURE_CUBE_MAP_POSITIVE_Y_ARB 0x207F
+#define WGL_TEXTURE_CUBE_MAP_NEGATIVE_Y_ARB 0x2080
+#define WGL_TEXTURE_CUBE_MAP_POSITIVE_Z_ARB 0x2081
+#define WGL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB 0x2082
+#define WGL_FRONT_LEFT_ARB 0x2083
+#define WGL_FRONT_RIGHT_ARB 0x2084
+#define WGL_BACK_LEFT_ARB 0x2085
+#define WGL_BACK_RIGHT_ARB 0x2086
+#define WGL_AUX0_ARB 0x2087
+#define WGL_AUX1_ARB 0x2088
+#define WGL_AUX2_ARB 0x2089
+#define WGL_AUX3_ARB 0x208A
+#define WGL_AUX4_ARB 0x208B
+#define WGL_AUX5_ARB 0x208C
+#define WGL_AUX6_ARB 0x208D
+#define WGL_AUX7_ARB 0x208E
+#define WGL_AUX8_ARB 0x208F
+#define WGL_AUX9_ARB 0x2090
+
+typedef BOOL (WINAPI *PFNWGLBINDTEXIMAGEARBPROC)(HPBUFFERARB hPbuffer, int iBuffer);
+typedef BOOL (WINAPI *PFNWGLRELEASETEXIMAGEARBPROC)(HPBUFFERARB hPbuffer, int iBuffer);
+typedef BOOL (WINAPI *PFNWGLSETPBUFFERATTRIBARBPROC)(HPBUFFERARB hPbuffer, const int *piAttribList);
+
+#define wglBindTexImageARB WGLEW_GET_FUN(__wglewBindTexImageARB)
+#define wglReleaseTexImageARB WGLEW_GET_FUN(__wglewReleaseTexImageARB)
+#define wglSetPbufferAttribARB WGLEW_GET_FUN(__wglewSetPbufferAttribARB)
+
+#define WGLEW_ARB_render_texture WGLEW_GET_VAR(__WGLEW_ARB_render_texture)
+
+#endif /* WGL_ARB_render_texture */
+
+/* ----------------------- WGL_ATI_pixel_format_float ---------------------- */
+
+#ifndef WGL_ATI_pixel_format_float
+#define WGL_ATI_pixel_format_float 1
+
+#define WGL_TYPE_RGBA_FLOAT_ATI 0x21A0
+#define GL_RGBA_FLOAT_MODE_ATI 0x8820
+#define GL_COLOR_CLEAR_UNCLAMPED_VALUE_ATI 0x8835
+
+#define WGLEW_ATI_pixel_format_float WGLEW_GET_VAR(__WGLEW_ATI_pixel_format_float)
+
+#endif /* WGL_ATI_pixel_format_float */
+
+/* -------------------- WGL_ATI_render_texture_rectangle ------------------- */
+
+#ifndef WGL_ATI_render_texture_rectangle
+#define WGL_ATI_render_texture_rectangle 1
+
+#define WGL_TEXTURE_RECTANGLE_ATI 0x21A5
+
+#define WGLEW_ATI_render_texture_rectangle WGLEW_GET_VAR(__WGLEW_ATI_render_texture_rectangle)
+
+#endif /* WGL_ATI_render_texture_rectangle */
+
+/* -------------------------- WGL_EXT_depth_float -------------------------- */
+
+#ifndef WGL_EXT_depth_float
+#define WGL_EXT_depth_float 1
+
+#define WGL_DEPTH_FLOAT_EXT 0x2040
+
+#define WGLEW_EXT_depth_float WGLEW_GET_VAR(__WGLEW_EXT_depth_float)
+
+#endif /* WGL_EXT_depth_float */
+
+/* ---------------------- WGL_EXT_display_color_table ---------------------- */
+
+#ifndef WGL_EXT_display_color_table
+#define WGL_EXT_display_color_table 1
+
+typedef GLboolean(WINAPI *PFNWGLBINDDISPLAYCOLORTABLEEXTPROC)(GLushort id);
+typedef GLboolean(WINAPI *PFNWGLCREATEDISPLAYCOLORTABLEEXTPROC)(GLushort id);
+typedef void (WINAPI *PFNWGLDESTROYDISPLAYCOLORTABLEEXTPROC)(GLushort id);
+typedef GLboolean(WINAPI *PFNWGLLOADDISPLAYCOLORTABLEEXTPROC)(GLushort *table, GLuint length);
+
+#define wglBindDisplayColorTableEXT WGLEW_GET_FUN(__wglewBindDisplayColorTableEXT)
+#define wglCreateDisplayColorTableEXT WGLEW_GET_FUN(__wglewCreateDisplayColorTableEXT)
+#define wglDestroyDisplayColorTableEXT WGLEW_GET_FUN(__wglewDestroyDisplayColorTableEXT)
+#define wglLoadDisplayColorTableEXT WGLEW_GET_FUN(__wglewLoadDisplayColorTableEXT)
+
+#define WGLEW_EXT_display_color_table WGLEW_GET_VAR(__WGLEW_EXT_display_color_table)
+
+#endif /* WGL_EXT_display_color_table */
+
+/* ----------------------- WGL_EXT_extensions_string ----------------------- */
+
+#ifndef WGL_EXT_extensions_string
+#define WGL_EXT_extensions_string 1
+
+typedef const char *(WINAPI *PFNWGLGETEXTENSIONSSTRINGEXTPROC)(void);
+
+#define wglGetExtensionsStringEXT WGLEW_GET_FUN(__wglewGetExtensionsStringEXT)
+
+#define WGLEW_EXT_extensions_string WGLEW_GET_VAR(__WGLEW_EXT_extensions_string)
+
+#endif /* WGL_EXT_extensions_string */
+
+/* ------------------------ WGL_EXT_framebuffer_sRGB ----------------------- */
+
+#ifndef WGL_EXT_framebuffer_sRGB
+#define WGL_EXT_framebuffer_sRGB 1
+
+#define WGL_FRAMEBUFFER_SRGB_CAPABLE_EXT 0x20A9
+
+#define WGLEW_EXT_framebuffer_sRGB WGLEW_GET_VAR(__WGLEW_EXT_framebuffer_sRGB)
+
+#endif /* WGL_EXT_framebuffer_sRGB */
+
+/* ----------------------- WGL_EXT_make_current_read ----------------------- */
+
+#ifndef WGL_EXT_make_current_read
+#define WGL_EXT_make_current_read 1
+
+typedef HDC(WINAPI *PFNWGLGETCURRENTREADDCEXTPROC)(VOID);
+typedef BOOL (WINAPI *PFNWGLMAKECONTEXTCURRENTEXTPROC)(HDC hDrawDC, HDC hReadDC, HGLRC hglrc);
+
+#define wglGetCurrentReadDCEXT WGLEW_GET_FUN(__wglewGetCurrentReadDCEXT)
+#define wglMakeContextCurrentEXT WGLEW_GET_FUN(__wglewMakeContextCurrentEXT)
+
+#define WGLEW_EXT_make_current_read WGLEW_GET_VAR(__WGLEW_EXT_make_current_read)
+
+#endif /* WGL_EXT_make_current_read */
+
+/* -------------------------- WGL_EXT_multisample -------------------------- */
+
+#ifndef WGL_EXT_multisample
+#define WGL_EXT_multisample 1
+
+#define WGL_SAMPLE_BUFFERS_EXT 0x2041
+#define WGL_SAMPLES_EXT 0x2042
+
+#define WGLEW_EXT_multisample WGLEW_GET_VAR(__WGLEW_EXT_multisample)
+
+#endif /* WGL_EXT_multisample */
+
+/* ---------------------------- WGL_EXT_pbuffer ---------------------------- */
+
+#ifndef WGL_EXT_pbuffer
+#define WGL_EXT_pbuffer 1
+
+#define WGL_DRAW_TO_PBUFFER_EXT 0x202D
+#define WGL_MAX_PBUFFER_PIXELS_EXT 0x202E
+#define WGL_MAX_PBUFFER_WIDTH_EXT 0x202F
+#define WGL_MAX_PBUFFER_HEIGHT_EXT 0x2030
+#define WGL_OPTIMAL_PBUFFER_WIDTH_EXT 0x2031
+#define WGL_OPTIMAL_PBUFFER_HEIGHT_EXT 0x2032
+#define WGL_PBUFFER_LARGEST_EXT 0x2033
+#define WGL_PBUFFER_WIDTH_EXT 0x2034
+#define WGL_PBUFFER_HEIGHT_EXT 0x2035
+
+DECLARE_HANDLE(HPBUFFEREXT);
+
+typedef HPBUFFEREXT(WINAPI *PFNWGLCREATEPBUFFEREXTPROC)(HDC hDC, int iPixelFormat, int iWidth, int iHeight, const int *piAttribList);
+typedef BOOL (WINAPI *PFNWGLDESTROYPBUFFEREXTPROC)(HPBUFFEREXT hPbuffer);
+typedef HDC(WINAPI *PFNWGLGETPBUFFERDCEXTPROC)(HPBUFFEREXT hPbuffer);
+typedef BOOL (WINAPI *PFNWGLQUERYPBUFFEREXTPROC)(HPBUFFEREXT hPbuffer, int iAttribute, int *piValue);
+typedef int (WINAPI *PFNWGLRELEASEPBUFFERDCEXTPROC)(HPBUFFEREXT hPbuffer, HDC hDC);
+
+#define wglCreatePbufferEXT WGLEW_GET_FUN(__wglewCreatePbufferEXT)
+#define wglDestroyPbufferEXT WGLEW_GET_FUN(__wglewDestroyPbufferEXT)
+#define wglGetPbufferDCEXT WGLEW_GET_FUN(__wglewGetPbufferDCEXT)
+#define wglQueryPbufferEXT WGLEW_GET_FUN(__wglewQueryPbufferEXT)
+#define wglReleasePbufferDCEXT WGLEW_GET_FUN(__wglewReleasePbufferDCEXT)
+
+#define WGLEW_EXT_pbuffer WGLEW_GET_VAR(__WGLEW_EXT_pbuffer)
+
+#endif /* WGL_EXT_pbuffer */
+
+/* -------------------------- WGL_EXT_pixel_format ------------------------- */
+
+#ifndef WGL_EXT_pixel_format
+#define WGL_EXT_pixel_format 1
+
+#define WGL_NUMBER_PIXEL_FORMATS_EXT 0x2000
+#define WGL_DRAW_TO_WINDOW_EXT 0x2001
+#define WGL_DRAW_TO_BITMAP_EXT 0x2002
+#define WGL_ACCELERATION_EXT 0x2003
+#define WGL_NEED_PALETTE_EXT 0x2004
+#define WGL_NEED_SYSTEM_PALETTE_EXT 0x2005
+#define WGL_SWAP_LAYER_BUFFERS_EXT 0x2006
+#define WGL_SWAP_METHOD_EXT 0x2007
+#define WGL_NUMBER_OVERLAYS_EXT 0x2008
+#define WGL_NUMBER_UNDERLAYS_EXT 0x2009
+#define WGL_TRANSPARENT_EXT 0x200A
+#define WGL_TRANSPARENT_VALUE_EXT 0x200B
+#define WGL_SHARE_DEPTH_EXT 0x200C
+#define WGL_SHARE_STENCIL_EXT 0x200D
+#define WGL_SHARE_ACCUM_EXT 0x200E
+#define WGL_SUPPORT_GDI_EXT 0x200F
+#define WGL_SUPPORT_OPENGL_EXT 0x2010
+#define WGL_DOUBLE_BUFFER_EXT 0x2011
+#define WGL_STEREO_EXT 0x2012
+#define WGL_PIXEL_TYPE_EXT 0x2013
+#define WGL_COLOR_BITS_EXT 0x2014
+#define WGL_RED_BITS_EXT 0x2015
+#define WGL_RED_SHIFT_EXT 0x2016
+#define WGL_GREEN_BITS_EXT 0x2017
+#define WGL_GREEN_SHIFT_EXT 0x2018
+#define WGL_BLUE_BITS_EXT 0x2019
+#define WGL_BLUE_SHIFT_EXT 0x201A
+#define WGL_ALPHA_BITS_EXT 0x201B
+#define WGL_ALPHA_SHIFT_EXT 0x201C
+#define WGL_ACCUM_BITS_EXT 0x201D
+#define WGL_ACCUM_RED_BITS_EXT 0x201E
+#define WGL_ACCUM_GREEN_BITS_EXT 0x201F
+#define WGL_ACCUM_BLUE_BITS_EXT 0x2020
+#define WGL_ACCUM_ALPHA_BITS_EXT 0x2021
+#define WGL_DEPTH_BITS_EXT 0x2022
+#define WGL_STENCIL_BITS_EXT 0x2023
+#define WGL_AUX_BUFFERS_EXT 0x2024
+#define WGL_NO_ACCELERATION_EXT 0x2025
+#define WGL_GENERIC_ACCELERATION_EXT 0x2026
+#define WGL_FULL_ACCELERATION_EXT 0x2027
+#define WGL_SWAP_EXCHANGE_EXT 0x2028
+#define WGL_SWAP_COPY_EXT 0x2029
+#define WGL_SWAP_UNDEFINED_EXT 0x202A
+#define WGL_TYPE_RGBA_EXT 0x202B
+#define WGL_TYPE_COLORINDEX_EXT 0x202C
+
+typedef BOOL (WINAPI *PFNWGLCHOOSEPIXELFORMATEXTPROC)(HDC hdc, const int *piAttribIList, const FLOAT *pfAttribFList, UINT nMaxFormats, int *piFormats, UINT *nNumFormats);
+typedef BOOL (WINAPI *PFNWGLGETPIXELFORMATATTRIBFVEXTPROC)(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, int *piAttributes, FLOAT *pfValues);
+typedef BOOL (WINAPI *PFNWGLGETPIXELFORMATATTRIBIVEXTPROC)(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, int *piAttributes, int *piValues);
+
+#define wglChoosePixelFormatEXT WGLEW_GET_FUN(__wglewChoosePixelFormatEXT)
+#define wglGetPixelFormatAttribfvEXT WGLEW_GET_FUN(__wglewGetPixelFormatAttribfvEXT)
+#define wglGetPixelFormatAttribivEXT WGLEW_GET_FUN(__wglewGetPixelFormatAttribivEXT)
+
+#define WGLEW_EXT_pixel_format WGLEW_GET_VAR(__WGLEW_EXT_pixel_format)
+
+#endif /* WGL_EXT_pixel_format */
+
+/* ------------------- WGL_EXT_pixel_format_packed_float ------------------- */
+
+#ifndef WGL_EXT_pixel_format_packed_float
+#define WGL_EXT_pixel_format_packed_float 1
+
+#define WGL_TYPE_RGBA_UNSIGNED_FLOAT_EXT 0x20A8
+
+#define WGLEW_EXT_pixel_format_packed_float WGLEW_GET_VAR(__WGLEW_EXT_pixel_format_packed_float)
+
+#endif /* WGL_EXT_pixel_format_packed_float */
+
+/* -------------------------- WGL_EXT_swap_control ------------------------- */
+
+#ifndef WGL_EXT_swap_control
+#define WGL_EXT_swap_control 1
+
+typedef int (WINAPI *PFNWGLGETSWAPINTERVALEXTPROC)(void);
+typedef BOOL (WINAPI *PFNWGLSWAPINTERVALEXTPROC)(int interval);
+
+#define wglGetSwapIntervalEXT WGLEW_GET_FUN(__wglewGetSwapIntervalEXT)
+#define wglSwapIntervalEXT WGLEW_GET_FUN(__wglewSwapIntervalEXT)
+
+#define WGLEW_EXT_swap_control WGLEW_GET_VAR(__WGLEW_EXT_swap_control)
+
+#endif /* WGL_EXT_swap_control */
+
+/* --------------------- WGL_I3D_digital_video_control --------------------- */
+
+#ifndef WGL_I3D_digital_video_control
+#define WGL_I3D_digital_video_control 1
+
+#define WGL_DIGITAL_VIDEO_CURSOR_ALPHA_FRAMEBUFFER_I3D 0x2050
+#define WGL_DIGITAL_VIDEO_CURSOR_ALPHA_VALUE_I3D 0x2051
+#define WGL_DIGITAL_VIDEO_CURSOR_INCLUDED_I3D 0x2052
+#define WGL_DIGITAL_VIDEO_GAMMA_CORRECTED_I3D 0x2053
+
+typedef BOOL (WINAPI *PFNWGLGETDIGITALVIDEOPARAMETERSI3DPROC)(HDC hDC, int iAttribute, int *piValue);
+typedef BOOL (WINAPI *PFNWGLSETDIGITALVIDEOPARAMETERSI3DPROC)(HDC hDC, int iAttribute, const int *piValue);
+
+#define wglGetDigitalVideoParametersI3D WGLEW_GET_FUN(__wglewGetDigitalVideoParametersI3D)
+#define wglSetDigitalVideoParametersI3D WGLEW_GET_FUN(__wglewSetDigitalVideoParametersI3D)
+
+#define WGLEW_I3D_digital_video_control WGLEW_GET_VAR(__WGLEW_I3D_digital_video_control)
+
+#endif /* WGL_I3D_digital_video_control */
+
+/* ----------------------------- WGL_I3D_gamma ----------------------------- */
+
+#ifndef WGL_I3D_gamma
+#define WGL_I3D_gamma 1
+
+#define WGL_GAMMA_TABLE_SIZE_I3D 0x204E
+#define WGL_GAMMA_EXCLUDE_DESKTOP_I3D 0x204F
+
+typedef BOOL (WINAPI *PFNWGLGETGAMMATABLEI3DPROC)(HDC hDC, int iEntries, USHORT *puRed, USHORT *puGreen, USHORT *puBlue);
+typedef BOOL (WINAPI *PFNWGLGETGAMMATABLEPARAMETERSI3DPROC)(HDC hDC, int iAttribute, int *piValue);
+typedef BOOL (WINAPI *PFNWGLSETGAMMATABLEI3DPROC)(HDC hDC, int iEntries, const USHORT *puRed, const USHORT *puGreen, const USHORT *puBlue);
+typedef BOOL (WINAPI *PFNWGLSETGAMMATABLEPARAMETERSI3DPROC)(HDC hDC, int iAttribute, const int *piValue);
+
+#define wglGetGammaTableI3D WGLEW_GET_FUN(__wglewGetGammaTableI3D)
+#define wglGetGammaTableParametersI3D WGLEW_GET_FUN(__wglewGetGammaTableParametersI3D)
+#define wglSetGammaTableI3D WGLEW_GET_FUN(__wglewSetGammaTableI3D)
+#define wglSetGammaTableParametersI3D WGLEW_GET_FUN(__wglewSetGammaTableParametersI3D)
+
+#define WGLEW_I3D_gamma WGLEW_GET_VAR(__WGLEW_I3D_gamma)
+
+#endif /* WGL_I3D_gamma */
+
+/* ---------------------------- WGL_I3D_genlock ---------------------------- */
+
+#ifndef WGL_I3D_genlock
+#define WGL_I3D_genlock 1
+
+#define WGL_GENLOCK_SOURCE_MULTIVIEW_I3D 0x2044
+#define WGL_GENLOCK_SOURCE_EXTERNAL_SYNC_I3D 0x2045
+#define WGL_GENLOCK_SOURCE_EXTERNAL_FIELD_I3D 0x2046
+#define WGL_GENLOCK_SOURCE_EXTERNAL_TTL_I3D 0x2047
+#define WGL_GENLOCK_SOURCE_DIGITAL_SYNC_I3D 0x2048
+#define WGL_GENLOCK_SOURCE_DIGITAL_FIELD_I3D 0x2049
+#define WGL_GENLOCK_SOURCE_EDGE_FALLING_I3D 0x204A
+#define WGL_GENLOCK_SOURCE_EDGE_RISING_I3D 0x204B
+#define WGL_GENLOCK_SOURCE_EDGE_BOTH_I3D 0x204C
+
+typedef BOOL (WINAPI *PFNWGLDISABLEGENLOCKI3DPROC)(HDC hDC);
+typedef BOOL (WINAPI *PFNWGLENABLEGENLOCKI3DPROC)(HDC hDC);
+typedef BOOL (WINAPI *PFNWGLGENLOCKSAMPLERATEI3DPROC)(HDC hDC, UINT uRate);
+typedef BOOL (WINAPI *PFNWGLGENLOCKSOURCEDELAYI3DPROC)(HDC hDC, UINT uDelay);
+typedef BOOL (WINAPI *PFNWGLGENLOCKSOURCEEDGEI3DPROC)(HDC hDC, UINT uEdge);
+typedef BOOL (WINAPI *PFNWGLGENLOCKSOURCEI3DPROC)(HDC hDC, UINT uSource);
+typedef BOOL (WINAPI *PFNWGLGETGENLOCKSAMPLERATEI3DPROC)(HDC hDC, UINT *uRate);
+typedef BOOL (WINAPI *PFNWGLGETGENLOCKSOURCEDELAYI3DPROC)(HDC hDC, UINT *uDelay);
+typedef BOOL (WINAPI *PFNWGLGETGENLOCKSOURCEEDGEI3DPROC)(HDC hDC, UINT *uEdge);
+typedef BOOL (WINAPI *PFNWGLGETGENLOCKSOURCEI3DPROC)(HDC hDC, UINT *uSource);
+typedef BOOL (WINAPI *PFNWGLISENABLEDGENLOCKI3DPROC)(HDC hDC, BOOL *pFlag);
+typedef BOOL (WINAPI *PFNWGLQUERYGENLOCKMAXSOURCEDELAYI3DPROC)(HDC hDC, UINT *uMaxLineDelay, UINT *uMaxPixelDelay);
+
+#define wglDisableGenlockI3D WGLEW_GET_FUN(__wglewDisableGenlockI3D)
+#define wglEnableGenlockI3D WGLEW_GET_FUN(__wglewEnableGenlockI3D)
+#define wglGenlockSampleRateI3D WGLEW_GET_FUN(__wglewGenlockSampleRateI3D)
+#define wglGenlockSourceDelayI3D WGLEW_GET_FUN(__wglewGenlockSourceDelayI3D)
+#define wglGenlockSourceEdgeI3D WGLEW_GET_FUN(__wglewGenlockSourceEdgeI3D)
+#define wglGenlockSourceI3D WGLEW_GET_FUN(__wglewGenlockSourceI3D)
+#define wglGetGenlockSampleRateI3D WGLEW_GET_FUN(__wglewGetGenlockSampleRateI3D)
+#define wglGetGenlockSourceDelayI3D WGLEW_GET_FUN(__wglewGetGenlockSourceDelayI3D)
+#define wglGetGenlockSourceEdgeI3D WGLEW_GET_FUN(__wglewGetGenlockSourceEdgeI3D)
+#define wglGetGenlockSourceI3D WGLEW_GET_FUN(__wglewGetGenlockSourceI3D)
+#define wglIsEnabledGenlockI3D WGLEW_GET_FUN(__wglewIsEnabledGenlockI3D)
+#define wglQueryGenlockMaxSourceDelayI3D WGLEW_GET_FUN(__wglewQueryGenlockMaxSourceDelayI3D)
+
+#define WGLEW_I3D_genlock WGLEW_GET_VAR(__WGLEW_I3D_genlock)
+
+#endif /* WGL_I3D_genlock */
+
+/* -------------------------- WGL_I3D_image_buffer ------------------------- */
+
+#ifndef WGL_I3D_image_buffer
+#define WGL_I3D_image_buffer 1
+
+#define WGL_IMAGE_BUFFER_MIN_ACCESS_I3D 0x00000001
+#define WGL_IMAGE_BUFFER_LOCK_I3D 0x00000002
+
+typedef BOOL (WINAPI *PFNWGLASSOCIATEIMAGEBUFFEREVENTSI3DPROC)(HDC hdc, HANDLE *pEvent, LPVOID *pAddress, DWORD *pSize, UINT count);
+typedef LPVOID (WINAPI *PFNWGLCREATEIMAGEBUFFERI3DPROC)(HDC hDC, DWORD dwSize, UINT uFlags);
+typedef BOOL (WINAPI *PFNWGLDESTROYIMAGEBUFFERI3DPROC)(HDC hDC, LPVOID pAddress);
+typedef BOOL (WINAPI *PFNWGLRELEASEIMAGEBUFFEREVENTSI3DPROC)(HDC hdc, LPVOID *pAddress, UINT count);
+
+#define wglAssociateImageBufferEventsI3D WGLEW_GET_FUN(__wglewAssociateImageBufferEventsI3D)
+#define wglCreateImageBufferI3D WGLEW_GET_FUN(__wglewCreateImageBufferI3D)
+#define wglDestroyImageBufferI3D WGLEW_GET_FUN(__wglewDestroyImageBufferI3D)
+#define wglReleaseImageBufferEventsI3D WGLEW_GET_FUN(__wglewReleaseImageBufferEventsI3D)
+
+#define WGLEW_I3D_image_buffer WGLEW_GET_VAR(__WGLEW_I3D_image_buffer)
+
+#endif /* WGL_I3D_image_buffer */
+
+/* ------------------------ WGL_I3D_swap_frame_lock ------------------------ */
+
+#ifndef WGL_I3D_swap_frame_lock
+#define WGL_I3D_swap_frame_lock 1
+
+typedef BOOL (WINAPI *PFNWGLDISABLEFRAMELOCKI3DPROC)(VOID);
+typedef BOOL (WINAPI *PFNWGLENABLEFRAMELOCKI3DPROC)(VOID);
+typedef BOOL (WINAPI *PFNWGLISENABLEDFRAMELOCKI3DPROC)(BOOL *pFlag);
+typedef BOOL (WINAPI *PFNWGLQUERYFRAMELOCKMASTERI3DPROC)(BOOL *pFlag);
+
+#define wglDisableFrameLockI3D WGLEW_GET_FUN(__wglewDisableFrameLockI3D)
+#define wglEnableFrameLockI3D WGLEW_GET_FUN(__wglewEnableFrameLockI3D)
+#define wglIsEnabledFrameLockI3D WGLEW_GET_FUN(__wglewIsEnabledFrameLockI3D)
+#define wglQueryFrameLockMasterI3D WGLEW_GET_FUN(__wglewQueryFrameLockMasterI3D)
+
+#define WGLEW_I3D_swap_frame_lock WGLEW_GET_VAR(__WGLEW_I3D_swap_frame_lock)
+
+#endif /* WGL_I3D_swap_frame_lock */
+
+/* ------------------------ WGL_I3D_swap_frame_usage ----------------------- */
+
+#ifndef WGL_I3D_swap_frame_usage
+#define WGL_I3D_swap_frame_usage 1
+
+typedef BOOL (WINAPI *PFNWGLBEGINFRAMETRACKINGI3DPROC)(void);
+typedef BOOL (WINAPI *PFNWGLENDFRAMETRACKINGI3DPROC)(void);
+typedef BOOL (WINAPI *PFNWGLGETFRAMEUSAGEI3DPROC)(float *pUsage);
+typedef BOOL (WINAPI *PFNWGLQUERYFRAMETRACKINGI3DPROC)(DWORD *pFrameCount, DWORD *pMissedFrames, float *pLastMissedUsage);
+
+#define wglBeginFrameTrackingI3D WGLEW_GET_FUN(__wglewBeginFrameTrackingI3D)
+#define wglEndFrameTrackingI3D WGLEW_GET_FUN(__wglewEndFrameTrackingI3D)
+#define wglGetFrameUsageI3D WGLEW_GET_FUN(__wglewGetFrameUsageI3D)
+#define wglQueryFrameTrackingI3D WGLEW_GET_FUN(__wglewQueryFrameTrackingI3D)
+
+#define WGLEW_I3D_swap_frame_usage WGLEW_GET_VAR(__WGLEW_I3D_swap_frame_usage)
+
+#endif /* WGL_I3D_swap_frame_usage */
+
+/* -------------------------- WGL_NV_float_buffer -------------------------- */
+
+#ifndef WGL_NV_float_buffer
+#define WGL_NV_float_buffer 1
+
+#define WGL_FLOAT_COMPONENTS_NV 0x20B0
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_FLOAT_R_NV 0x20B1
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_FLOAT_RG_NV 0x20B2
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_FLOAT_RGB_NV 0x20B3
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_FLOAT_RGBA_NV 0x20B4
+#define WGL_TEXTURE_FLOAT_R_NV 0x20B5
+#define WGL_TEXTURE_FLOAT_RG_NV 0x20B6
+#define WGL_TEXTURE_FLOAT_RGB_NV 0x20B7
+#define WGL_TEXTURE_FLOAT_RGBA_NV 0x20B8
+
+#define WGLEW_NV_float_buffer WGLEW_GET_VAR(__WGLEW_NV_float_buffer)
+
+#endif /* WGL_NV_float_buffer */
+
+/* ---------------------- WGL_NV_render_depth_texture ---------------------- */
+
+#ifndef WGL_NV_render_depth_texture
+#define WGL_NV_render_depth_texture 1
+
+#define WGL_NO_TEXTURE_ARB 0x2077
+#define WGL_BIND_TO_TEXTURE_DEPTH_NV 0x20A3
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_DEPTH_NV 0x20A4
+#define WGL_DEPTH_TEXTURE_FORMAT_NV 0x20A5
+#define WGL_TEXTURE_DEPTH_COMPONENT_NV 0x20A6
+#define WGL_DEPTH_COMPONENT_NV 0x20A7
+
+#define WGLEW_NV_render_depth_texture WGLEW_GET_VAR(__WGLEW_NV_render_depth_texture)
+
+#endif /* WGL_NV_render_depth_texture */
+
+/* -------------------- WGL_NV_render_texture_rectangle -------------------- */
+
+#ifndef WGL_NV_render_texture_rectangle
+#define WGL_NV_render_texture_rectangle 1
+
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_RGB_NV 0x20A0
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_RGBA_NV 0x20A1
+#define WGL_TEXTURE_RECTANGLE_NV 0x20A2
+
+#define WGLEW_NV_render_texture_rectangle WGLEW_GET_VAR(__WGLEW_NV_render_texture_rectangle)
+
+#endif /* WGL_NV_render_texture_rectangle */
+
+/* ----------------------- WGL_NV_vertex_array_range ----------------------- */
+
+#ifndef WGL_NV_vertex_array_range
+#define WGL_NV_vertex_array_range 1
+
+typedef void *(WINAPI *PFNWGLALLOCATEMEMORYNVPROC)(GLsizei size, GLfloat readFrequency, GLfloat writeFrequency, GLfloat priority);
+typedef void (WINAPI *PFNWGLFREEMEMORYNVPROC)(void *pointer);
+
+#define wglAllocateMemoryNV WGLEW_GET_FUN(__wglewAllocateMemoryNV)
+#define wglFreeMemoryNV WGLEW_GET_FUN(__wglewFreeMemoryNV)
+
+#define WGLEW_NV_vertex_array_range WGLEW_GET_VAR(__WGLEW_NV_vertex_array_range)
+
+#endif /* WGL_NV_vertex_array_range */
+
+/* -------------------------- WGL_OML_sync_control ------------------------- */
+
+#ifndef WGL_OML_sync_control
+#define WGL_OML_sync_control 1
+
+typedef BOOL (WINAPI *PFNWGLGETMSCRATEOMLPROC)(HDC hdc, INT32 *numerator, INT32 *denominator);
+typedef BOOL (WINAPI *PFNWGLGETSYNCVALUESOMLPROC)(HDC hdc, INT64 *ust, INT64 *msc, INT64 *sbc);
+typedef INT64(WINAPI *PFNWGLSWAPBUFFERSMSCOMLPROC)(HDC hdc, INT64 target_msc, INT64 divisor, INT64 remainder);
+typedef INT64(WINAPI *PFNWGLSWAPLAYERBUFFERSMSCOMLPROC)(HDC hdc, INT fuPlanes, INT64 target_msc, INT64 divisor, INT64 remainder);
+typedef BOOL (WINAPI *PFNWGLWAITFORMSCOMLPROC)(HDC hdc, INT64 target_msc, INT64 divisor, INT64 remainder, INT64 *ust, INT64 *msc, INT64 *sbc);
+typedef BOOL (WINAPI *PFNWGLWAITFORSBCOMLPROC)(HDC hdc, INT64 target_sbc, INT64 *ust, INT64 *msc, INT64 *sbc);
+
+#define wglGetMscRateOML WGLEW_GET_FUN(__wglewGetMscRateOML)
+#define wglGetSyncValuesOML WGLEW_GET_FUN(__wglewGetSyncValuesOML)
+#define wglSwapBuffersMscOML WGLEW_GET_FUN(__wglewSwapBuffersMscOML)
+#define wglSwapLayerBuffersMscOML WGLEW_GET_FUN(__wglewSwapLayerBuffersMscOML)
+#define wglWaitForMscOML WGLEW_GET_FUN(__wglewWaitForMscOML)
+#define wglWaitForSbcOML WGLEW_GET_FUN(__wglewWaitForSbcOML)
+
+#define WGLEW_OML_sync_control WGLEW_GET_VAR(__WGLEW_OML_sync_control)
+
+#endif /* WGL_OML_sync_control */
+
+/* ------------------------------------------------------------------------- */
+
+#ifdef GLEW_MX
+#define WGLEW_EXPORT
+#else
+#define WGLEW_EXPORT GLEWAPI
+#endif /* GLEW_MX */
+
+#ifdef GLEW_MX
+struct WGLEWContextStruct
+{
+#endif /* GLEW_MX */
+
+    WGLEW_EXPORT PFNWGLSETSTEREOEMITTERSTATE3DLPROC __wglewSetStereoEmitterState3DL;
+
+    WGLEW_EXPORT PFNWGLCREATEBUFFERREGIONARBPROC __wglewCreateBufferRegionARB;
+    WGLEW_EXPORT PFNWGLDELETEBUFFERREGIONARBPROC __wglewDeleteBufferRegionARB;
+    WGLEW_EXPORT PFNWGLRESTOREBUFFERREGIONARBPROC __wglewRestoreBufferRegionARB;
+    WGLEW_EXPORT PFNWGLSAVEBUFFERREGIONARBPROC __wglewSaveBufferRegionARB;
+
+    WGLEW_EXPORT PFNWGLGETEXTENSIONSSTRINGARBPROC __wglewGetExtensionsStringARB;
+
+    WGLEW_EXPORT PFNWGLGETCURRENTREADDCARBPROC __wglewGetCurrentReadDCARB;
+    WGLEW_EXPORT PFNWGLMAKECONTEXTCURRENTARBPROC __wglewMakeContextCurrentARB;
+
+    WGLEW_EXPORT PFNWGLCREATEPBUFFERARBPROC __wglewCreatePbufferARB;
+    WGLEW_EXPORT PFNWGLDESTROYPBUFFERARBPROC __wglewDestroyPbufferARB;
+    WGLEW_EXPORT PFNWGLGETPBUFFERDCARBPROC __wglewGetPbufferDCARB;
+    WGLEW_EXPORT PFNWGLQUERYPBUFFERARBPROC __wglewQueryPbufferARB;
+    WGLEW_EXPORT PFNWGLRELEASEPBUFFERDCARBPROC __wglewReleasePbufferDCARB;
+
+    WGLEW_EXPORT PFNWGLCHOOSEPIXELFORMATARBPROC __wglewChoosePixelFormatARB;
+    WGLEW_EXPORT PFNWGLGETPIXELFORMATATTRIBFVARBPROC __wglewGetPixelFormatAttribfvARB;
+    WGLEW_EXPORT PFNWGLGETPIXELFORMATATTRIBIVARBPROC __wglewGetPixelFormatAttribivARB;
+
+    WGLEW_EXPORT PFNWGLBINDTEXIMAGEARBPROC __wglewBindTexImageARB;
+    WGLEW_EXPORT PFNWGLRELEASETEXIMAGEARBPROC __wglewReleaseTexImageARB;
+    WGLEW_EXPORT PFNWGLSETPBUFFERATTRIBARBPROC __wglewSetPbufferAttribARB;
+
+    WGLEW_EXPORT PFNWGLBINDDISPLAYCOLORTABLEEXTPROC __wglewBindDisplayColorTableEXT;
+    WGLEW_EXPORT PFNWGLCREATEDISPLAYCOLORTABLEEXTPROC __wglewCreateDisplayColorTableEXT;
+    WGLEW_EXPORT PFNWGLDESTROYDISPLAYCOLORTABLEEXTPROC __wglewDestroyDisplayColorTableEXT;
+    WGLEW_EXPORT PFNWGLLOADDISPLAYCOLORTABLEEXTPROC __wglewLoadDisplayColorTableEXT;
+
+    WGLEW_EXPORT PFNWGLGETEXTENSIONSSTRINGEXTPROC __wglewGetExtensionsStringEXT;
+
+    WGLEW_EXPORT PFNWGLGETCURRENTREADDCEXTPROC __wglewGetCurrentReadDCEXT;
+    WGLEW_EXPORT PFNWGLMAKECONTEXTCURRENTEXTPROC __wglewMakeContextCurrentEXT;
+
+    WGLEW_EXPORT PFNWGLCREATEPBUFFEREXTPROC __wglewCreatePbufferEXT;
+    WGLEW_EXPORT PFNWGLDESTROYPBUFFEREXTPROC __wglewDestroyPbufferEXT;
+    WGLEW_EXPORT PFNWGLGETPBUFFERDCEXTPROC __wglewGetPbufferDCEXT;
+    WGLEW_EXPORT PFNWGLQUERYPBUFFEREXTPROC __wglewQueryPbufferEXT;
+    WGLEW_EXPORT PFNWGLRELEASEPBUFFERDCEXTPROC __wglewReleasePbufferDCEXT;
+
+    WGLEW_EXPORT PFNWGLCHOOSEPIXELFORMATEXTPROC __wglewChoosePixelFormatEXT;
+    WGLEW_EXPORT PFNWGLGETPIXELFORMATATTRIBFVEXTPROC __wglewGetPixelFormatAttribfvEXT;
+    WGLEW_EXPORT PFNWGLGETPIXELFORMATATTRIBIVEXTPROC __wglewGetPixelFormatAttribivEXT;
+
+    WGLEW_EXPORT PFNWGLGETSWAPINTERVALEXTPROC __wglewGetSwapIntervalEXT;
+    WGLEW_EXPORT PFNWGLSWAPINTERVALEXTPROC __wglewSwapIntervalEXT;
+
+    WGLEW_EXPORT PFNWGLGETDIGITALVIDEOPARAMETERSI3DPROC __wglewGetDigitalVideoParametersI3D;
+    WGLEW_EXPORT PFNWGLSETDIGITALVIDEOPARAMETERSI3DPROC __wglewSetDigitalVideoParametersI3D;
+
+    WGLEW_EXPORT PFNWGLGETGAMMATABLEI3DPROC __wglewGetGammaTableI3D;
+    WGLEW_EXPORT PFNWGLGETGAMMATABLEPARAMETERSI3DPROC __wglewGetGammaTableParametersI3D;
+    WGLEW_EXPORT PFNWGLSETGAMMATABLEI3DPROC __wglewSetGammaTableI3D;
+    WGLEW_EXPORT PFNWGLSETGAMMATABLEPARAMETERSI3DPROC __wglewSetGammaTableParametersI3D;
+
+    WGLEW_EXPORT PFNWGLDISABLEGENLOCKI3DPROC __wglewDisableGenlockI3D;
+    WGLEW_EXPORT PFNWGLENABLEGENLOCKI3DPROC __wglewEnableGenlockI3D;
+    WGLEW_EXPORT PFNWGLGENLOCKSAMPLERATEI3DPROC __wglewGenlockSampleRateI3D;
+    WGLEW_EXPORT PFNWGLGENLOCKSOURCEDELAYI3DPROC __wglewGenlockSourceDelayI3D;
+    WGLEW_EXPORT PFNWGLGENLOCKSOURCEEDGEI3DPROC __wglewGenlockSourceEdgeI3D;
+    WGLEW_EXPORT PFNWGLGENLOCKSOURCEI3DPROC __wglewGenlockSourceI3D;
+    WGLEW_EXPORT PFNWGLGETGENLOCKSAMPLERATEI3DPROC __wglewGetGenlockSampleRateI3D;
+    WGLEW_EXPORT PFNWGLGETGENLOCKSOURCEDELAYI3DPROC __wglewGetGenlockSourceDelayI3D;
+    WGLEW_EXPORT PFNWGLGETGENLOCKSOURCEEDGEI3DPROC __wglewGetGenlockSourceEdgeI3D;
+    WGLEW_EXPORT PFNWGLGETGENLOCKSOURCEI3DPROC __wglewGetGenlockSourceI3D;
+    WGLEW_EXPORT PFNWGLISENABLEDGENLOCKI3DPROC __wglewIsEnabledGenlockI3D;
+    WGLEW_EXPORT PFNWGLQUERYGENLOCKMAXSOURCEDELAYI3DPROC __wglewQueryGenlockMaxSourceDelayI3D;
+
+    WGLEW_EXPORT PFNWGLASSOCIATEIMAGEBUFFEREVENTSI3DPROC __wglewAssociateImageBufferEventsI3D;
+    WGLEW_EXPORT PFNWGLCREATEIMAGEBUFFERI3DPROC __wglewCreateImageBufferI3D;
+    WGLEW_EXPORT PFNWGLDESTROYIMAGEBUFFERI3DPROC __wglewDestroyImageBufferI3D;
+    WGLEW_EXPORT PFNWGLRELEASEIMAGEBUFFEREVENTSI3DPROC __wglewReleaseImageBufferEventsI3D;
+
+    WGLEW_EXPORT PFNWGLDISABLEFRAMELOCKI3DPROC __wglewDisableFrameLockI3D;
+    WGLEW_EXPORT PFNWGLENABLEFRAMELOCKI3DPROC __wglewEnableFrameLockI3D;
+    WGLEW_EXPORT PFNWGLISENABLEDFRAMELOCKI3DPROC __wglewIsEnabledFrameLockI3D;
+    WGLEW_EXPORT PFNWGLQUERYFRAMELOCKMASTERI3DPROC __wglewQueryFrameLockMasterI3D;
+
+    WGLEW_EXPORT PFNWGLBEGINFRAMETRACKINGI3DPROC __wglewBeginFrameTrackingI3D;
+    WGLEW_EXPORT PFNWGLENDFRAMETRACKINGI3DPROC __wglewEndFrameTrackingI3D;
+    WGLEW_EXPORT PFNWGLGETFRAMEUSAGEI3DPROC __wglewGetFrameUsageI3D;
+    WGLEW_EXPORT PFNWGLQUERYFRAMETRACKINGI3DPROC __wglewQueryFrameTrackingI3D;
+
+    WGLEW_EXPORT PFNWGLALLOCATEMEMORYNVPROC __wglewAllocateMemoryNV;
+    WGLEW_EXPORT PFNWGLFREEMEMORYNVPROC __wglewFreeMemoryNV;
+
+    WGLEW_EXPORT PFNWGLGETMSCRATEOMLPROC __wglewGetMscRateOML;
+    WGLEW_EXPORT PFNWGLGETSYNCVALUESOMLPROC __wglewGetSyncValuesOML;
+    WGLEW_EXPORT PFNWGLSWAPBUFFERSMSCOMLPROC __wglewSwapBuffersMscOML;
+    WGLEW_EXPORT PFNWGLSWAPLAYERBUFFERSMSCOMLPROC __wglewSwapLayerBuffersMscOML;
+    WGLEW_EXPORT PFNWGLWAITFORMSCOMLPROC __wglewWaitForMscOML;
+    WGLEW_EXPORT PFNWGLWAITFORSBCOMLPROC __wglewWaitForSbcOML;
+    WGLEW_EXPORT GLboolean __WGLEW_3DFX_multisample;
+    WGLEW_EXPORT GLboolean __WGLEW_3DL_stereo_control;
+    WGLEW_EXPORT GLboolean __WGLEW_ARB_buffer_region;
+    WGLEW_EXPORT GLboolean __WGLEW_ARB_extensions_string;
+    WGLEW_EXPORT GLboolean __WGLEW_ARB_make_current_read;
+    WGLEW_EXPORT GLboolean __WGLEW_ARB_multisample;
+    WGLEW_EXPORT GLboolean __WGLEW_ARB_pbuffer;
+    WGLEW_EXPORT GLboolean __WGLEW_ARB_pixel_format;
+    WGLEW_EXPORT GLboolean __WGLEW_ARB_pixel_format_float;
+    WGLEW_EXPORT GLboolean __WGLEW_ARB_render_texture;
+    WGLEW_EXPORT GLboolean __WGLEW_ATI_pixel_format_float;
+    WGLEW_EXPORT GLboolean __WGLEW_ATI_render_texture_rectangle;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_depth_float;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_display_color_table;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_extensions_string;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_framebuffer_sRGB;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_make_current_read;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_multisample;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_pbuffer;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_pixel_format;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_pixel_format_packed_float;
+    WGLEW_EXPORT GLboolean __WGLEW_EXT_swap_control;
+    WGLEW_EXPORT GLboolean __WGLEW_I3D_digital_video_control;
+    WGLEW_EXPORT GLboolean __WGLEW_I3D_gamma;
+    WGLEW_EXPORT GLboolean __WGLEW_I3D_genlock;
+    WGLEW_EXPORT GLboolean __WGLEW_I3D_image_buffer;
+    WGLEW_EXPORT GLboolean __WGLEW_I3D_swap_frame_lock;
+    WGLEW_EXPORT GLboolean __WGLEW_I3D_swap_frame_usage;
+    WGLEW_EXPORT GLboolean __WGLEW_NV_float_buffer;
+    WGLEW_EXPORT GLboolean __WGLEW_NV_render_depth_texture;
+    WGLEW_EXPORT GLboolean __WGLEW_NV_render_texture_rectangle;
+    WGLEW_EXPORT GLboolean __WGLEW_NV_vertex_array_range;
+    WGLEW_EXPORT GLboolean __WGLEW_OML_sync_control;
+
+#ifdef GLEW_MX
+}; /* WGLEWContextStruct */
+#endif /* GLEW_MX */
+
+/* ------------------------------------------------------------------------- */
+
+#ifdef GLEW_MX
+
+typedef struct WGLEWContextStruct WGLEWContext;
+GLEWAPI GLenum wglewContextInit(WGLEWContext *ctx);
+GLEWAPI GLboolean wglewContextIsSupported(WGLEWContext *ctx, const char *name);
+
+#define wglewInit() wglewContextInit(wglewGetContext())
+#define wglewIsSupported(x) wglewContextIsSupported(wglewGetContext(), x)
+
+#define WGLEW_GET_VAR(x) wglewGetContext()->x
+#define WGLEW_GET_FUN(x) wglewGetContext()->x
+
+#else /* GLEW_MX */
+
+#define WGLEW_GET_VAR(x) x
+#define WGLEW_GET_FUN(x) x
+
+GLEWAPI GLboolean wglewIsSupported(const char *name);
+
+#endif /* GLEW_MX */
+
+GLEWAPI GLboolean wglewGetExtension(const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef GLEWAPI
+
+#endif /* __wglew_h__ */
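Once a WGL rendering context is current and glewInit() has populated the function-pointer table declared above, the wgl* macros resolve through WGLEW_GET_FUN and the extension flags through WGLEW_GET_VAR. A minimal sketch of the non-MX usage (function name and error handling are illustrative only):

    #include <GL/glew.h>
    #include <GL/wglew.h>

    /* Enable vsync when the driver exposes WGL_EXT_swap_control. */
    void enableVsyncIfAvailable(void)
    {
        if (glewInit() != GLEW_OK)
            return;                      /* loader failed to initialize */
        if (WGLEW_EXT_swap_control)      /* flag set by glewInit() */
            wglSwapIntervalEXT(1);       /* one swap per vertical retrace */
    }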

+ 696 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/GL/wglext.h

@@ -0,0 +1,696 @@
+#ifndef __wglext_h_
+#define __wglext_h_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** License Applicability. Except to the extent portions of this file are
+** made subject to an alternative license as permitted in the SGI Free
+** Software License B, Version 1.1 (the "License"), the contents of this
+** file are subject only to the provisions of the License. You may not use
+** this file except in compliance with the License. You may obtain a copy
+** of the License at Silicon Graphics, Inc., attn: Legal Services, 1600
+** Amphitheatre Parkway, Mountain View, CA 94043-1351, or at:
+**
+** http://oss.sgi.com/projects/FreeB
+**
+** Note that, as provided in the License, the Software is distributed on an
+** "AS IS" basis, with ALL EXPRESS AND IMPLIED WARRANTIES AND CONDITIONS
+** DISCLAIMED, INCLUDING, WITHOUT LIMITATION, ANY IMPLIED WARRANTIES AND
+** CONDITIONS OF MERCHANTABILITY, SATISFACTORY QUALITY, FITNESS FOR A
+** PARTICULAR PURPOSE, AND NON-INFRINGEMENT.
+**
+** Original Code. The Original Code is: OpenGL Sample Implementation,
+** Version 1.2.1, released January 26, 2000, developed by Silicon Graphics,
+** Inc. The Original Code is Copyright (c) 1991-2004 Silicon Graphics, Inc.
+** Copyright in any portions created by third parties is as indicated
+** elsewhere herein. All Rights Reserved.
+**
+** Additional Notice Provisions: This software was created using the
+** OpenGL(R) version 1.2.1 Sample Implementation published by SGI, but has
+** not been independently verified as being compliant with the OpenGL(R)
+** version 1.2.1 Specification.
+*/
+
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(APIENTRY) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__)
+#define WIN32_LEAN_AND_MEAN 1
+#include <windows.h>
+#endif
+
+#ifndef APIENTRY
+#define APIENTRY
+#endif
+#ifndef APIENTRYP
+#define APIENTRYP APIENTRY *
+#endif
+#ifndef GLAPI
+#define GLAPI extern
+#endif
+
+
+/*************************************************************/
+
+/* Header file version number */
+/* wglext.h last updated 2005/01/07 */
+/* Current version at http://oss.sgi.com/projects/ogl-sample/registry/ */
+#define WGL_WGLEXT_VERSION 6
+
+#ifndef WGL_ARB_buffer_region
+#define WGL_FRONT_COLOR_BUFFER_BIT_ARB 0x00000001
+#define WGL_BACK_COLOR_BUFFER_BIT_ARB  0x00000002
+#define WGL_DEPTH_BUFFER_BIT_ARB       0x00000004
+#define WGL_STENCIL_BUFFER_BIT_ARB     0x00000008
+#endif
+
+#ifndef WGL_ARB_multisample
+#define WGL_SAMPLE_BUFFERS_ARB         0x2041
+#define WGL_SAMPLES_ARB                0x2042
+#endif
+
+#ifndef WGL_ARB_extensions_string
+#endif
+
+#ifndef WGL_ARB_pixel_format
+#define WGL_NUMBER_PIXEL_FORMATS_ARB   0x2000
+#define WGL_DRAW_TO_WINDOW_ARB         0x2001
+#define WGL_DRAW_TO_BITMAP_ARB         0x2002
+#define WGL_ACCELERATION_ARB           0x2003
+#define WGL_NEED_PALETTE_ARB           0x2004
+#define WGL_NEED_SYSTEM_PALETTE_ARB    0x2005
+#define WGL_SWAP_LAYER_BUFFERS_ARB     0x2006
+#define WGL_SWAP_METHOD_ARB            0x2007
+#define WGL_NUMBER_OVERLAYS_ARB        0x2008
+#define WGL_NUMBER_UNDERLAYS_ARB       0x2009
+#define WGL_TRANSPARENT_ARB            0x200A
+#define WGL_TRANSPARENT_RED_VALUE_ARB  0x2037
+#define WGL_TRANSPARENT_GREEN_VALUE_ARB 0x2038
+#define WGL_TRANSPARENT_BLUE_VALUE_ARB 0x2039
+#define WGL_TRANSPARENT_ALPHA_VALUE_ARB 0x203A
+#define WGL_TRANSPARENT_INDEX_VALUE_ARB 0x203B
+#define WGL_SHARE_DEPTH_ARB            0x200C
+#define WGL_SHARE_STENCIL_ARB          0x200D
+#define WGL_SHARE_ACCUM_ARB            0x200E
+#define WGL_SUPPORT_GDI_ARB            0x200F
+#define WGL_SUPPORT_OPENGL_ARB         0x2010
+#define WGL_DOUBLE_BUFFER_ARB          0x2011
+#define WGL_STEREO_ARB                 0x2012
+#define WGL_PIXEL_TYPE_ARB             0x2013
+#define WGL_COLOR_BITS_ARB             0x2014
+#define WGL_RED_BITS_ARB               0x2015
+#define WGL_RED_SHIFT_ARB              0x2016
+#define WGL_GREEN_BITS_ARB             0x2017
+#define WGL_GREEN_SHIFT_ARB            0x2018
+#define WGL_BLUE_BITS_ARB              0x2019
+#define WGL_BLUE_SHIFT_ARB             0x201A
+#define WGL_ALPHA_BITS_ARB             0x201B
+#define WGL_ALPHA_SHIFT_ARB            0x201C
+#define WGL_ACCUM_BITS_ARB             0x201D
+#define WGL_ACCUM_RED_BITS_ARB         0x201E
+#define WGL_ACCUM_GREEN_BITS_ARB       0x201F
+#define WGL_ACCUM_BLUE_BITS_ARB        0x2020
+#define WGL_ACCUM_ALPHA_BITS_ARB       0x2021
+#define WGL_DEPTH_BITS_ARB             0x2022
+#define WGL_STENCIL_BITS_ARB           0x2023
+#define WGL_AUX_BUFFERS_ARB            0x2024
+#define WGL_NO_ACCELERATION_ARB        0x2025
+#define WGL_GENERIC_ACCELERATION_ARB   0x2026
+#define WGL_FULL_ACCELERATION_ARB      0x2027
+#define WGL_SWAP_EXCHANGE_ARB          0x2028
+#define WGL_SWAP_COPY_ARB              0x2029
+#define WGL_SWAP_UNDEFINED_ARB         0x202A
+#define WGL_TYPE_RGBA_ARB              0x202B
+#define WGL_TYPE_COLORINDEX_ARB        0x202C
+#endif
+
+#ifndef WGL_ARB_make_current_read
+#define ERROR_INVALID_PIXEL_TYPE_ARB   0x2043
+#define ERROR_INCOMPATIBLE_DEVICE_CONTEXTS_ARB 0x2054
+#endif
+
+#ifndef WGL_ARB_pbuffer
+#define WGL_DRAW_TO_PBUFFER_ARB        0x202D
+#define WGL_MAX_PBUFFER_PIXELS_ARB     0x202E
+#define WGL_MAX_PBUFFER_WIDTH_ARB      0x202F
+#define WGL_MAX_PBUFFER_HEIGHT_ARB     0x2030
+#define WGL_PBUFFER_LARGEST_ARB        0x2033
+#define WGL_PBUFFER_WIDTH_ARB          0x2034
+#define WGL_PBUFFER_HEIGHT_ARB         0x2035
+#define WGL_PBUFFER_LOST_ARB           0x2036
+#endif
+
+#ifndef WGL_ARB_render_texture
+#define WGL_BIND_TO_TEXTURE_RGB_ARB    0x2070
+#define WGL_BIND_TO_TEXTURE_RGBA_ARB   0x2071
+#define WGL_TEXTURE_FORMAT_ARB         0x2072
+#define WGL_TEXTURE_TARGET_ARB         0x2073
+#define WGL_MIPMAP_TEXTURE_ARB         0x2074
+#define WGL_TEXTURE_RGB_ARB            0x2075
+#define WGL_TEXTURE_RGBA_ARB           0x2076
+#define WGL_NO_TEXTURE_ARB             0x2077
+#define WGL_TEXTURE_CUBE_MAP_ARB       0x2078
+#define WGL_TEXTURE_1D_ARB             0x2079
+#define WGL_TEXTURE_2D_ARB             0x207A
+#define WGL_MIPMAP_LEVEL_ARB           0x207B
+#define WGL_CUBE_MAP_FACE_ARB          0x207C
+#define WGL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB 0x207D
+#define WGL_TEXTURE_CUBE_MAP_NEGATIVE_X_ARB 0x207E
+#define WGL_TEXTURE_CUBE_MAP_POSITIVE_Y_ARB 0x207F
+#define WGL_TEXTURE_CUBE_MAP_NEGATIVE_Y_ARB 0x2080
+#define WGL_TEXTURE_CUBE_MAP_POSITIVE_Z_ARB 0x2081
+#define WGL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB 0x2082
+#define WGL_FRONT_LEFT_ARB             0x2083
+#define WGL_FRONT_RIGHT_ARB            0x2084
+#define WGL_BACK_LEFT_ARB              0x2085
+#define WGL_BACK_RIGHT_ARB             0x2086
+#define WGL_AUX0_ARB                   0x2087
+#define WGL_AUX1_ARB                   0x2088
+#define WGL_AUX2_ARB                   0x2089
+#define WGL_AUX3_ARB                   0x208A
+#define WGL_AUX4_ARB                   0x208B
+#define WGL_AUX5_ARB                   0x208C
+#define WGL_AUX6_ARB                   0x208D
+#define WGL_AUX7_ARB                   0x208E
+#define WGL_AUX8_ARB                   0x208F
+#define WGL_AUX9_ARB                   0x2090
+#endif
+
+#ifndef WGL_ARB_pixel_format_float
+#define WGL_TYPE_RGBA_FLOAT_ARB        0x21A0
+#endif
+
+#ifndef WGL_EXT_make_current_read
+#define ERROR_INVALID_PIXEL_TYPE_EXT   0x2043
+#endif
+
+#ifndef WGL_EXT_pixel_format
+#define WGL_NUMBER_PIXEL_FORMATS_EXT   0x2000
+#define WGL_DRAW_TO_WINDOW_EXT         0x2001
+#define WGL_DRAW_TO_BITMAP_EXT         0x2002
+#define WGL_ACCELERATION_EXT           0x2003
+#define WGL_NEED_PALETTE_EXT           0x2004
+#define WGL_NEED_SYSTEM_PALETTE_EXT    0x2005
+#define WGL_SWAP_LAYER_BUFFERS_EXT     0x2006
+#define WGL_SWAP_METHOD_EXT            0x2007
+#define WGL_NUMBER_OVERLAYS_EXT        0x2008
+#define WGL_NUMBER_UNDERLAYS_EXT       0x2009
+#define WGL_TRANSPARENT_EXT            0x200A
+#define WGL_TRANSPARENT_VALUE_EXT      0x200B
+#define WGL_SHARE_DEPTH_EXT            0x200C
+#define WGL_SHARE_STENCIL_EXT          0x200D
+#define WGL_SHARE_ACCUM_EXT            0x200E
+#define WGL_SUPPORT_GDI_EXT            0x200F
+#define WGL_SUPPORT_OPENGL_EXT         0x2010
+#define WGL_DOUBLE_BUFFER_EXT          0x2011
+#define WGL_STEREO_EXT                 0x2012
+#define WGL_PIXEL_TYPE_EXT             0x2013
+#define WGL_COLOR_BITS_EXT             0x2014
+#define WGL_RED_BITS_EXT               0x2015
+#define WGL_RED_SHIFT_EXT              0x2016
+#define WGL_GREEN_BITS_EXT             0x2017
+#define WGL_GREEN_SHIFT_EXT            0x2018
+#define WGL_BLUE_BITS_EXT              0x2019
+#define WGL_BLUE_SHIFT_EXT             0x201A
+#define WGL_ALPHA_BITS_EXT             0x201B
+#define WGL_ALPHA_SHIFT_EXT            0x201C
+#define WGL_ACCUM_BITS_EXT             0x201D
+#define WGL_ACCUM_RED_BITS_EXT         0x201E
+#define WGL_ACCUM_GREEN_BITS_EXT       0x201F
+#define WGL_ACCUM_BLUE_BITS_EXT        0x2020
+#define WGL_ACCUM_ALPHA_BITS_EXT       0x2021
+#define WGL_DEPTH_BITS_EXT             0x2022
+#define WGL_STENCIL_BITS_EXT           0x2023
+#define WGL_AUX_BUFFERS_EXT            0x2024
+#define WGL_NO_ACCELERATION_EXT        0x2025
+#define WGL_GENERIC_ACCELERATION_EXT   0x2026
+#define WGL_FULL_ACCELERATION_EXT      0x2027
+#define WGL_SWAP_EXCHANGE_EXT          0x2028
+#define WGL_SWAP_COPY_EXT              0x2029
+#define WGL_SWAP_UNDEFINED_EXT         0x202A
+#define WGL_TYPE_RGBA_EXT              0x202B
+#define WGL_TYPE_COLORINDEX_EXT        0x202C
+#endif
+
+#ifndef WGL_EXT_pbuffer
+#define WGL_DRAW_TO_PBUFFER_EXT        0x202D
+#define WGL_MAX_PBUFFER_PIXELS_EXT     0x202E
+#define WGL_MAX_PBUFFER_WIDTH_EXT      0x202F
+#define WGL_MAX_PBUFFER_HEIGHT_EXT     0x2030
+#define WGL_OPTIMAL_PBUFFER_WIDTH_EXT  0x2031
+#define WGL_OPTIMAL_PBUFFER_HEIGHT_EXT 0x2032
+#define WGL_PBUFFER_LARGEST_EXT        0x2033
+#define WGL_PBUFFER_WIDTH_EXT          0x2034
+#define WGL_PBUFFER_HEIGHT_EXT         0x2035
+#endif
+
+#ifndef WGL_EXT_depth_float
+#define WGL_DEPTH_FLOAT_EXT            0x2040
+#endif
+
+#ifndef WGL_3DFX_multisample
+#define WGL_SAMPLE_BUFFERS_3DFX        0x2060
+#define WGL_SAMPLES_3DFX               0x2061
+#endif
+
+#ifndef WGL_EXT_multisample
+#define WGL_SAMPLE_BUFFERS_EXT         0x2041
+#define WGL_SAMPLES_EXT                0x2042
+#endif
+
+#ifndef WGL_I3D_digital_video_control
+#define WGL_DIGITAL_VIDEO_CURSOR_ALPHA_FRAMEBUFFER_I3D 0x2050
+#define WGL_DIGITAL_VIDEO_CURSOR_ALPHA_VALUE_I3D 0x2051
+#define WGL_DIGITAL_VIDEO_CURSOR_INCLUDED_I3D 0x2052
+#define WGL_DIGITAL_VIDEO_GAMMA_CORRECTED_I3D 0x2053
+#endif
+
+#ifndef WGL_I3D_gamma
+#define WGL_GAMMA_TABLE_SIZE_I3D       0x204E
+#define WGL_GAMMA_EXCLUDE_DESKTOP_I3D  0x204F
+#endif
+
+#ifndef WGL_I3D_genlock
+#define WGL_GENLOCK_SOURCE_MULTIVIEW_I3D 0x2044
+#define WGL_GENLOCK_SOURCE_EXTERNAL_SYNC_I3D 0x2045
+#define WGL_GENLOCK_SOURCE_EXTERNAL_FIELD_I3D 0x2046
+#define WGL_GENLOCK_SOURCE_EXTERNAL_TTL_I3D 0x2047
+#define WGL_GENLOCK_SOURCE_DIGITAL_SYNC_I3D 0x2048
+#define WGL_GENLOCK_SOURCE_DIGITAL_FIELD_I3D 0x2049
+#define WGL_GENLOCK_SOURCE_EDGE_FALLING_I3D 0x204A
+#define WGL_GENLOCK_SOURCE_EDGE_RISING_I3D 0x204B
+#define WGL_GENLOCK_SOURCE_EDGE_BOTH_I3D 0x204C
+#endif
+
+#ifndef WGL_I3D_image_buffer
+#define WGL_IMAGE_BUFFER_MIN_ACCESS_I3D 0x00000001
+#define WGL_IMAGE_BUFFER_LOCK_I3D      0x00000002
+#endif
+
+#ifndef WGL_I3D_swap_frame_lock
+#endif
+
+#ifndef WGL_NV_render_depth_texture
+#define WGL_BIND_TO_TEXTURE_DEPTH_NV   0x20A3
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_DEPTH_NV 0x20A4
+#define WGL_DEPTH_TEXTURE_FORMAT_NV    0x20A5
+#define WGL_TEXTURE_DEPTH_COMPONENT_NV 0x20A6
+#define WGL_DEPTH_COMPONENT_NV         0x20A7
+#endif
+
+#ifndef WGL_NV_render_texture_rectangle
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_RGB_NV 0x20A0
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_RGBA_NV 0x20A1
+#define WGL_TEXTURE_RECTANGLE_NV       0x20A2
+#endif
+
+#ifndef WGL_ATI_pixel_format_float
+#define WGL_TYPE_RGBA_FLOAT_ATI        0x21A0
+#define WGL_RGBA_FLOAT_MODE_ATI        0x8820
+#define WGL_COLOR_CLEAR_UNCLAMPED_VALUE_ATI 0x8835
+#endif
+
+#ifndef WGL_NV_float_buffer
+#define WGL_FLOAT_COMPONENTS_NV        0x20B0
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_FLOAT_R_NV 0x20B1
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_FLOAT_RG_NV 0x20B2
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_FLOAT_RGB_NV 0x20B3
+#define WGL_BIND_TO_TEXTURE_RECTANGLE_FLOAT_RGBA_NV 0x20B4
+#define WGL_TEXTURE_FLOAT_R_NV         0x20B5
+#define WGL_TEXTURE_FLOAT_RG_NV        0x20B6
+#define WGL_TEXTURE_FLOAT_RGB_NV       0x20B7
+#define WGL_TEXTURE_FLOAT_RGBA_NV      0x20B8
+#endif
+
+#ifndef WGL_NV_swap_group
+#endif
+
+#ifndef WGL_NV_gpu_affinity
+#define WGL_ERROR_INCOMPATIBLE_AFFINITY_MASKS_NV 0x20D0
+#define WGL_ERROR_MISSING_AFFINITY_MASK_NV 0x20D1
+#endif
+
+
+/*************************************************************/
+
+#ifndef WGL_ARB_pbuffer
+DECLARE_HANDLE(HPBUFFERARB);
+#endif
+#ifndef WGL_EXT_pbuffer
+DECLARE_HANDLE(HPBUFFEREXT);
+#endif
+
+#ifndef WGL_NV_gpu_affinity
+DECLARE_HANDLE(HGPUNV);
+typedef struct _GPU_DEVICE
+{
+    DWORD  cb;
+    CHAR   DeviceName[32];
+    CHAR   DeviceString[128];
+    DWORD  Flags;
+    RECT   rcVirtualScreen;
+} GPU_DEVICE, *PGPU_DEVICE;
+#endif
+
+#ifndef WGL_ARB_buffer_region
+#define WGL_ARB_buffer_region 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern HANDLE WINAPI wglCreateBufferRegionARB(HDC, int, UINT);
+extern VOID WINAPI wglDeleteBufferRegionARB(HANDLE);
+extern BOOL WINAPI wglSaveBufferRegionARB(HANDLE, int, int, int, int);
+extern BOOL WINAPI wglRestoreBufferRegionARB(HANDLE, int, int, int, int, int, int);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef HANDLE(WINAPI *PFNWGLCREATEBUFFERREGIONARBPROC)(HDC hDC, int iLayerPlane, UINT uType);
+typedef VOID (WINAPI *PFNWGLDELETEBUFFERREGIONARBPROC)(HANDLE hRegion);
+typedef BOOL (WINAPI *PFNWGLSAVEBUFFERREGIONARBPROC)(HANDLE hRegion, int x, int y, int width, int height);
+typedef BOOL (WINAPI *PFNWGLRESTOREBUFFERREGIONARBPROC)(HANDLE hRegion, int x, int y, int width, int height, int xSrc, int ySrc);
+#endif
+
+#ifndef WGL_ARB_multisample
+#define WGL_ARB_multisample 1
+#endif
+
+#ifndef WGL_ARB_extensions_string
+#define WGL_ARB_extensions_string 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern const char *WINAPI wglGetExtensionsStringARB(HDC);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef const char *(WINAPI *PFNWGLGETEXTENSIONSSTRINGARBPROC)(HDC hdc);
+#endif
+
+#ifndef WGL_ARB_pixel_format
+#define WGL_ARB_pixel_format 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglGetPixelFormatAttribivARB(HDC, int, int, UINT, const int *, int *);
+extern BOOL WINAPI wglGetPixelFormatAttribfvARB(HDC, int, int, UINT, const int *, FLOAT *);
+extern BOOL WINAPI wglChoosePixelFormatARB(HDC, const int *, const FLOAT *, UINT, int *, UINT *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLGETPIXELFORMATATTRIBIVARBPROC)(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, const int *piAttributes, int *piValues);
+typedef BOOL (WINAPI *PFNWGLGETPIXELFORMATATTRIBFVARBPROC)(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, const int *piAttributes, FLOAT *pfValues);
+typedef BOOL (WINAPI *PFNWGLCHOOSEPIXELFORMATARBPROC)(HDC hdc, const int *piAttribIList, const FLOAT *pfAttribFList, UINT nMaxFormats, int *piFormats, UINT *nNumFormats);
+#endif
+
+#ifndef WGL_ARB_make_current_read
+#define WGL_ARB_make_current_read 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglMakeContextCurrentARB(HDC, HDC, HGLRC);
+extern HDC WINAPI wglGetCurrentReadDCARB(void);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLMAKECONTEXTCURRENTARBPROC)(HDC hDrawDC, HDC hReadDC, HGLRC hglrc);
+typedef HDC(WINAPI *PFNWGLGETCURRENTREADDCARBPROC)(void);
+#endif
+
+#ifndef WGL_ARB_pbuffer
+#define WGL_ARB_pbuffer 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern HPBUFFERARB WINAPI wglCreatePbufferARB(HDC, int, int, int, const int *);
+extern HDC WINAPI wglGetPbufferDCARB(HPBUFFERARB);
+extern int WINAPI wglReleasePbufferDCARB(HPBUFFERARB, HDC);
+extern BOOL WINAPI wglDestroyPbufferARB(HPBUFFERARB);
+extern BOOL WINAPI wglQueryPbufferARB(HPBUFFERARB, int, int *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef HPBUFFERARB(WINAPI *PFNWGLCREATEPBUFFERARBPROC)(HDC hDC, int iPixelFormat, int iWidth, int iHeight, const int *piAttribList);
+typedef HDC(WINAPI *PFNWGLGETPBUFFERDCARBPROC)(HPBUFFERARB hPbuffer);
+typedef int (WINAPI *PFNWGLRELEASEPBUFFERDCARBPROC)(HPBUFFERARB hPbuffer, HDC hDC);
+typedef BOOL (WINAPI *PFNWGLDESTROYPBUFFERARBPROC)(HPBUFFERARB hPbuffer);
+typedef BOOL (WINAPI *PFNWGLQUERYPBUFFERARBPROC)(HPBUFFERARB hPbuffer, int iAttribute, int *piValue);
+#endif
+
+#ifndef WGL_ARB_render_texture
+#define WGL_ARB_render_texture 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglBindTexImageARB(HPBUFFERARB, int);
+extern BOOL WINAPI wglReleaseTexImageARB(HPBUFFERARB, int);
+extern BOOL WINAPI wglSetPbufferAttribARB(HPBUFFERARB, const int *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLBINDTEXIMAGEARBPROC)(HPBUFFERARB hPbuffer, int iBuffer);
+typedef BOOL (WINAPI *PFNWGLRELEASETEXIMAGEARBPROC)(HPBUFFERARB hPbuffer, int iBuffer);
+typedef BOOL (WINAPI *PFNWGLSETPBUFFERATTRIBARBPROC)(HPBUFFERARB hPbuffer, const int *piAttribList);
+#endif
+
+#ifndef WGL_ARB_pixel_format_float
+#define WGL_ARB_pixel_format_float 1
+#endif
+
+#ifndef WGL_EXT_display_color_table
+#define WGL_EXT_display_color_table 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern GLboolean WINAPI wglCreateDisplayColorTableEXT(GLushort);
+extern GLboolean WINAPI wglLoadDisplayColorTableEXT(const GLushort *, GLuint);
+extern GLboolean WINAPI wglBindDisplayColorTableEXT(GLushort);
+extern VOID WINAPI wglDestroyDisplayColorTableEXT(GLushort);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef GLboolean(WINAPI *PFNWGLCREATEDISPLAYCOLORTABLEEXTPROC)(GLushort id);
+typedef GLboolean(WINAPI *PFNWGLLOADDISPLAYCOLORTABLEEXTPROC)(const GLushort *table, GLuint length);
+typedef GLboolean(WINAPI *PFNWGLBINDDISPLAYCOLORTABLEEXTPROC)(GLushort id);
+typedef VOID (WINAPI *PFNWGLDESTROYDISPLAYCOLORTABLEEXTPROC)(GLushort id);
+#endif
+
+#ifndef WGL_EXT_extensions_string
+#define WGL_EXT_extensions_string 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern const char *WINAPI wglGetExtensionsStringEXT(void);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef const char *(WINAPI *PFNWGLGETEXTENSIONSSTRINGEXTPROC)(void);
+#endif
+
+#ifndef WGL_EXT_make_current_read
+#define WGL_EXT_make_current_read 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglMakeContextCurrentEXT(HDC, HDC, HGLRC);
+extern HDC WINAPI wglGetCurrentReadDCEXT(void);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLMAKECONTEXTCURRENTEXTPROC)(HDC hDrawDC, HDC hReadDC, HGLRC hglrc);
+typedef HDC(WINAPI *PFNWGLGETCURRENTREADDCEXTPROC)(void);
+#endif
+
+#ifndef WGL_EXT_pbuffer
+#define WGL_EXT_pbuffer 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern HPBUFFEREXT WINAPI wglCreatePbufferEXT(HDC, int, int, int, const int *);
+extern HDC WINAPI wglGetPbufferDCEXT(HPBUFFEREXT);
+extern int WINAPI wglReleasePbufferDCEXT(HPBUFFEREXT, HDC);
+extern BOOL WINAPI wglDestroyPbufferEXT(HPBUFFEREXT);
+extern BOOL WINAPI wglQueryPbufferEXT(HPBUFFEREXT, int, int *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef HPBUFFEREXT(WINAPI *PFNWGLCREATEPBUFFEREXTPROC)(HDC hDC, int iPixelFormat, int iWidth, int iHeight, const int *piAttribList);
+typedef HDC(WINAPI *PFNWGLGETPBUFFERDCEXTPROC)(HPBUFFEREXT hPbuffer);
+typedef int (WINAPI *PFNWGLRELEASEPBUFFERDCEXTPROC)(HPBUFFEREXT hPbuffer, HDC hDC);
+typedef BOOL (WINAPI *PFNWGLDESTROYPBUFFEREXTPROC)(HPBUFFEREXT hPbuffer);
+typedef BOOL (WINAPI *PFNWGLQUERYPBUFFEREXTPROC)(HPBUFFEREXT hPbuffer, int iAttribute, int *piValue);
+#endif
+
+#ifndef WGL_EXT_pixel_format
+#define WGL_EXT_pixel_format 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglGetPixelFormatAttribivEXT(HDC, int, int, UINT, int *, int *);
+extern BOOL WINAPI wglGetPixelFormatAttribfvEXT(HDC, int, int, UINT, int *, FLOAT *);
+extern BOOL WINAPI wglChoosePixelFormatEXT(HDC, const int *, const FLOAT *, UINT, int *, UINT *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLGETPIXELFORMATATTRIBIVEXTPROC)(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, int *piAttributes, int *piValues);
+typedef BOOL (WINAPI *PFNWGLGETPIXELFORMATATTRIBFVEXTPROC)(HDC hdc, int iPixelFormat, int iLayerPlane, UINT nAttributes, int *piAttributes, FLOAT *pfValues);
+typedef BOOL (WINAPI *PFNWGLCHOOSEPIXELFORMATEXTPROC)(HDC hdc, const int *piAttribIList, const FLOAT *pfAttribFList, UINT nMaxFormats, int *piFormats, UINT *nNumFormats);
+#endif
+
+#ifndef WGL_EXT_swap_control
+#define WGL_EXT_swap_control 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglSwapIntervalEXT(int);
+extern int WINAPI wglGetSwapIntervalEXT(void);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLSWAPINTERVALEXTPROC)(int interval);
+typedef int (WINAPI *PFNWGLGETSWAPINTERVALEXTPROC)(void);
+#endif
+
+#ifndef WGL_EXT_depth_float
+#define WGL_EXT_depth_float 1
+#endif
+
+#ifndef WGL_NV_vertex_array_range
+#define WGL_NV_vertex_array_range 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern void *WINAPI wglAllocateMemoryNV(GLsizei, GLfloat, GLfloat, GLfloat);
+extern void WINAPI wglFreeMemoryNV(void *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef void *(WINAPI *PFNWGLALLOCATEMEMORYNVPROC)(GLsizei size, GLfloat readfreq, GLfloat writefreq, GLfloat priority);
+typedef void (WINAPI *PFNWGLFREEMEMORYNVPROC)(void *pointer);
+#endif
+
+#ifndef WGL_3DFX_multisample
+#define WGL_3DFX_multisample 1
+#endif
+
+#ifndef WGL_EXT_multisample
+#define WGL_EXT_multisample 1
+#endif
+
+#ifndef WGL_OML_sync_control
+#define WGL_OML_sync_control 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglGetSyncValuesOML(HDC, INT64 *, INT64 *, INT64 *);
+extern BOOL WINAPI wglGetMscRateOML(HDC, INT32 *, INT32 *);
+extern INT64 WINAPI wglSwapBuffersMscOML(HDC, INT64, INT64, INT64);
+extern INT64 WINAPI wglSwapLayerBuffersMscOML(HDC, int, INT64, INT64, INT64);
+extern BOOL WINAPI wglWaitForMscOML(HDC, INT64, INT64, INT64, INT64 *, INT64 *, INT64 *);
+extern BOOL WINAPI wglWaitForSbcOML(HDC, INT64, INT64 *, INT64 *, INT64 *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLGETSYNCVALUESOMLPROC)(HDC hdc, INT64 *ust, INT64 *msc, INT64 *sbc);
+typedef BOOL (WINAPI *PFNWGLGETMSCRATEOMLPROC)(HDC hdc, INT32 *numerator, INT32 *denominator);
+typedef INT64(WINAPI *PFNWGLSWAPBUFFERSMSCOMLPROC)(HDC hdc, INT64 target_msc, INT64 divisor, INT64 remainder);
+typedef INT64(WINAPI *PFNWGLSWAPLAYERBUFFERSMSCOMLPROC)(HDC hdc, int fuPlanes, INT64 target_msc, INT64 divisor, INT64 remainder);
+typedef BOOL (WINAPI *PFNWGLWAITFORMSCOMLPROC)(HDC hdc, INT64 target_msc, INT64 divisor, INT64 remainder, INT64 *ust, INT64 *msc, INT64 *sbc);
+typedef BOOL (WINAPI *PFNWGLWAITFORSBCOMLPROC)(HDC hdc, INT64 target_sbc, INT64 *ust, INT64 *msc, INT64 *sbc);
+#endif
+
+#ifndef WGL_I3D_digital_video_control
+#define WGL_I3D_digital_video_control 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglGetDigitalVideoParametersI3D(HDC, int, int *);
+extern BOOL WINAPI wglSetDigitalVideoParametersI3D(HDC, int, const int *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLGETDIGITALVIDEOPARAMETERSI3DPROC)(HDC hDC, int iAttribute, int *piValue);
+typedef BOOL (WINAPI *PFNWGLSETDIGITALVIDEOPARAMETERSI3DPROC)(HDC hDC, int iAttribute, const int *piValue);
+#endif
+
+#ifndef WGL_I3D_gamma
+#define WGL_I3D_gamma 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglGetGammaTableParametersI3D(HDC, int, int *);
+extern BOOL WINAPI wglSetGammaTableParametersI3D(HDC, int, const int *);
+extern BOOL WINAPI wglGetGammaTableI3D(HDC, int, USHORT *, USHORT *, USHORT *);
+extern BOOL WINAPI wglSetGammaTableI3D(HDC, int, const USHORT *, const USHORT *, const USHORT *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLGETGAMMATABLEPARAMETERSI3DPROC)(HDC hDC, int iAttribute, int *piValue);
+typedef BOOL (WINAPI *PFNWGLSETGAMMATABLEPARAMETERSI3DPROC)(HDC hDC, int iAttribute, const int *piValue);
+typedef BOOL (WINAPI *PFNWGLGETGAMMATABLEI3DPROC)(HDC hDC, int iEntries, USHORT *puRed, USHORT *puGreen, USHORT *puBlue);
+typedef BOOL (WINAPI *PFNWGLSETGAMMATABLEI3DPROC)(HDC hDC, int iEntries, const USHORT *puRed, const USHORT *puGreen, const USHORT *puBlue);
+#endif
+
+#ifndef WGL_I3D_genlock
+#define WGL_I3D_genlock 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglEnableGenlockI3D(HDC);
+extern BOOL WINAPI wglDisableGenlockI3D(HDC);
+extern BOOL WINAPI wglIsEnabledGenlockI3D(HDC, BOOL *);
+extern BOOL WINAPI wglGenlockSourceI3D(HDC, UINT);
+extern BOOL WINAPI wglGetGenlockSourceI3D(HDC, UINT *);
+extern BOOL WINAPI wglGenlockSourceEdgeI3D(HDC, UINT);
+extern BOOL WINAPI wglGetGenlockSourceEdgeI3D(HDC, UINT *);
+extern BOOL WINAPI wglGenlockSampleRateI3D(HDC, UINT);
+extern BOOL WINAPI wglGetGenlockSampleRateI3D(HDC, UINT *);
+extern BOOL WINAPI wglGenlockSourceDelayI3D(HDC, UINT);
+extern BOOL WINAPI wglGetGenlockSourceDelayI3D(HDC, UINT *);
+extern BOOL WINAPI wglQueryGenlockMaxSourceDelayI3D(HDC, UINT *, UINT *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLENABLEGENLOCKI3DPROC)(HDC hDC);
+typedef BOOL (WINAPI *PFNWGLDISABLEGENLOCKI3DPROC)(HDC hDC);
+typedef BOOL (WINAPI *PFNWGLISENABLEDGENLOCKI3DPROC)(HDC hDC, BOOL *pFlag);
+typedef BOOL (WINAPI *PFNWGLGENLOCKSOURCEI3DPROC)(HDC hDC, UINT uSource);
+typedef BOOL (WINAPI *PFNWGLGETGENLOCKSOURCEI3DPROC)(HDC hDC, UINT *uSource);
+typedef BOOL (WINAPI *PFNWGLGENLOCKSOURCEEDGEI3DPROC)(HDC hDC, UINT uEdge);
+typedef BOOL (WINAPI *PFNWGLGETGENLOCKSOURCEEDGEI3DPROC)(HDC hDC, UINT *uEdge);
+typedef BOOL (WINAPI *PFNWGLGENLOCKSAMPLERATEI3DPROC)(HDC hDC, UINT uRate);
+typedef BOOL (WINAPI *PFNWGLGETGENLOCKSAMPLERATEI3DPROC)(HDC hDC, UINT *uRate);
+typedef BOOL (WINAPI *PFNWGLGENLOCKSOURCEDELAYI3DPROC)(HDC hDC, UINT uDelay);
+typedef BOOL (WINAPI *PFNWGLGETGENLOCKSOURCEDELAYI3DPROC)(HDC hDC, UINT *uDelay);
+typedef BOOL (WINAPI *PFNWGLQUERYGENLOCKMAXSOURCEDELAYI3DPROC)(HDC hDC, UINT *uMaxLineDelay, UINT *uMaxPixelDelay);
+#endif
+
+#ifndef WGL_I3D_image_buffer
+#define WGL_I3D_image_buffer 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern LPVOID WINAPI wglCreateImageBufferI3D(HDC, DWORD, UINT);
+extern BOOL WINAPI wglDestroyImageBufferI3D(HDC, LPVOID);
+extern BOOL WINAPI wglAssociateImageBufferEventsI3D(HDC, const HANDLE *, const LPVOID *, const DWORD *, UINT);
+extern BOOL WINAPI wglReleaseImageBufferEventsI3D(HDC, const LPVOID *, UINT);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef LPVOID (WINAPI *PFNWGLCREATEIMAGEBUFFERI3DPROC)(HDC hDC, DWORD dwSize, UINT uFlags);
+typedef BOOL (WINAPI *PFNWGLDESTROYIMAGEBUFFERI3DPROC)(HDC hDC, LPVOID pAddress);
+typedef BOOL (WINAPI *PFNWGLASSOCIATEIMAGEBUFFEREVENTSI3DPROC)(HDC hDC, const HANDLE *pEvent, const LPVOID *pAddress, const DWORD *pSize, UINT count);
+typedef BOOL (WINAPI *PFNWGLRELEASEIMAGEBUFFEREVENTSI3DPROC)(HDC hDC, const LPVOID *pAddress, UINT count);
+#endif
+
+#ifndef WGL_I3D_swap_frame_lock
+#define WGL_I3D_swap_frame_lock 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglEnableFrameLockI3D(void);
+extern BOOL WINAPI wglDisableFrameLockI3D(void);
+extern BOOL WINAPI wglIsEnabledFrameLockI3D(BOOL *);
+extern BOOL WINAPI wglQueryFrameLockMasterI3D(BOOL *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLENABLEFRAMELOCKI3DPROC)(void);
+typedef BOOL (WINAPI *PFNWGLDISABLEFRAMELOCKI3DPROC)(void);
+typedef BOOL (WINAPI *PFNWGLISENABLEDFRAMELOCKI3DPROC)(BOOL *pFlag);
+typedef BOOL (WINAPI *PFNWGLQUERYFRAMELOCKMASTERI3DPROC)(BOOL *pFlag);
+#endif
+
+#ifndef WGL_I3D_swap_frame_usage
+#define WGL_I3D_swap_frame_usage 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglGetFrameUsageI3D(float *);
+extern BOOL WINAPI wglBeginFrameTrackingI3D(void);
+extern BOOL WINAPI wglEndFrameTrackingI3D(void);
+extern BOOL WINAPI wglQueryFrameTrackingI3D(DWORD *, DWORD *, float *);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLGETFRAMEUSAGEI3DPROC)(float *pUsage);
+typedef BOOL (WINAPI *PFNWGLBEGINFRAMETRACKINGI3DPROC)(void);
+typedef BOOL (WINAPI *PFNWGLENDFRAMETRACKINGI3DPROC)(void);
+typedef BOOL (WINAPI *PFNWGLQUERYFRAMETRACKINGI3DPROC)(DWORD *pFrameCount, DWORD *pMissedFrames, float *pLastMissedUsage);
+#endif
+
+#ifndef WGL_ATI_pixel_format_float
+#define WGL_ATI_pixel_format_float 1
+#endif
+
+#ifndef WGL_NV_render_depth_texture
+#define WGL_NV_render_depth_texture 1
+#endif
+
+#ifndef WGL_NV_render_texture_rectangle
+#define WGL_NV_render_texture_rectangle 1
+#endif
+
+#ifndef WGL_NV_float_buffer
+#define WGL_NV_float_buffer 1
+#endif
+
+#ifndef WGL_NV_swap_group
+#define WGL_NV_swap_group 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglJoinSwapGroupNV(HDC hDC, GLuint group);
+extern BOOL WINAPI wglBindSwapBarrierNV(GLuint group, GLuint barrier);
+extern BOOL WINAPI wglQuerySwapGroupNV(HDC hDC, GLuint *group, GLuint *barrier);
+extern BOOL WINAPI wglQueryMaxSwapGroupsNV(HDC hDC, GLuint *maxGroups, GLuint *maxBarriers);
+extern BOOL WINAPI wglQueryFrameCountNV(HDC hDC, GLuint *count);
+extern BOOL WINAPI wglResetFrameCountNV(HDC hDC);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLJOINSWAPGROUPNVPROC)(HDC hDC, GLuint group);
+typedef BOOL (WINAPI *PFNWGLBINDSWAPBARRIERNVPROC)(GLuint group, GLuint barrier);
+typedef BOOL (WINAPI *PFNWGLQUERYSWAPGROUPNVPROC)(HDC hDC, GLuint *group, GLuint *barrier);
+typedef BOOL (WINAPI *PFNWGLQUERYMAXSWAPGROUPSNVPROC)(HDC hDC, GLuint *maxGroups, GLuint *maxBarriers);
+typedef BOOL (WINAPI *PFNWGLQUERYFRAMECOUNTNVPROC)(HDC hDC, GLuint *count);
+typedef BOOL (WINAPI *PFNWGLRESETFRAMECOUNTNVPROC)(HDC hDC);
+#endif
+
+#ifndef WGL_NV_gpu_affinity
+#define WGL_NV_gpu_affinity 1
+#ifdef WGL_WGLEXT_PROTOTYPES
+extern BOOL WINAPI wglEnumGpusNV(UINT iIndex, HGPUNV *hGpu);
+extern BOOL WINAPI wglEnumGpuDevicesNV(HGPUNV hGpu, UINT iIndex, PGPU_DEVICE pGpuDevice);
+extern HDC WINAPI wglCreateAffinityDCNV(const HGPUNV *pGpuList);
+extern BOOL WINAPI wglEnumGpusFromAffinityDCNV(HDC hAffinityDC, UINT iIndex, HGPUNV *hGpu);
+extern BOOL WINAPI wglDeleteDCNV(HDC hAffinityDC);
+#endif /* WGL_WGLEXT_PROTOTYPES */
+typedef BOOL (WINAPI *PFNWGLENUMGPUSNVPROC)(UINT iIndex, HGPUNV *hGpu);
+typedef BOOL (WINAPI *PFNWGLENUMGPUDEVICESNVPROC)(HGPUNV hGpu, UINT iIndex, PGPU_DEVICE pGpuDevice);
+typedef HDC(WINAPI *PFNWGLCREATEAFFINITYDCNVPROC)(const HGPUNV *pGpuList);
+typedef BOOL (WINAPI *PFNWGLENUMGPUSFROMAFFINITYDCNVPROC)(HDC hAffinityDC, UINT iIndex, HGPUNV *hGpu);
+typedef BOOL (WINAPI *PFNWGLDELETEDCNVPROC)(HDC hAffinityDC);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
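
The `PFNWGL...PROC` typedefs above are meant to be filled in at run time with `wglGetProcAddress`, since these entry points are exported by the driver rather than by `opengl32.dll`. Below is a minimal sketch for the `WGL_NV_gpu_affinity` entry points, the extension most relevant to a multi-GPU lab. It assumes Windows, linking against `opengl32.lib`, and an already-current OpenGL context; `listAffinityGpus` is an illustrative name, not part of the header.

```cpp
// Minimal sketch: enumerating GPUs via WGL_NV_gpu_affinity.
// wglGetProcAddress returns NULL when the current context's driver
// does not export an entry point, so the result must be checked.
#include <windows.h>
#include <cstdio>

DECLARE_HANDLE(HGPUNV);  // as declared in the header above
typedef BOOL (WINAPI *PFNWGLENUMGPUSNVPROC)(UINT iIndex, HGPUNV *hGpu);

void listAffinityGpus()
{
    PFNWGLENUMGPUSNVPROC wglEnumGpusNV =
        (PFNWGLENUMGPUSNVPROC)wglGetProcAddress("wglEnumGpusNV");

    if (!wglEnumGpusNV)
    {
        std::printf("WGL_NV_gpu_affinity is not supported by this context\n");
        return;
    }

    HGPUNV hGpu;
    for (UINT iIndex = 0; wglEnumGpusNV(iIndex, &hGpu); ++iIndex)
    {
        std::printf("Affinity GPU %u found\n", iIndex);
    }
}
```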

+ 197 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Exceptions.h

@@ -0,0 +1,197 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NV_UTIL_NPP_EXCEPTIONS_H
+#define NV_UTIL_NPP_EXCEPTIONS_H
+
+
+#include <string>
+#include <sstream>
+#include <iostream>
+
+/// All npp related C++ classes are put into the npp namespace.
+namespace npp
+{
+
+    /// Exception base class.
+    ///     This exception base class is used for all C++ exceptions throughout
+    /// the NPP project.
+    ///     The exception contains a string message, as well as data fields for a string
+    /// containing the name of the file as well as the line number where the exception was thrown.
+    ///     The easiest way of throwing exceptions and providing filename and line number is
+    /// to use one of the ASSERT macros defined for that purpose.
+    class Exception
+    {
+        public:
+            /// Constructor.
+            /// \param rMessage A message with information as to why the exception was thrown.
+            /// \param rFileName The name of the file where the exception was thrown.
+            /// \param nLineNumber Line number in the file where the exception was thrown.
+            explicit
+            Exception(const std::string &rMessage = "", const std::string &rFileName = "", unsigned int nLineNumber = 0)
+                : sMessage_(rMessage), sFileName_(rFileName), nLineNumber_(nLineNumber)
+            { };
+
+            Exception(const Exception &rException)
+                : sMessage_(rException.sMessage_), sFileName_(rException.sFileName_), nLineNumber_(rException.nLineNumber_)
+            { };
+
+            virtual
+            ~Exception()
+            { };
+
+            /// Get the exception's message.
+            const
+            std::string &
+            message()
+            const
+            {
+                return sMessage_;
+            }
+
+            /// Get the exception's file info.
+            const
+            std::string &
+            fileName()
+            const
+            {
+                return sFileName_;
+            }
+
+            /// Get the exception's line info.
+            unsigned int
+            lineNumber()
+            const
+            {
+                return nLineNumber_;
+            }
+
+
+            /// Create a clone of this exception.
+            ///      This creates a new Exception object on the heap. It is
+            /// the responsibility of the user of this function to free this memory
+            /// (delete x).
+            virtual
+            Exception *
+            clone()
+            const
+            {
+                return new Exception(*this);
+            }
+
+            /// Create a single string with all the exception's information.
+            ///     The virtual toString() method is used by operator<<()
+            /// so that all exceptions derived from this base class print
+            /// their full information correctly even when their exact type
+            /// is not known at the time of printing (i.e. when only the
+            /// basic operator<<() is used).
+            virtual
+            std::string
+            toString()
+            const
+            {
+                std::ostringstream oOutputString;
+                oOutputString << fileName() << ":" << lineNumber() << ": " << message();
+                return oOutputString.str();
+            }
+
+        private:
+            std::string sMessage_;      ///< Message regarding the cause of the exception.
+            std::string sFileName_;     ///< Name of the file where the exception was thrown.
+            unsigned int nLineNumber_;  ///< Line number in the file where the exception was thrown
+    };
+
+    /// Output stream inserter for Exception.
+    /// \param rOutputStream The stream the exception information is written to.
+    /// \param rException The exception that's being written.
+    /// \return Reference to the output stream being used.
+    inline std::ostream &
+    operator << (std::ostream &rOutputStream, const Exception &rException)
+    {
+        rOutputStream << rException.toString();
+        return rOutputStream;
+    }
+
+    /// Basic assert macro.
+    ///     This macro should be used to enforce any kind of pre or post conditions.
+    /// Unlike the C-runtime assert macro, this macro does not abort execution, but throws
+    /// a C++ exception. The exception is automatically filled with information about the failing
+    /// condition, the filename and line number where the exception was thrown.
+    /// \note The macro is written in such a way that omitting a semicolon after its usage
+    ///     causes a compiler error. The correct way to invoke this macro is:
+    /// NPP_ASSERT(n < MAX);
+#define NPP_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " assertion failed!", __FILE__, __LINE__);} while(false)
+
+    // ASSERT macro.
+    //  Same functionality as the basic assert macro with the added ability to pass
+    //  a message M. M should be a string literal.
+    //  Note: Never use code with side effects inside ASSERT(), since ASSERT macros may get compiled
+    //      out in release mode.
+#define NPP_ASSERT_MSG(C, M) do {if (!(C)) throw npp::Exception(#C " assertion failed! Message: " M, __FILE__, __LINE__);} while(false)
+
+#ifdef _DEBUG
+    /// Basic debug assert macro.
+    ///     This macro is identical in every respect to NPP_ASSERT(C) but it does get compiled to a
+    /// no-op in release builds. It is therefore of utmost importance not to put statements into
+    /// this macro that cause side effects required for correct program execution.
+#define NPP_DEBUG_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " debug assertion failed!", __FILE__, __LINE__);} while(false)
+#else
+#define NPP_DEBUG_ASSERT(C)
+#endif
+
+    /// ASSERT for null-pointer test.
+    /// It is safe to put code with side effects into this macro. This macro never
+    /// gets compiled to a no-op, because resource allocation may fail for external reasons outside the
+    /// control of a software developer.
+#define NPP_ASSERT_NOT_NULL(P) do {if ((P) == 0) throw npp::Exception(#P " not-null assertion failed!", __FILE__, __LINE__);} while(false)
+
+    /// Macro for flagging methods as not implemented.
+    /// The macro throws an exception with a message that an implementation is missing.
+#define NPP_NOT_IMPLEMENTED() do {throw npp::Exception("Implementation missing!", __FILE__, __LINE__);} while(false)
+
+    /// Macro for checking error return code of CUDA (runtime) calls.
+    /// This macro never gets disabled.
+#define NPP_CHECK_CUDA(S) do {cudaError_t eCUDAResult; \
+        eCUDAResult = S; \
+        if (eCUDAResult != cudaSuccess) std::cout << "NPP_CHECK_CUDA - eCUDAResult = " << eCUDAResult << std::endl; \
+        NPP_ASSERT(eCUDAResult == cudaSuccess);} while (false)
+
+    /// Macro for checking error return code for NPP calls.
+#define NPP_CHECK_NPP(S) do {NppStatus eStatusNPP; \
+        eStatusNPP = S; \
+        if (eStatusNPP != NPP_SUCCESS) std::cout << "NPP_CHECK_NPP - eStatusNPP = " << _cudaGetErrorEnum(eStatusNPP) << "("<< eStatusNPP << ")" << std::endl; \
+        NPP_ASSERT(eStatusNPP == NPP_SUCCESS);} while (false)
+
+    /// Macro for checking error return codes from cuFFT calls.
+#define NPP_CHECK_CUFFT(S) do {cufftResult eCUFFTResult; \
+        eCUFFTResult = S; \
+        if (eCUFFTResult != CUFFT_SUCCESS) std::cout << "NPP_CHECK_CUFFT - eCUFFTResult = " << eCUFFTResult << std::endl; \
+        NPP_ASSERT(eCUFFTResult == CUFFT_SUCCESS);} while (false)
+
+} // npp namespace
+
+#endif // NV_UTIL_NPP_EXCEPTIONS_H
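
A short usage sketch for the macros above. The CUDA calls assume `<cuda_runtime.h>` is available; `allocateDevice` is an illustrative helper, not part of the header.

```cpp
// Minimal sketch: NPP_ASSERT/NPP_CHECK_CUDA turn failures into npp::Exception,
// which operator<<() prints as "file:line: message".
#include "Exceptions.h"
#include <cuda_runtime.h>

void allocateDevice(float **ppData, size_t nBytes)
{
    NPP_ASSERT_NOT_NULL(ppData);                          // throws on a null pointer
    NPP_CHECK_CUDA(cudaMalloc((void **)ppData, nBytes));  // throws on any CUDA error
}

int main()
{
    float *pData = 0;

    try
    {
        allocateDevice(&pData, 1 << 20);
        NPP_CHECK_CUDA(cudaFree(pData));
    }
    catch (const npp::Exception &rException)
    {
        std::cerr << rException << std::endl;  // <iostream> comes in via Exceptions.h
        return 1;
    }

    return 0;
}
```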

+ 155 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Image.h

@@ -0,0 +1,155 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NV_UTIL_NPP_IMAGE_H
+#define NV_UTIL_NPP_IMAGE_H
+
+#include <cstddef>
+
+namespace npp
+{
+
+    class Image
+    {
+        public:
+            struct Size
+            {
+                unsigned int nWidth;
+                unsigned int nHeight;
+
+                Size() : nWidth(0), nHeight(0)
+                { };
+
+                Size(unsigned int nWidthNew, unsigned nHeightNew) : nWidth(nWidthNew), nHeight(nHeightNew)
+                { };
+
+                Size(const Size &rSize) : nWidth(rSize.nWidth), nHeight(rSize.nHeight)
+                { };
+
+                Size &
+                operator= (const Size &rSize)
+                {
+                    if (&rSize == this)
+                    {
+                        return *this;
+                    }
+
+                    nWidth = rSize.nWidth;
+                    nHeight = rSize.nHeight;
+
+                    return *this;
+                }
+
+                void
+                swap(Size &rSize)
+                {
+                    unsigned int nTemp;
+                    nTemp = nWidth;
+                    nWidth = rSize.nWidth;
+                    rSize.nWidth = nTemp;
+
+                    nTemp = nHeight;
+                    nHeight = rSize.nHeight;
+                    rSize.nHeight = nTemp;
+                }
+            };
+
+            Image()
+            { };
+
+            Image(unsigned int nWidth, unsigned int nHeight) : oSize_(nWidth, nHeight)
+            { };
+
+            Image(const Image::Size &rSize) : oSize_(rSize)
+            { };
+
+            Image(const Image &rImage) : oSize_(rImage.oSize_)
+            { };
+
+            virtual
+            ~Image()
+            { };
+
+            Image &
+            operator= (const Image &rImage)
+            {
+                if (&rImage == this)
+                {
+                    return *this;
+                }
+
+                oSize_  = rImage.oSize_;
+                return *this;
+            };
+
+            unsigned int
+            width()
+            const
+            {
+                return oSize_.nWidth;
+            }
+
+            unsigned int
+            height()
+            const
+            {
+                return oSize_.nHeight;
+            }
+
+            Size
+            size()
+            const
+            {
+                return oSize_;
+            }
+
+            void
+            swap(Image &rImage)
+            {
+                oSize_.swap(rImage.oSize_);
+            }
+
+        private:
+            Size oSize_;
+    };
+
+    inline bool
+    operator== (const Image::Size &rFirst, const Image::Size &rSecond)
+    {
+        return rFirst.nWidth == rSecond.nWidth && rFirst.nHeight == rSecond.nHeight;
+    }
+
+    inline bool
+    operator!= (const Image::Size &rFirst, const Image::Size &rSecond)
+    {
+        return rFirst.nWidth != rSecond.nWidth || rFirst.nHeight != rSecond.nHeight;
+    }
+
+} // npp namespace
+
+
+#endif // NV_UTIL_NPP_IMAGE_H
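
A quick sketch of the value semantics this class provides: sizes compare with the operators defined above, and `swap()` exchanges only the dimensions. The concrete numbers are arbitrary.

```cpp
// Minimal sketch: npp::Image is a size-only value type.
#include "Image.h"
#include <cassert>

int main()
{
    npp::Image::Size oSize(640, 480);
    npp::Image oFirst(oSize);
    npp::Image oSecond(1024, 768);

    assert(oFirst.size() == oSize);   // operator==() compares nWidth and nHeight
    oFirst.swap(oSecond);             // exchanges the two images' dimensions
    assert(oFirst.width() == 1024 && oSecond.height() == 480);

    return 0;
}
```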

+ 80 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageAllocatorsCPU.h

@@ -0,0 +1,80 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H
+#define NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H
+
+#include "Exceptions.h"
+
+namespace npp
+{
+
+    template <typename D, size_t N>
+    class ImageAllocatorCPU
+    {
+        public:
+            static
+            D *
+            Malloc2D(unsigned int nWidth, unsigned int nHeight, unsigned int *pPitch)
+            {
+                NPP_ASSERT(nWidth * nHeight > 0);
+
+                D *pResult = new D[nWidth * N * nHeight];
+                *pPitch = nWidth * sizeof(D) * N;
+
+                return pResult;
+            };
+
+            static
+            void
+            Free2D(D *pPixels)
+            {
+                delete[] pPixels;
+            };
+
+            static
+            void
+            Copy2D(D *pDst, size_t nDstPitch, const D *pSrc, size_t nSrcPitch, size_t nWidth, size_t nHeight)
+            {
+                const unsigned char *pSrcLine = reinterpret_cast<const unsigned char *>(pSrc);
+                unsigned char       *pDstLine = reinterpret_cast<unsigned char *>(pDst);
+
+                for (size_t iLine = 0; iLine < nHeight; ++iLine)
+                {
+                    // copy one line worth of data
+                    memcpy(pDstLine, pSrcLine, nWidth * N * sizeof(D));
+                    // advance both pointers by the pitch, which is given in bytes
+                    pDstLine += nDstPitch;
+                    pSrcLine += nSrcPitch;
+                }
+                }
+            };
+
+    };
+
+} // npp namespace
+
+#endif // NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H
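
A short sketch of the allocator contract: `Malloc2D` reports the row stride in bytes through `*pPitch`, and `Copy2D` walks both images one row at a time. The 640x480, 3-channel dimensions are arbitrary.

```cpp
// Minimal sketch: host-side pitched allocation and a row-by-row copy.
// The pitch reported by Malloc2D is a byte stride, not an element count.
#include "ImageAllocatorsCPU.h"
#include <cstring>

int main()
{
    typedef npp::ImageAllocatorCPU<unsigned char, 3> Allocator;  // 8-bit, 3 channels

    unsigned int nSrcPitch = 0;
    unsigned int nDstPitch = 0;
    unsigned char *pSrc = Allocator::Malloc2D(640, 480, &nSrcPitch);
    unsigned char *pDst = Allocator::Malloc2D(640, 480, &nDstPitch);

    std::memset(pSrc, 0x7F, (size_t)nSrcPitch * 480);  // fill the source image
    Allocator::Copy2D(pDst, nDstPitch, pSrc, nSrcPitch, 640, 480);

    Allocator::Free2D(pSrc);
    Allocator::Free2D(pDst);

    return 0;
}
```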

File diff suppressed because it is too large
+ 1139 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageAllocatorsNPP.h
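
The diff for ImageAllocatorsNPP.h is suppressed above, but its interface can be inferred from how ImagesNPP.h (below) calls it. A hedged sketch under that assumption; the exact signatures are taken from those call sites, not from the suppressed file itself.

```cpp
// Hedged sketch of the device-side allocator interface, inferred from its
// uses in ImagesNPP.h below (Malloc2D/Free2D mirror the CPU allocator but
// back the image with device memory and a driver-chosen pitch in bytes).
#include "ImageAllocatorsNPP.h"

int main()
{
    unsigned int nPitch = 0;

    Npp8u *pDevice = npp::ImageAllocator<Npp8u, 1>::Malloc2D(640, 480, &nPitch);
    npp::ImageAllocator<Npp8u, 1>::Free2D(pDevice);

    return 0;
}
```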


+ 149 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImageIO.h

@@ -0,0 +1,149 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NV_UTIL_NPP_IMAGE_IO_H
+#define NV_UTIL_NPP_IMAGE_IO_H
+
+#include "ImagesCPU.h"
+#include "ImagesNPP.h"
+
+#include "FreeImage.h"
+#include "Exceptions.h"
+
+#include <string>
+#include "string.h"
+
+
+// Error handler for FreeImage library.
+//  In case this handler is invoked, it throws an NPP exception.
+inline void
+FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, const char *zMessage)
+{
+    throw npp::Exception(zMessage);
+}
+
+namespace npp
+{
+    // Load a gray-scale image from disk.
+    inline void
+    loadImage(const std::string &rFileName, ImageCPU_8u_C1 &rImage)
+    {
+        // set your own FreeImage error handler
+        FreeImage_SetOutputMessage(FreeImageErrorHandler);
+
+        FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(rFileName.c_str());
+
+        // no signature? try to guess the file format from the file extension
+        if (eFormat == FIF_UNKNOWN)
+        {
+            eFormat = FreeImage_GetFIFFromFilename(rFileName.c_str());
+        }
+
+        NPP_ASSERT(eFormat != FIF_UNKNOWN);
+        // check that the plugin has reading capabilities ...
+    FIBITMAP *pBitmap = 0;
+
+        if (FreeImage_FIFSupportsReading(eFormat))
+        {
+            pBitmap = FreeImage_Load(eFormat, rFileName.c_str());
+        }
+
+        NPP_ASSERT(pBitmap != 0);
+        // make sure this is an 8-bit single channel image
+        NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK);
+        NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8);
+
+        // create an ImageCPU to receive the loaded image data
+        ImageCPU_8u_C1 oImage(FreeImage_GetWidth(pBitmap), FreeImage_GetHeight(pBitmap));
+
+        // Copy the FreeImage data into the new ImageCPU
+        unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap);
+        const Npp8u *pSrcLine = FreeImage_GetBits(pBitmap) + nSrcPitch * (FreeImage_GetHeight(pBitmap) -1);
+        Npp8u *pDstLine = oImage.data();
+        unsigned int nDstPitch = oImage.pitch();
+
+        for (size_t iLine = 0; iLine < oImage.height(); ++iLine)
+        {
+            memcpy(pDstLine, pSrcLine, oImage.width() * sizeof(Npp8u));
+            pSrcLine -= nSrcPitch;
+            pDstLine += nDstPitch;
+        }
+
+        // swap the user-given image with our result image, effectively
+        // moving our newly loaded image data into the user provided shell
+        oImage.swap(rImage);
+    }
+
+    // Save a gray-scale image to disk.
+    inline void
+    saveImage(const std::string &rFileName, const ImageCPU_8u_C1 &rImage)
+    {
+        // create the result image storage using FreeImage so we can easily
+        // save
+        FIBITMAP *pResultBitmap = FreeImage_Allocate(rImage.width(), rImage.height(), 8 /* bits per pixel */);
+        NPP_ASSERT_NOT_NULL(pResultBitmap);
+        unsigned int nDstPitch   = FreeImage_GetPitch(pResultBitmap);
+        Npp8u *pDstLine = FreeImage_GetBits(pResultBitmap) + nDstPitch * (rImage.height()-1);
+        const Npp8u *pSrcLine = rImage.data();
+        unsigned int nSrcPitch = rImage.pitch();
+
+        for (size_t iLine = 0; iLine < rImage.height(); ++iLine)
+        {
+            memcpy(pDstLine, pSrcLine, rImage.width() * sizeof(Npp8u));
+            pSrcLine += nSrcPitch;
+            pDstLine -= nDstPitch;
+        }
+
+        // now save the result image
+        bool bSuccess;
+        bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, rFileName.c_str(), 0) == TRUE;
+        NPP_ASSERT_MSG(bSuccess, "Failed to save result image.");
+    }
+
+    // Load a gray-scale image from disk.
+    inline void
+    loadImage(const std::string &rFileName, ImageNPP_8u_C1 &rImage)
+    {
+        ImageCPU_8u_C1 oImage;
+        loadImage(rFileName, oImage);
+        ImageNPP_8u_C1 oResult(oImage);
+        rImage.swap(oResult);
+    }
+
+    // Save a gray-scale image to disk.
+    inline void
+    saveImage(const std::string &rFileName, const ImageNPP_8u_C1 &rImage)
+    {
+        ImageCPU_8u_C1 oHostImage(rImage.size());
+        // copy the device result data
+        rImage.copyTo(oHostImage.data(), oHostImage.pitch());
+        saveImage(rFileName, oHostImage);
+    }
+}
+
+
+#endif // NV_UTIL_NPP_IMAGE_IO_H
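
A minimal round-trip sketch for the I/O helpers above. `"input.pgm"` and `"output.pgm"` are placeholder file names; `loadImage` asserts an 8-bit single-channel input, which PGM satisfies.

```cpp
// Minimal sketch: load a PGM into device memory, process it, save it back.
#include "ImageIO.h"

int main()
{
    npp::ImageNPP_8u_C1 oDeviceImage;
    npp::loadImage("input.pgm", oDeviceImage);    // decoded by FreeImage, then copied to the GPU

    // ... run NPP primitives on oDeviceImage.data() here ...

    npp::saveImage("output.pgm", oDeviceImage);   // copied back to the host and written as PGM
    return 0;
}
```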

+ 171 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagePacked.h

@@ -0,0 +1,171 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NV_UTIL_NPP_IMAGE_PACKED_H
+#define NV_UTIL_NPP_IMAGE_PACKED_H
+
+#include "Image.h"
+#include "Pixel.h"
+
+namespace npp
+{
+    template<typename D, size_t N, class A>
+    class ImagePacked: public npp::Image
+    {
+        public:
+            typedef npp::Pixel<D, N>    tPixel;
+            typedef D                   tData;
+            static const size_t         gnChannels = N;
+            typedef npp::Image::Size    tSize;
+
+            ImagePacked(): aPixels_(0)
+                , nPitch_(0)
+            {
+                ;
+            }
+
+            ImagePacked(unsigned int nWidth, unsigned int nHeight): Image(nWidth, nHeight)
+                , aPixels_(0)
+                , nPitch_(0)
+            {
+                aPixels_ = A::Malloc2D(width(), height(), &nPitch_);
+            }
+
+            ImagePacked(unsigned int nWidth, unsigned int nHeight, bool bTight): Image(nWidth, nHeight)
+                , aPixels_(0)
+                , nPitch_(0)
+            {
+                aPixels_ = A::Malloc2D(width(), height(), &nPitch_, bTight);
+            }
+
+            ImagePacked(const tSize &rSize): Image(rSize)
+                , aPixels_(0)
+                , nPitch_(0)
+            {
+                aPixels_ = A::Malloc2D(width(), height(), &nPitch_);
+            }
+
+            ImagePacked(const ImagePacked<D, N, A> &rImage): Image(rImage)
+                , aPixels_(0)
+                , nPitch_(rImage.pitch())
+            {
+                aPixels_ = A::Malloc2D(width(), height(), &nPitch_);
+                A::Copy2D(aPixels_, nPitch_, rImage.data(), rImage.pitch(), width(), height());
+            }
+
+            virtual
+            ~ImagePacked()
+            {
+                A::Free2D(aPixels_);
+            }
+
+            ImagePacked &
+            operator= (const ImagePacked<D, N, A> &rImage)
+            {
+                // in case of self-assignment
+                if (&rImage == this)
+                {
+                    return *this;
+                }
+
+                A::Free2D(aPixels_);
+                aPixels_ = 0;
+                nPitch_ = 0;
+
+                // assign parent class's data fields (width, height)
+                Image::operator =(rImage);
+
+                aPixels_ = A::Malloc2D(width(), height(), &nPitch_);
+                A::Copy2D(aPixels_, nPitch_, rImage.data(), rImage.pitch(), width(), height());
+
+                return *this;
+            }
+
+            unsigned int
+            pitch()
+            const
+            {
+                return nPitch_;
+            }
+
+            /// Get a pointer to the pixel array.
+            ///     The returned pointer can be offset to the pixel at position (nX, nY);
+            /// even negative offsets are allowed.
+            /// \param nX Horizontal pointer/array offset.
+            /// \param nY Vertical pointer/array offset.
+            /// \return Pointer to the pixel array (or to the first pixel at coordinates (nX, nY)).
+            tPixel *
+            pixels(int nX = 0, int nY = 0)
+            {
+                return reinterpret_cast<tPixel *>(reinterpret_cast<unsigned char *>(aPixels_) + nY * pitch() + nX * gnChannels * sizeof(D));
+            }
+
+            const
+            tPixel *
+            pixels(int nX = 0, int nY = 0)
+            const
+            {
+                return reinterpret_cast<const tPixel *>(reinterpret_cast<unsigned char *>(aPixels_) + nY * pitch() + nX * gnChannels * sizeof(D));
+            }
+
+            D *
+            data(int nX = 0, int nY = 0)
+            {
+                return reinterpret_cast<D *>(pixels(nX, nY));
+            }
+
+            const
+            D *
+            data(int nX = 0, int nY = 0)
+            const
+            {
+                return reinterpret_cast<const D *>(pixels(nX, nY));
+            }
+
+            void
+            swap(ImagePacked<D, N, A> &rImage)
+            {
+                Image::swap(rImage);
+
+                tData *aTemp   = aPixels_;
+                aPixels_        = rImage.aPixels_;
+                rImage.aPixels_ = aTemp;
+
+                unsigned int nTemp = nPitch_;
+                nPitch_            = rImage.nPitch_;
+                rImage.nPitch_     = nTemp;
+            }
+
+        private:
+            D *aPixels_;
+            unsigned int nPitch_;
+    };
+
+} // npp namespace
+
+
+#endif // NV_UTIL_NPP_IMAGE_PACKED_H
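
The pitched addressing above is the core of the class: `pixels(nX, nY)` offsets the base pointer by `nY` rows of `pitch()` bytes plus `nX` pixels of `N * sizeof(D)` bytes each. A small sketch using the CPU allocator (it assumes the accompanying Pixel.h header; the concrete typedefs follow in ImagesCPU.h):

```cpp
// Minimal sketch: pitched pixel addressing through data(nX, nY).
#include "ImagePacked.h"
#include "ImageAllocatorsCPU.h"

int main()
{
    typedef npp::ImagePacked<unsigned char, 3,
                             npp::ImageAllocatorCPU<unsigned char, 3> > Image_8u_C3;

    Image_8u_C3 oImage(640, 480);

    unsigned char *pChannels = oImage.data(10, 20);  // first channel of pixel (10, 20)
    pChannels[0] = 255;
    pChannels[1] = 128;
    pChannels[2] = 0;

    return 0;
}
```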

+ 121 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagesCPU.h

@@ -0,0 +1,121 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NV_UTIL_NPP_IMAGES_CPU_H
+#define NV_UTIL_NPP_IMAGES_CPU_H
+
+#include "ImagePacked.h"
+
+#include "ImageAllocatorsCPU.h"
+#include "Exceptions.h"
+
+#include <npp.h>
+
+
+namespace npp
+{
+
+    template<typename D, unsigned int N, class A>
+    class ImageCPU: public npp::ImagePacked<D, N, A>
+    {
+        public:
+
+            ImageCPU()
+            {
+                ;
+            }
+
+            ImageCPU(unsigned int nWidth, unsigned int nHeight): ImagePacked<D, N, A>(nWidth, nHeight)
+            {
+                ;
+            }
+
+            explicit
+            ImageCPU(const npp::Image::Size &rSize): ImagePacked<D, N, A>(rSize)
+            {
+                ;
+            }
+
+            ImageCPU(const ImageCPU<D, N, A> &rImage): ImagePacked<D, N, A>(rImage)
+            {
+                ;
+            }
+
+            virtual
+            ~ImageCPU()
+            {
+                ;
+            }
+
+            ImageCPU &
+            operator= (const ImageCPU<D, N, A> &rImage)
+            {
+                ImagePacked<D, N, A>::operator= (rImage);
+
+                return *this;
+            }
+
+            npp::Pixel<D, N> &
+            operator()(unsigned int iX, unsigned int iY)
+            {
+                return *ImagePacked<D, N, A>::pixels(iX, iY);
+            }
+
+            npp::Pixel<D, N>
+            operator()(unsigned int iX, unsigned int iY)
+            const
+            {
+                return *ImagePacked<D, N, A>::pixels(iX, iY);
+            }
+
+    };
+
+
+    typedef ImageCPU<Npp8u,  1, npp::ImageAllocatorCPU<Npp8u,      1>  >   ImageCPU_8u_C1;
+    typedef ImageCPU<Npp8u,  2, npp::ImageAllocatorCPU<Npp8u,      2>  >   ImageCPU_8u_C2;
+    typedef ImageCPU<Npp8u,  3, npp::ImageAllocatorCPU<Npp8u,      3>  >   ImageCPU_8u_C3;
+    typedef ImageCPU<Npp8u,  4, npp::ImageAllocatorCPU<Npp8u,      4>  >   ImageCPU_8u_C4;
+
+    typedef ImageCPU<Npp16u, 1, npp::ImageAllocatorCPU<Npp16u,     1>  >   ImageCPU_16u_C1;
+    typedef ImageCPU<Npp16u, 3, npp::ImageAllocatorCPU<Npp16u,     3>  >   ImageCPU_16u_C3;
+    typedef ImageCPU<Npp16u, 4, npp::ImageAllocatorCPU<Npp16u,     4>  >   ImageCPU_16u_C4;
+
+    typedef ImageCPU<Npp16s, 1, npp::ImageAllocatorCPU<Npp16s,     1>  >   ImageCPU_16s_C1;
+    typedef ImageCPU<Npp16s, 3, npp::ImageAllocatorCPU<Npp16s,     3>  >   ImageCPU_16s_C3;
+    typedef ImageCPU<Npp16s, 4, npp::ImageAllocatorCPU<Npp16s,     4>  >   ImageCPU_16s_C4;
+
+    typedef ImageCPU<Npp32s, 1, npp::ImageAllocatorCPU<Npp32s,     1>  >   ImageCPU_32s_C1;
+    typedef ImageCPU<Npp32s, 3, npp::ImageAllocatorCPU<Npp32s,     3>  >   ImageCPU_32s_C3;
+    typedef ImageCPU<Npp32s, 4, npp::ImageAllocatorCPU<Npp32s,     4>  >   ImageCPU_32s_C4;
+
+    typedef ImageCPU<Npp32f, 1, npp::ImageAllocatorCPU<Npp32f,     1>  >   ImageCPU_32f_C1;
+    typedef ImageCPU<Npp32f, 3, npp::ImageAllocatorCPU<Npp32f,     3>  >   ImageCPU_32f_C3;
+    typedef ImageCPU<Npp32f, 4, npp::ImageAllocatorCPU<Npp32f,     4>  >   ImageCPU_32f_C4;
+
+} // npp namespace
+
+#endif // NV_UTIL_NPP_IMAGES_CPU_H
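
A sketch of per-pixel access through `operator()`, which returns a reference to the `npp::Pixel` at the given coordinates. It assumes the single-channel `npp::Pixel` from the accompanying Pixel.h exposes its value as `.x`; that header is not shown in this commit view.

```cpp
// Minimal sketch: per-pixel access with operator() (horizontal gradient).
#include "ImagesCPU.h"

int main()
{
    npp::ImageCPU_8u_C1 oImage(256, 256);

    for (unsigned int iY = 0; iY < oImage.height(); ++iY)
        for (unsigned int iX = 0; iX < oImage.width(); ++iX)
            oImage(iX, iY).x = (Npp8u)iX;  // assumed Pixel<Npp8u, 1> member

    return 0;
}
```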

+ 149 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/ImagesNPP.h

@@ -0,0 +1,149 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef NV_UTIL_NPP_IMAGES_NPP_H
+#define NV_UTIL_NPP_IMAGES_NPP_H
+
+#include "Exceptions.h"
+#include "ImagePacked.h"
+
+#include "ImageAllocatorsNPP.h"
+#include <cuda_runtime.h>
+
+namespace npp
+{
+    // forward declaration
+    template<typename D, unsigned int N, class A> class ImageCPU;
+
+    template<typename D, unsigned int N>
+    class ImageNPP: public npp::ImagePacked<D, N, npp::ImageAllocator<D, N> >
+    {
+        public:
+            ImageNPP()
+            {
+                ;
+            }
+
+            ImageNPP(unsigned int nWidth, unsigned int nHeight, bool bTight = false): ImagePacked<D, N, npp::ImageAllocator<D, N> >(nWidth, nHeight, bTight)
+            {
+                ;
+            }
+
+            ImageNPP(const npp::Image::Size &rSize): ImagePacked<D, N, npp::ImageAllocator<D, N> >(rSize)
+            {
+                ;
+            }
+
+            ImageNPP(const ImageNPP<D, N> &rImage): ImagePacked<D, N, npp::ImageAllocator<D, N> >(rImage)
+            {
+                ;
+            }
+
+            template<class X>
+            explicit
+            ImageNPP(const ImageCPU<D, N, X> &rImage, bool bTight = false): ImagePacked<D, N, npp::ImageAllocator<D, N> >(rImage.width(), rImage.height(), bTight)
+            {
+                npp::ImageAllocator<D, N>::HostToDeviceCopy2D(ImagePacked<D, N, npp::ImageAllocator<D, N> >::data(),
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::pitch(),
+                                                              rImage.data(),
+                                                              rImage.pitch(),
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::width(),
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::height());
+            }
+
+            virtual
+            ~ImageNPP()
+            {
+                ;
+            }
+
+            ImageNPP &
+            operator= (const ImageNPP<D, N> &rImage)
+            {
+                ImagePacked<D, N, npp::ImageAllocator<D, N> >::operator= (rImage);
+
+                return *this;
+            }
+
+            void
+            copyTo(D *pData, unsigned int nPitch)
+            const
+            {
+                NPP_ASSERT((ImagePacked<D, N, npp::ImageAllocator<D, N> >::width() * sizeof(npp::Pixel<D, N>) <= nPitch));
+                npp::ImageAllocator<D, N>::DeviceToHostCopy2D(pData,
+                                                              nPitch,
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::data(),
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::pitch(),
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::width(),
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::height());
+            }
+
+            void
+            copyFrom(D *pData, unsigned int nPitch)
+            {
+                NPP_ASSERT((ImagePacked<D, N, npp::ImageAllocator<D, N> >::width() * sizeof(npp::Pixel<D, N>) <= nPitch));
+                npp::ImageAllocator<D, N>::HostToDeviceCopy2D(ImagePacked<D, N, npp::ImageAllocator<D, N> >::data(),
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::pitch(),
+                                                              pData,
+                                                              nPitch,
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::width(),
+                                                              ImagePacked<D, N, npp::ImageAllocator<D, N> >::height());
+            }
+    };
+
+    typedef ImageNPP<Npp8u,  1>   ImageNPP_8u_C1;
+    typedef ImageNPP<Npp8u,  2>   ImageNPP_8u_C2;
+    typedef ImageNPP<Npp8u,  3>   ImageNPP_8u_C3;
+    typedef ImageNPP<Npp8u,  4>   ImageNPP_8u_C4;
+
+    typedef ImageNPP<Npp16u, 1>  ImageNPP_16u_C1;
+    typedef ImageNPP<Npp16u, 2>  ImageNPP_16u_C2;
+    typedef ImageNPP<Npp16u, 3>  ImageNPP_16u_C3;
+    typedef ImageNPP<Npp16u, 4>  ImageNPP_16u_C4;
+
+    typedef ImageNPP<Npp16s, 1>  ImageNPP_16s_C1;
+    typedef ImageNPP<Npp16s, 3>  ImageNPP_16s_C3;
+    typedef ImageNPP<Npp16s, 4>  ImageNPP_16s_C4;
+
+    typedef ImageNPP<Npp32s, 1>  ImageNPP_32s_C1;
+    typedef ImageNPP<Npp32s, 3>  ImageNPP_32s_C3;
+    typedef ImageNPP<Npp32s, 4>  ImageNPP_32s_C4;
+
+    typedef ImageNPP<Npp32f, 1>  ImageNPP_32f_C1;
+    typedef ImageNPP<Npp32f, 2>  ImageNPP_32f_C2;
+    typedef ImageNPP<Npp32f, 3>  ImageNPP_32f_C3;
+    typedef ImageNPP<Npp32f, 4>  ImageNPP_32f_C4;
+
+    typedef ImageNPP<Npp64f, 1>  ImageNPP_64f_C1;
+    typedef ImageNPP<Npp64f, 2>  ImageNPP_64f_C2;
+    typedef ImageNPP<Npp64f, 3>  ImageNPP_64f_C3;
+    typedef ImageNPP<Npp64f, 4>  ImageNPP_64f_C4;
+
+} // npp namespace
+
+#endif // NV_UTIL_NPP_IMAGES_NPP_H
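
A minimal host/device round trip with the ImageCPU/ImageNPP pair above, as a sketch only; it assumes the 8u typedefs declared earlier in ImagesCPU.h and a linked CUDA/NPP toolkit:

    #include "ImagesCPU.h"
    #include "ImagesNPP.h"

    void imageRoundTrip()
    {
        npp::ImageCPU_8u_C1 oHost(640, 480);          // host-side image
        // ... fill the image through oHost.data() / oHost.pitch() ...
        npp::ImageNPP_8u_C1 oDevice(oHost);           // allocates device memory, runs HostToDeviceCopy2D
        oDevice.copyTo(oHost.data(), oHost.pitch());  // DeviceToHostCopy2D back into the host buffer
    }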

+ 126 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Pixel.h

@@ -0,0 +1,126 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef NV_UTIL_PIXEL_H
+#define NV_UTIL_PIXEL_H
+
+#include "Exceptions.h"
+
+namespace npp
+{
+    template <typename D, size_t N>
+    struct Pixel
+    { };
+
+    template <typename D>
+    struct Pixel<D, 1>
+    {
+        D x;
+
+        const D &
+        operator[](size_t iChannel)
+        const
+        {
+            NPP_ASSERT(iChannel < 1);
+            return (&x)[iChannel];
+        }
+
+        D &
+        operator[](size_t iChannel)
+        {
+            NPP_ASSERT(iChannel < 1);
+            return (&x)[iChannel];
+        }
+    };
+
+    template <typename D>
+    struct Pixel<D, 2>
+    {
+        D x,y;
+
+        const D &
+        operator[](size_t iChannel)
+        const
+        {
+            NPP_ASSERT(iChannel < 2);
+            return (&x)[iChannel];
+        }
+
+        D &
+        operator[](size_t iChannel)
+        {
+            NPP_ASSERT(iChannel < 2);
+            return (&x)[iChannel];
+        }
+    };
+
+    template <typename D>
+    struct Pixel<D, 3>
+    {
+        D x,y,z;
+
+        const D &
+        operator[](size_t iChannel)
+        const
+        {
+            NPP_ASSERT(iChannel < 3);
+            return (&x)[iChannel];
+        }
+
+        D &
+        operator[](size_t iChannel)
+        {
+            NPP_ASSERT(iChannel < 3);
+            return (&x)[iChannel];
+        }
+    };
+
+    template <typename D>
+    struct Pixel<D, 4>
+    {
+        D x, y, z, w;
+
+        const D &
+        operator[](size_t iChannel)
+        const
+        {
+            NPP_ASSERT(iChannel < 4);
+            return (&x)[iChannel];
+        }
+
+        D &
+        operator[](size_t iChannel)
+        {
+            NPP_ASSERT(iChannel < 4);
+            return (&x)[iChannel];
+        }
+    };
+
+} // npp namespace
+
+#endif // NV_UTIL_PIXEL_H
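
The operator[] overloads above index the named members as a contiguous array starting at x, with NPP_ASSERT guarding the channel bound. A quick illustration, assuming NPP's Npp8u type is in scope:

    npp::Pixel<Npp8u, 3> rgb;
    rgb.x = 255; rgb.y = 128; rgb.z = 0;
    Npp8u green = rgb[1];   // reads (&rgb.x)[1], i.e. rgb.y
    rgb[2] = 64;            // writes rgb.z; rgb[3] would trip NPP_ASSERT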

+ 168 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/Signal.h

@@ -0,0 +1,168 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef NV_UTIL_NPP_SIGNAL_H
+#define NV_UTIL_NPP_SIGNAL_H
+
+#include <cstring>
+
+namespace npp
+{
+    class Signal
+    {
+        public:
+            Signal() : nSize_(0)
+            { };
+
+            explicit
+            Signal(size_t nSize) : nSize_(nSize)
+            { };
+
+            Signal(const Signal &rSignal) : nSize_(rSignal.nSize_)
+            { };
+
+            virtual
+            ~Signal()
+            { }
+
+            Signal &
+            operator= (const Signal &rSignal)
+            {
+                nSize_ = rSignal.nSize_;
+                return *this;
+            }
+
+            size_t
+            size()
+            const
+            {
+                return nSize_;
+            }
+
+            void
+            swap(Signal &rSignal)
+            {
+                size_t nTemp = nSize_;
+                nSize_ = rSignal.nSize_;
+                rSignal.nSize_ = nTemp;
+            }
+
+
+        private:
+            size_t nSize_;
+    };
+
+    template<typename D, class A>
+    class SignalTemplate: public Signal
+    {
+        public:
+            typedef D tData;
+
+            SignalTemplate(): aValues_(0)
+            {
+                ;
+            }
+
+            SignalTemplate(size_t nSize): Signal(nSize)
+                , aValues_(0)
+            {
+                aValues_ = A::Malloc1D(size());
+            }
+
+            SignalTemplate(const SignalTemplate<D, A> &rSignal): Signal(rSignal)
+                , aValues_(0)
+            {
+                aValues_ = A::Malloc1D(size());
+                A::Copy1D(aValues_, rSignal.values(), size());
+            }
+
+            virtual
+            ~SignalTemplate()
+            {
+                A::Free1D(aValues_);
+            }
+
+            SignalTemplate &
+            operator= (const SignalTemplate<D, A> &rSignal)
+            {
+                // in case of self-assignment
+                if (&rSignal == this)
+                {
+                    return *this;
+                }
+
+                A::Free1D(aValues_);
+                aValues_ = 0;
+
+                // assign parent class's data field (nSize_)
+                Signal::operator =(rSignal);
+
+                aValues_ = A::Malloc1D(size());
+                A::Copy1D(aValues_, rSignal.values(), size());
+
+                return *this;
+            }
+
+            /// Get a pointer to the value array.
+            ///     The result pointer can be offset by i elements;
+            /// even negative offsets are allowed.
+            /// \param i Array offset.
+            /// \return Pointer to the i-th element of the value array.
+            tData *
+            values(int i = 0)
+            {
+                return aValues_ + i;
+            }
+
+            const
+            tData *
+            values(int i = 0)
+            const
+            {
+                return aValues_ + i;
+            }
+
+            void
+            swap(SignalTemplate<D, A> &rSignal)
+            {
+                Signal::swap(rSignal);
+
+                tData *aTemp       = this->aValues_;
+                this->aValues_      = rSignal.aValues_;
+                rSignal.aValues_    = aTemp;
+            }
+
+        private:
+            D *aValues_;
+    };
+
+} // npp namespace
+
+
+#endif // NV_UTIL_NPP_SIGNAL_H
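
SignalTemplate only requires its allocator parameter A to expose three statics: Malloc1D, Free1D, and Copy1D. Any type with that shape plugs in; the following minimal allocator is hypothetical, for illustration only (SignalAllocatorCPU in the next file is the real CPU one):

    #include <cstring>

    template <typename D>
    struct PlainAllocator  // hypothetical, illustration only
    {
        static D   *Malloc1D(size_t n)                     { return new D[n]; }
        static void Free1D(D *p)                           { delete[] p; }
        static void Copy1D(D *dst, const D *src, size_t n) { std::memcpy(dst, src, n * sizeof(D)); }
    };
    // usage: npp::SignalTemplate<float, PlainAllocator<float> > sig(256);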

+ 66 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalAllocatorsCPU.h

@@ -0,0 +1,66 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H
+#define NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H
+
+#include "Exceptions.h"
+
+namespace npp
+{
+
+    template <typename D>
+    class SignalAllocatorCPU
+    {
+        public:
+            static
+            D *
+            Malloc1D(size_t nSize)
+            {
+                return new D[nSize];
+            };
+
+            static
+            void
+            Free1D(D *pPixels)
+            {
+                delete[] pPixels;
+            };
+
+            static
+            void
+            Copy1D(D *pDst, const D *pSrc, size_t nSize)
+            {
+                std::memcpy(pDst, pSrc, nSize * sizeof(D));
+            };
+
+    };
+
+} // npp namespace
+
+#endif // NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H

+ 684 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalAllocatorsNPP.h

@@ -0,0 +1,684 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef NV_UTIL_NPP_SIGNAL_ALLOCATORS_NPP_H
+#define NV_UTIL_NPP_SIGNAL_ALLOCATORS_NPP_H
+
+
+#include "Exceptions.h"
+
+#include <npps.h>
+#include <cuda_runtime.h>
+
+namespace npp
+{
+
+    template <typename D>
+    class SignalAllocator
+    {
+    };
+
+    template<>
+    class SignalAllocator<Npp8u>
+    {
+        public:
+            static
+            Npp8u *
+            Malloc1D(size_t nSize)
+            {
+                Npp8u *pResult = nppsMalloc_8u(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp8u *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp8u *pDst, const Npp8u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp8u),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp8u *pDst, const Npp8u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp8u), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp8u *pDst, const Npp8u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp8u), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp16s>
+    {
+        public:
+            static
+            Npp16s *
+            Malloc1D(size_t nSize)
+            {
+                Npp16s *pResult = nppsMalloc_16s(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp16s *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp16s *pDst, const Npp16s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16s),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp16s *pDst, const Npp16s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16s), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp16s *pDst, const Npp16s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16s), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp16u>
+    {
+        public:
+            static
+            Npp16u *
+            Malloc1D(size_t nSize)
+            {
+                Npp16u *pResult = nppsMalloc_16u(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp16u *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp16u *pDst, const Npp16u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16u),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp16u *pDst, const Npp16u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16u), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp16u *pDst, const Npp16u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16u), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp16sc>
+    {
+        public:
+            static
+            Npp16sc *
+            Malloc1D(size_t nSize)
+            {
+                Npp16sc *pResult = nppsMalloc_16sc(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp16sc *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp16sc *pDst, const Npp16sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16sc),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp16sc *pDst, const Npp16sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16sc), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp16sc *pDst, const Npp16sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp16sc), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp32u>
+    {
+        public:
+            static
+            Npp32u *
+            Malloc1D(size_t nSize)
+            {
+                Npp32u *pResult = nppsMalloc_32u(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp32u *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp32u *pDst, const Npp32u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32u),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp32u *pDst, const Npp32u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32u), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp32u *pDst, const Npp32u *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32u), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp32s>
+    {
+        public:
+            static
+            Npp32s *
+            Malloc1D(size_t nSize)
+            {
+                Npp32s *pResult = nppsMalloc_32s(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp32s *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp32s *pDst, const Npp32s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32s),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp32s *pDst, const Npp32s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32s), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp32s *pDst, const Npp32s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32s), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp32sc>
+    {
+        public:
+            static
+            Npp32sc *
+            Malloc1D(size_t nSize)
+            {
+                Npp32sc *pResult = nppsMalloc_32sc(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp32sc *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp32sc *pDst, const Npp32sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32sc),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp32sc *pDst, const Npp32sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32sc), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp32sc *pDst, const Npp32sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32sc), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp32f>
+    {
+        public:
+            static
+            Npp32f *
+            Malloc1D(size_t nSize)
+            {
+                Npp32f *pResult = nppsMalloc_32f(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp32f *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp32f *pDst, const Npp32f *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32f),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp32f *pDst, const Npp32f *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32f), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp32f *pDst, const Npp32f *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32f), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp32fc>
+    {
+        public:
+            static
+            Npp32fc *
+            Malloc1D(size_t nSize)
+            {
+                Npp32fc *pResult = nppsMalloc_32fc(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp32fc *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp32fc *pDst, const Npp32fc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32fc),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp32fc *pDst, const Npp32fc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32fc), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp32fc *pDst, const Npp32fc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp32fc), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp64s>
+    {
+        public:
+            static
+            Npp64s *
+            Malloc1D(size_t nSize)
+            {
+                Npp64s *pResult = nppsMalloc_64s(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp64s *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp64s *pDst, const Npp64s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64s),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp64s *pDst, const Npp64s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64s), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp64s *pDst, const Npp64s *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64s), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp64sc>
+    {
+        public:
+            static
+            Npp64sc *
+            Malloc1D(size_t nSize)
+            {
+                Npp64sc *pResult = nppsMalloc_64sc(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp64sc *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp64sc *pDst, const Npp64sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64sc),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp64sc *pDst, const Npp64sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64sc), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp64sc *pDst, const Npp64sc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64sc), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp64f>
+    {
+        public:
+            static
+            Npp64f *
+            Malloc1D(size_t nSize)
+            {
+                Npp64f *pResult = nppsMalloc_64f(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp64f *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp64f *pDst, const Npp64f *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64f),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp64f *pDst, const Npp64f *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64f), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp64f *pDst, const Npp64f *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64f), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+
+    template<>
+    class SignalAllocator<Npp64fc>
+    {
+        public:
+            static
+            Npp64fc *
+            Malloc1D(size_t nSize)
+            {
+                Npp64fc *pResult = nppsMalloc_64fc(static_cast<int>(nSize));
+                NPP_ASSERT(pResult != 0);
+
+                return pResult;
+            };
+
+            static
+            void
+            Free1D(Npp64fc *pValues)
+            {
+                nppsFree(pValues);
+            };
+
+            static
+            void
+            Copy1D(Npp64fc *pDst, const Npp64fc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64fc),cudaMemcpyDeviceToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            HostToDeviceCopy1D(Npp64fc *pDst, const Npp64fc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64fc), cudaMemcpyHostToDevice);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+
+            static
+            void
+            DeviceToHostCopy1D(Npp64fc *pDst, const Npp64fc *pSrc, size_t nSize)
+            {
+                cudaError_t eResult;
+                eResult = cudaMemcpy(pDst, pSrc, nSize * sizeof(Npp64fc), cudaMemcpyDeviceToHost);
+                NPP_ASSERT(cudaSuccess == eResult);
+            };
+    };
+} // npp namespace
+
+#endif // NV_UTIL_NPP_SIGNAL_ALLOCATORS_NPP_H
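
Every specialization above follows the same pattern: nppsMalloc_<type> for device allocation, nppsFree to release, and cudaMemcpy with the matching direction flag for the three copies. Used directly (normally this happens through SignalNPP in the next file), it looks like this sketch:

    Npp32f host[1024] = {};
    Npp32f *pDev = npp::SignalAllocator<Npp32f>::Malloc1D(1024);      // nppsMalloc_32f
    npp::SignalAllocator<Npp32f>::HostToDeviceCopy1D(pDev, host, 1024);
    npp::SignalAllocator<Npp32f>::DeviceToHostCopy1D(host, pDev, 1024);
    npp::SignalAllocator<Npp32f>::Free1D(pDev);                       // nppsFree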

+ 107 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalsCPU.h

@@ -0,0 +1,107 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef NV_UTIL_NPP_SIGNALS_CPU_H
+#define NV_UTIL_NPP_SIGNALS_CPU_H
+
+#include "Signal.h"
+
+#include "SignalAllocatorsCPU.h"
+#include "Exceptions.h"
+
+#include <npp.h>
+
+
+namespace npp
+{
+
+    template<typename D, class A>
+    class SignalCPU: public npp::SignalTemplate<D, A>
+    {
+        public:
+            typedef typename npp::SignalTemplate<D, A>::tData tData;
+
+            SignalCPU()
+            {
+                ;
+            }
+
+            SignalCPU(size_t nSize): SignalTemplate<D, A>(nSize)
+            {
+                ;
+            }
+
+            SignalCPU(const SignalCPU<D, A> &rSignal): SignalTemplate<D, A>(rSignal)
+            {
+                ;
+            }
+
+            virtual
+            ~SignalCPU()
+            {
+                ;
+            }
+
+            SignalCPU &
+            operator= (const SignalCPU<D,A> &rSignal)
+            {
+                SignalTemplate<D, A>::operator= (rSignal);
+
+                return *this;
+            }
+
+            tData &
+            operator [](unsigned int i)
+            {
+                return *SignalTemplate<D, A>::values(i);
+            }
+
+            tData
+            operator [](unsigned int i)
+            const
+            {
+                return *SignalTemplate<D, A>::values(i);
+            }
+
+    };
+
+    typedef SignalCPU<Npp8u,   npp::SignalAllocatorCPU<Npp8u>   >   SignalCPU_8u;
+    typedef SignalCPU<Npp32s,  npp::SignalAllocatorCPU<Npp32s>  >   SignalCPU_32s;
+    typedef SignalCPU<Npp16s,  npp::SignalAllocatorCPU<Npp16s>  >   SignalCPU_16s;
+    typedef SignalCPU<Npp16sc, npp::SignalAllocatorCPU<Npp16sc> >   SignalCPU_16sc;
+    typedef SignalCPU<Npp32sc, npp::SignalAllocatorCPU<Npp32sc> >   SignalCPU_32sc;
+    typedef SignalCPU<Npp32f,  npp::SignalAllocatorCPU<Npp32f>  >   SignalCPU_32f;
+    typedef SignalCPU<Npp32fc, npp::SignalAllocatorCPU<Npp32fc> >   SignalCPU_32fc;
+    typedef SignalCPU<Npp64s,  npp::SignalAllocatorCPU<Npp64s>  >   SignalCPU_64s;
+    typedef SignalCPU<Npp64sc, npp::SignalAllocatorCPU<Npp64sc> >   SignalCPU_64sc;
+    typedef SignalCPU<Npp64f,  npp::SignalAllocatorCPU<Npp64f>  >   SignalCPU_64f;
+    typedef SignalCPU<Npp64fc, npp::SignalAllocatorCPU<Npp64fc> >   SignalCPU_64fc;
+
+} // npp namespace
+
+#endif // NV_UTIL_NPP_SIGNALS_CPU_H

+ 113 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/UtilNPP/SignalsNPP.h

@@ -0,0 +1,113 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef NV_UTIL_NPP_SIGNALS_NPP_H
+#define NV_UTIL_NPP_SIGNALS_NPP_H
+
+#include "Exceptions.h"
+#include "Signal.h"
+
+#include "SignalAllocatorsNPP.h"
+#include <cuda_runtime.h>
+
+namespace npp
+{
+    // forward declaration
+    template<typename D, class A> class SignalCPU;
+
+    template<typename D>
+    class SignalNPP: public npp::SignalTemplate<D, npp::SignalAllocator<D> >
+    {
+        public:
+            SignalNPP()
+            {
+                ;
+            }
+
+            explicit
+            SignalNPP(size_t nSize): SignalTemplate<D, npp::SignalAllocator<D> >(nSize)
+            {
+                ;
+            }
+
+            SignalNPP(const SignalNPP<D> &rSignal): SignalTemplate<D, npp::SignalAllocator<D> >(rSignal)
+            {
+                ;
+            }
+
+            template<class X>
+            explicit
+            SignalNPP(const SignalCPU<D, X> &rSignal): SignalTemplate<D, npp::SignalAllocator<D> >(rSignal.size())
+            {
+                npp::SignalAllocator<D>::HostToDeviceCopy1D(SignalTemplate<D, npp::SignalAllocator<D> >::values(),
+                                                            rSignal.values(), SignalTemplate<D, npp::SignalAllocator<D> >::size());
+            }
+
+            virtual
+            ~SignalNPP()
+            {
+                ;
+            }
+
+            SignalNPP &
+            operator= (const SignalNPP<D> &rSignal)
+            {
+                SignalTemplate<D, npp::SignalAllocator<D> >::operator= (rSignal);
+
+                return *this;
+            }
+
+            void
+            copyTo(D *pValues)
+            const
+            {
+                npp::SignalAllocator<D>::DeviceToHostCopy1D(pValues, SignalTemplate<D, npp::SignalAllocator<D> >::values(), SignalTemplate<D, npp::SignalAllocator<D> >::size());
+            }
+
+            void
+            copyFrom(D *pValues)
+            {
+                npp::SignalAllocator<D>::HostToDeviceCopy1D(SignalTemplate<D, npp::SignalAllocator<D> >::values(), pValues, SignalTemplate<D, npp::SignalAllocator<D> >::size());
+            }
+    };
+
+    typedef SignalNPP<Npp8u>    SignalNPP_8u;
+    typedef SignalNPP<Npp16s>   SignalNPP_16s;
+    typedef SignalNPP<Npp16sc>  SignalNPP_16sc;
+    typedef SignalNPP<Npp32s>   SignalNPP_32s;
+    typedef SignalNPP<Npp32sc>  SignalNPP_32sc;
+    typedef SignalNPP<Npp32f>   SignalNPP_32f;
+    typedef SignalNPP<Npp32fc>  SignalNPP_32fc;
+    typedef SignalNPP<Npp64s>   SignalNPP_64s;
+    typedef SignalNPP<Npp64sc>  SignalNPP_64sc;
+    typedef SignalNPP<Npp64f>   SignalNPP_64f;
+    typedef SignalNPP<Npp64fc>  SignalNPP_64fc;
+
+} // npp namespace
+
+#endif // NV_UTIL_NPP_SIGNALS_NPP_H
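
SignalsCPU.h and SignalsNPP.h together give the 1D analogue of the ImageCPU/ImageNPP round trip; a minimal sketch:

    npp::SignalCPU_32f hostSig(1024);                 // host buffer (SignalsCPU.h)
    for (unsigned int i = 0; i < hostSig.size(); ++i)
        hostSig[i] = static_cast<Npp32f>(i);
    npp::SignalNPP_32f devSig(hostSig);               // HostToDeviceCopy1D on construction
    devSig.copyTo(hostSig.values());                  // DeviceToHostCopy1D back to the host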

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/CT_skull_512x512_8u_Gray.raw


File diff suppressed because it is too large
+ 1 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Lena_512x512_8u_Gray.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB2_1024x683_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_1280x720_8u.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/PCB_METAL_509x335_8u.raw


File diff suppressed because it is too large
+ 285 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/Rocks_512x512_8u_Gray.raw


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/data/lena_512x512_8u.raw


+ 470 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/drvapi_error_string.h

@@ -0,0 +1,470 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_DRVAPI_ERROR_STRING_H_
+#define COMMON_DRVAPI_ERROR_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __cuda_cuda_h__  // check to see if CUDA_H is included above
+
+// Error Code string definitions here
+typedef struct {
+  char const *error_string;
+  int error_id;
+} s_CudaErrorStr;
+
+/**
+ * Error codes
+ */
+static s_CudaErrorStr sCudaDrvErrorString[] = {
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * can also mean that the operation being queried is complete (see
+     * ::cuEventQuery() and ::cuStreamQuery()).
+     */
+    {"CUDA_SUCCESS", 0},
+
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    {"CUDA_ERROR_INVALID_VALUE", 1},
+
+    /**
+     * The API call failed because it was unable to allocate enough memory to
+     * perform the requested operation.
+     */
+    {"CUDA_ERROR_OUT_OF_MEMORY", 2},
+
+    /**
+     * This indicates that the CUDA driver has not been initialized with
+     * ::cuInit() or that initialization has failed.
+     */
+    {"CUDA_ERROR_NOT_INITIALIZED", 3},
+
+    /**
+     * This indicates that the CUDA driver is in the process of shutting down.
+     */
+    {"CUDA_ERROR_DEINITIALIZED", 4},
+
+    /**
+     * This indicates profiling APIs are called while application is running
+     * in visual profiler mode.
+     */
+    {"CUDA_ERROR_PROFILER_DISABLED", 5},
+    /**
+     * This indicates profiling has not been initialized for this context.
+     * Call cuProfilerInitialize() to resolve this.
+     */
+    {"CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6},
+    /**
+     * This indicates profiler has already been started and probably
+     * cuProfilerStart() is incorrectly called.
+     */
+    {"CUDA_ERROR_PROFILER_ALREADY_STARTED", 7},
+    /**
+     * This indicates profiler has already been stopped and probably
+     * cuProfilerStop() is incorrectly called.
+     */
+    {"CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8},
+    /**
+     * This indicates that no CUDA-capable devices were detected by the
+     * installed CUDA driver.
+     */
+    {"CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100},
+
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device.
+     */
+    {"CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)",
+     101},
+
+    /**
+     * This indicates that the device kernel image is invalid. This can also
+     * indicate an invalid CUDA module.
+     */
+    {"CUDA_ERROR_INVALID_IMAGE", 200},
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     */
+    {"CUDA_ERROR_INVALID_CONTEXT", 201},
+
+    /**
+     * This indicated that the context being supplied as a parameter to the
+     * API call was already the active context.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.2. It is no longer an
+     * error to attempt to push the active context via ::cuCtxPushCurrent().
+     */
+    {"CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202},
+
+    /**
+     * This indicates that a map or register operation has failed.
+     */
+    {"CUDA_ERROR_MAP_FAILED", 205},
+
+    /**
+     * This indicates that an unmap or unregister operation has failed.
+     */
+    {"CUDA_ERROR_UNMAP_FAILED", 206},
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    {"CUDA_ERROR_ARRAY_IS_MAPPED", 207},
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    {"CUDA_ERROR_ALREADY_MAPPED", 208},
+
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    {"CUDA_ERROR_NO_BINARY_FOR_GPU", 209},
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    {"CUDA_ERROR_ALREADY_ACQUIRED", 210},
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    {"CUDA_ERROR_NOT_MAPPED", 211},
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    {"CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212},
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    {"CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213},
+
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    {"CUDA_ERROR_ECC_UNCORRECTABLE", 214},
+
+    /**
+     * This indicates that the ::CUlimit passed to the API call is not
+     * supported by the active device.
+     */
+    {"CUDA_ERROR_UNSUPPORTED_LIMIT", 215},
+
+    /**
+     * This indicates that the ::CUcontext passed to the API call can
+     * only be bound to a single CPU thread at a time but is already
+     * bound to a CPU thread.
+     */
+    {"CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216},
+
+    /**
+     * This indicates that peer access is not supported across the given
+     * devices.
+     */
+    {"CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
+
+    /**
+     * This indicates that a PTX JIT compilation failed.
+     */
+    {"CUDA_ERROR_INVALID_PTX", 218},
+
+    /**
+     * This indicates an error with OpenGL or DirectX context.
+     */
+    {"CUDA_ERROR_INVALID_GRAPHICS_CONTEXT", 219},
+
+    /**
+     * This indicates that an uncorrectable NVLink error was detected during the
+     * execution.
+     */
+    {"CUDA_ERROR_NVLINK_UNCORRECTABLE", 220},
+
+    /**
+     * This indicates that the PTX JIT compiler library was not found.
+     */
+    {"CUDA_ERROR_JIT_COMPILER_NOT_FOUND", 221},
+
+    /**
+     * This indicates that the device kernel source is invalid.
+     */
+    {"CUDA_ERROR_INVALID_SOURCE", 300},
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    {"CUDA_ERROR_FILE_NOT_FOUND", 301},
+
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    {"CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302},
+
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    {"CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303},
+
+    /**
+     * This indicates that an OS call failed.
+     */
+    {"CUDA_ERROR_OPERATING_SYSTEM", 304},
+
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
+     */
+    {"CUDA_ERROR_INVALID_HANDLE", 400},
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, texture names, and surface names.
+     */
+    {"CUDA_ERROR_NOT_FOUND", 500},
+
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be
+     * indicated differently than ::CUDA_SUCCESS (which indicates completion).
+     * Calls that may return this value include ::cuEventQuery() and
+     * ::cuStreamQuery().
+     */
+    {"CUDA_ERROR_NOT_READY", 600},
+
+    /**
+     * While executing a kernel, the device encountered a
+     * load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must
+     * be terminated and relaunched.
+     */
+    {"CUDA_ERROR_ILLEGAL_ADDRESS", 700},
+
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. This error usually indicates that the user has
+     * attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register
+     * count. Passing arguments of the wrong size (e.g. a 64-bit pointer
+     * when a 32-bit int is expected) is equivalent to passing too many
+     * arguments and can also result in this error.
+     */
+    {"CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701},
+
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device attribute
+     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
+     * context cannot be used (and must be destroyed similar to
+     * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
+     * this context are invalid and must be reconstructed if the program is to
+     * continue using CUDA.
+     */
+    {"CUDA_ERROR_LAUNCH_TIMEOUT", 702},
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    {"CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703},
+
+    /**
+     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
+     * trying to re-enable peer access to a context which has already
+     * had peer access to it enabled.
+     */
+    {"CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704},
+
+    /**
+     * This error indicates that ::cuCtxDisablePeerAccess() is
+     * trying to disable peer access which has not been enabled yet
+     * via ::cuCtxEnablePeerAccess().
+     */
+    {"CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705},
+
+    /**
+     * This error indicates that the primary context for the specified device
+     * has already been initialized.
+     */
+    {"CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708},
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy(), or is a primary context which
+     * has not yet been initialized.
+     */
+    {"CUDA_ERROR_CONTEXT_IS_DESTROYED", 709},
+
+    /**
+     * A device-side assert triggered during kernel execution. The context
+     * cannot be used anymore, and must be destroyed. All existing device
+     * memory allocations from this context are invalid and must be
+     * reconstructed if the program is to continue using CUDA.
+     */
+    {"CUDA_ERROR_ASSERT", 710},
+
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices
+     * passed to ::cuCtxEnablePeerAccess().
+     */
+    {"CUDA_ERROR_TOO_MANY_PEERS", 711},
+
+    /**
+     * This error indicates that the memory range passed to
+     * ::cuMemHostRegister() has already been registered.
+     */
+    {"CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712},
+
+    /**
+     * This error indicates that the pointer passed to ::cuMemHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    {"CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713},
+
+    /**
+     * While executing a kernel, the device encountered a stack error.
+     * This can be due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must
+     * be terminated and relaunched.
+     */
+    {"CUDA_ERROR_HARDWARE_STACK_ERROR", 714},
+
+    /**
+     * While executing a kernel, the device encountered an illegal instruction.
+     * This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must
+     * be terminated and relaunched.
+     */
+    {"CUDA_ERROR_ILLEGAL_INSTRUCTION", 715},
+
+    /**
+     * While executing a kernel, the device encountered a load or store
+     * instruction on a memory address which is not aligned. This leaves the
+     * process in an inconsistent state and any further CUDA work will return
+     * the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    {"CUDA_ERROR_MISALIGNED_ADDRESS", 716},
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must
+     * be terminated and relaunched.
+     */
+    {"CUDA_ERROR_INVALID_ADDRESS_SPACE", 717},
+
+    /**
+     * While executing a kernel, the device program counter wrapped its address
+     * space. This leaves the process in an inconsistent state and any further
+     * CUDA work will return the same error. To continue using CUDA, the process
+     * must be terminated and relaunched.
+     */
+    {"CUDA_ERROR_INVALID_PC", 718},
+
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. The context cannot be used, so it must
+     * be destroyed (and a new one should be created). All existing device
+     * memory allocations from this context are invalid and must be
+     * reconstructed if the program is to continue using CUDA.
+     */
+    {"CUDA_ERROR_LAUNCH_FAILED", 719},
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a
+     * kernel that was launched via either ::cuLaunchCooperativeKernel or
+     * ::cuLaunchCooperativeKernelMultiDevice exceeds the maximum number of
+     * blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor or
+     * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number
+     * of multiprocessors as specified by the device attribute
+     * ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+     */
+    {"CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE", 720},
+
+    /**
+     * This error indicates that the attempted operation is not permitted.
+     */
+    {"CUDA_ERROR_NOT_PERMITTED", 800},
+
+    /**
+     * This error indicates that the attempted operation is not supported
+     * on the current system or device.
+     */
+    {"CUDA_ERROR_NOT_SUPPORTED", 801},
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    {"CUDA_ERROR_UNKNOWN", 999},
+    {NULL, -1}};
+
+// This is just a linear search through the array, since the error_id values
+// are not always consecutive.
+inline const char *getCudaDrvErrorString(CUresult error_id) {
+  int index = 0;
+
+  while (sCudaDrvErrorString[index].error_id != error_id &&
+         sCudaDrvErrorString[index].error_id != -1) {
+    index++;
+  }
+
+  if (sCudaDrvErrorString[index].error_id == error_id)
+    return (const char *)sCudaDrvErrorString[index].error_string;
+  else
+    return (const char *)"CUDA_ERROR not found!";
+}
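+
+// Illustrative usage (an editor's sketch, not part of the original sample;
+// assumes a driver-API call site where cuda.h is included):
+//   CUresult status = cuCtxSynchronize();
+//   if (status != CUDA_SUCCESS)
+//     fprintf(stderr, "Driver error: %s\n", getCudaDrvErrorString(status));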
+
+#endif  // __cuda_cuda_h__
+
+#endif  //  COMMON_DRVAPI_ERROR_STRING_H_

+ 160 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/dynlink_d3d11.h

@@ -0,0 +1,160 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//--------------------------------------------------------------------------------------
+// File: dynlink_d3d11.h
+//
+// Shortcut macros and functions for using DX objects
+//
+// Copyright (c) Microsoft Corporation. All rights reserved
+//--------------------------------------------------------------------------------------
+
+#ifndef _DYNLINK_D3D11_H_
+#define _DYNLINK_D3D11_H_
+
+// Standard Windows includes
+#include <windows.h>
+#include <initguid.h>
+#include <assert.h>
+#include <wchar.h>
+#include <mmsystem.h>
+#include <commctrl.h> // for InitCommonControls() 
+#include <shellapi.h> // for ExtractIcon()
+#include <new.h>      // for placement new
+#include <shlobj.h>
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+
+// CRT's memory leak detection
+#if defined(DEBUG) || defined(_DEBUG)
+#include <crtdbg.h>
+#endif
+
+// Direct3D11 includes
+#include <dxgi.h>
+#include <d3d11.h>
+// #include <..\Samples\C++\Effects11\Inc\d3dx11effect.h>
+
+// XInput includes
+#include <xinput.h>
+
+// strsafe.h deprecates old insecure string functions.  If you
+// really do not want it to (not recommended), uncomment the next line
+//#define STRSAFE_NO_DEPRECATE
+
+#ifndef STRSAFE_NO_DEPRECATE
+#pragma deprecated("strncpy")
+#pragma deprecated("wcsncpy")
+#pragma deprecated("_tcsncpy")
+#pragma deprecated("wcsncat")
+#pragma deprecated("strncat")
+#pragma deprecated("_tcsncat")
+#endif
+
+#pragma warning( disable : 4996 ) // disable deprecated warning 
+#include <strsafe.h>
+#pragma warning( default : 4996 )
+
+typedef HRESULT(WINAPI *LPCREATEDXGIFACTORY)(REFIID, void **);
+typedef HRESULT(WINAPI *LPD3D11CREATEDEVICEANDSWAPCHAIN)(__in_opt IDXGIAdapter *pAdapter, D3D_DRIVER_TYPE DriverType, HMODULE Software, UINT Flags, __in_ecount_opt(FeatureLevels) CONST D3D_FEATURE_LEVEL *pFeatureLevels, UINT FeatureLevels, UINT SDKVersion, __in_opt CONST DXGI_SWAP_CHAIN_DESC *pSwapChainDesc, __out_opt IDXGISwapChain **ppSwapChain, __out_opt ID3D11Device **ppDevice, __out_opt D3D_FEATURE_LEVEL *pFeatureLevel, __out_opt ID3D11DeviceContext **ppImmediateContext);
+typedef HRESULT(WINAPI *LPD3D11CREATEDEVICE)(IDXGIAdapter *, D3D_DRIVER_TYPE, HMODULE, UINT32, D3D_FEATURE_LEVEL *, UINT, UINT32, ID3D11Device **, D3D_FEATURE_LEVEL *, ID3D11DeviceContext **);
+
+static HMODULE                              s_hModDXGI = NULL;
+static LPCREATEDXGIFACTORY                  sFnPtr_CreateDXGIFactory = NULL;
+static HMODULE                              s_hModD3D11 = NULL;
+static LPD3D11CREATEDEVICE                  sFnPtr_D3D11CreateDevice = NULL;
+static LPD3D11CREATEDEVICEANDSWAPCHAIN      sFnPtr_D3D11CreateDeviceAndSwapChain = NULL;
+
+// Unload the D3D11 and DXGI DLLs
+static bool dynlinkUnloadD3D11API(void)
+{
+    if (s_hModDXGI)
+    {
+        FreeLibrary(s_hModDXGI);
+        s_hModDXGI = NULL;
+    }
+
+    if (s_hModD3D11)
+    {
+        FreeLibrary(s_hModD3D11);
+        s_hModD3D11 = NULL;
+    }
+
+    return true;
+}
+
+// Dynamically load the D3D11 DLLs and map the function pointers
+static bool dynlinkLoadD3D11API(void)
+{
+    // If both modules are non-NULL, this function has already been called.  Note
+    // that this doesn't guarantee that all ProcAddresses were found.
+    if (s_hModD3D11 != NULL && s_hModDXGI != NULL)
+    {
+        return true;
+    }
+
+#if 1
+    // This may fail if Direct3D 11 isn't installed
+    s_hModD3D11 = LoadLibrary("d3d11.dll");
+
+    if (s_hModD3D11 != NULL)
+    {
+        sFnPtr_D3D11CreateDevice = (LPD3D11CREATEDEVICE)GetProcAddress(s_hModD3D11, "D3D11CreateDevice");
+        sFnPtr_D3D11CreateDeviceAndSwapChain = (LPD3D11CREATEDEVICEANDSWAPCHAIN)GetProcAddress(s_hModD3D11, "D3D11CreateDeviceAndSwapChain");
+    }
+    else
+    {
+        printf("\nLoad d3d11.dll failed\n");
+        fflush(0);
+    }
+
+    if (!sFnPtr_CreateDXGIFactory)
+    {
+        s_hModDXGI = LoadLibrary("dxgi.dll");
+
+        if (s_hModDXGI)
+        {
+            sFnPtr_CreateDXGIFactory = (LPCREATEDXGIFACTORY)GetProcAddress(s_hModDXGI, "CreateDXGIFactory1");
+        }
+
+        return (s_hModDXGI != NULL) && (s_hModD3D11 != NULL);
+    }
+
+    return (s_hModD3D11 != NULL);
+#else
+    sFnPtr_D3D11CreateDevice = (LPD3D11CREATEDEVICE)D3D11CreateDeviceAndSwapChain;
+    sFnPtr_D3D11CreateDeviceAndSwapChain = (LPD3D11CREATEDEVICEANDSWAPCHAIN)D3D11CreateDeviceAndSwapChain;
+    //sFnPtr_D3DX11CreateEffectFromMemory  = ( LPD3DX11CREATEEFFECTFROMMEMORY )D3DX11CreateEffectFromMemory;
+    //sFnPtr_D3DX11CompileFromMemory = (LPD3DX11COMPILEFROMMEMORY)D3DX11CompileFromMemory; // type not declared in this header
+    sFnPtr_CreateDXGIFactory = (LPCREATEDXGIFACTORY)CreateDXGIFactory;
+    return true;
+#endif
+    return true;
+}
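+
+// Illustrative call sequence (an editor's sketch, not part of the original
+// header): load the API, verify the entry points resolved, then unload.
+//   if (dynlinkLoadD3D11API() && sFnPtr_D3D11CreateDevice != NULL) {
+//       /* ... create a device through sFnPtr_D3D11CreateDevice ... */
+//   }
+//   dynlinkUnloadD3D11API();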
+
+#endif

+ 151 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/exception.h

@@ -0,0 +1,151 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUDA Utility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! @brief Static construction interface
+  //! @return Always throws (Located_Exception<Exception>)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! Static construction interface
+  //! @return Always throws (Located_Exception<Exception>)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor
+  virtual ~Exception() throw();
+
+ private:
+  //! Constructor, default (private)
+  Exception();
+
+  //! Constructor, standard
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  std::cerr << ex.what() << std::endl;
+
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
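+
+// Illustrative usage of the convenience macros (an editor's sketch, not part
+// of the original header; "data.txt" is a hypothetical file):
+//   FILE *fp = fopen("data.txt", "r");
+//   if (fp == NULL) {
+//     RUNTIME_EXCEPTION("data.txt could not be opened");
+//   }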
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  std::stringstream s;
+
+  // Quite heavy-weight, but exceptions are not for
+  // performance / release versions
+  s << "Exception in file '" << file << "' in line " << line << "\n"
+    << "Detailed description: " << detailed << "\n";
+
+  throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+  throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+  // functions, exported
+
+#endif  // COMMON_EXCEPTION_H_

+ 967 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_cuda.h

@@ -0,0 +1,967 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <helper_string.h>
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note: your SDK sample is required to include the proper header
+// files; please refer to the CUDA examples for examples of the needed CUDA
+// headers, which may change depending on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char unknown[] = "<unknown>";
+  const char *ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_PARSE_ERROR:
+      return "CUFFT_PARSE_ERROR";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_LICENSE_ERROR:
+      return "CUFFT_LICENSE_ERROR";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
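+
+// Illustrative usage (an editor's sketch, not part of the original header;
+// d_buf, bytes, grid, block, and kernel are hypothetical):
+//   checkCudaErrors(cudaMalloc(&d_buf, bytes));
+//   kernel<<<grid, block>>>(d_buf);
+//   getLastCudaError("kernel launch failed");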
+
+// This will only print the proper error string when calling cudaGetLastError,
+// but will not exit the program in case an error is detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+  }
+}
+#endif
+
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+// Float To Int conversion
+inline int ftoi(float value) {
+  return (value >= 0 ? static_cast<int>(value + 0.5)
+                     : static_cast<int>(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM)
+  typedef struct {
+    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
+      {0x80,  64},
+      {0x86, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default to the previous one
+  // so the sample can still run
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
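+
+// Example (editor's note, not in the original): for SM 8.0 the table above
+// yields 64 cores per SM, so _ConvertSMVer2Cores(8, 0) * multiProcessorCount
+// gives the total CUDA core count of the device.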
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct {
+    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char* name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1) {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default to the previous one
+  // so the sample can still run
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+  // end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) {
+    devID = 0;
+  }
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId() {
+  int current_device = 0, sm_per_multiproc = 0;
+  int max_perf_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  uint64_t max_compute_perf = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+
+    // If this GPU is not running on Compute Mode prohibited,
+    // then we can add it to the list
+    if (computeMode != cudaComputeModeProhibited) {
+      if (major == 9999 && minor == 9999) {
+        sm_per_multiproc = 1;
+      } else {
+        sm_per_multiproc = _ConvertSMVer2Cores(major, minor);
+      }
+      int multiProcessorCount = 0, clockRate = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
+      cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device);
+      if (result != cudaSuccess) {
+        // If cudaDevAttrClockRate attribute is not supported we
+        // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+        if(result == cudaErrorInvalidValue) {
+          clockRate = 1;
+        }
+        else {
+          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
+          exit(EXIT_FAILURE);
+        }
+      }
+      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+
+      if (compute_perf > max_compute_perf) {
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
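+
+// Editor's note (not in the original): the selection metric is
+//   compute_perf = multiProcessorCount * coresPerSM * clockRate
+// where clockRate is reported in kHz, so e.g. a hypothetical 80-SM,
+// 64-core/SM device at 1410000 kHz scores 80 * 64 * 1410000.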
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv) {
+  int devID = 0;
+
+  // If the command-line has a device number specified, use it
+  if (checkCmdLineFlag(argc, argv, "device")) {
+    devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+    if (devID < 0) {
+      printf("Invalid command line parameter\n ");
+      exit(EXIT_FAILURE);
+    } else {
+      devID = gpuDeviceInit(devID);
+
+      if (devID < 0) {
+        printf("exiting...\n");
+        exit(EXIT_FAILURE);
+      }
+    }
+  } else {
+    // Otherwise pick the device with highest Gflops/s
+    devID = gpuGetMaxGflopsDeviceId();
+    checkCudaErrors(cudaSetDevice(devID));
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+  }
+
+  return devID;
+}
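+
+// Illustrative invocation (an editor's sketch, not part of the original
+// sample):
+//   int devID = findCudaDevice(argc, (const char **)argv);
+//   // passing -device=1 on the command line selects GPU 1 explicitly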
+
+inline int findIntegratedGPU() {
+  int current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
+    // If the GPU is integrated and not running in Compute Mode prohibited,
+    // then CUDA can map to GLES resources
+    if (integrated && (computeMode != cudaComputeModeProhibited)) {
+      checkCudaErrors(cudaSetDevice(current_device));
+
+      int major = 0, minor = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+      checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0, minor = 0;
+
+  checkCudaErrors(cudaGetDevice(&dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  if ((major > major_version) ||
+      (major == major_version &&
+       minor >= minor_version)) {
+    printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
+    return true;
+  } else {
+    printf(
+        "  No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+}
+#endif
+
+  // end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_

+ 405 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_cuda_drvapi.h

@@ -0,0 +1,405 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper functions for CUDA Driver API error handling (make sure that CUDA_H is
+// included in your projects)
+#ifndef COMMON_HELPER_CUDA_DRVAPI_H_
+#define COMMON_HELPER_CUDA_DRVAPI_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <iostream>
+#include <cstring>
+#include <sstream>
+
+#include <helper_string.h>
+
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef COMMON_HELPER_CUDA_H_
+inline int ftoi(float value) {
+  return (value >= 0 ? static_cast<int>(value + 0.5)
+                     : static_cast<int>(value - 0.5));
+}
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions
+
+// To add a level of protection to the CUDA SDK samples, force samples to
+// explicitly include cuda.h
+#ifdef __cuda_cuda_h__
+// This will output the proper CUDA error strings in the event that a CUDA host
+// call returns an error
+#ifndef checkCudaErrors
+#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
+
+// These are the inline versions for all of the SDK helper functions
+inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
+  if (CUDA_SUCCESS != err) {
+    const char *errorStr = NULL;
+    cuGetErrorString(err, &errorStr);
+    fprintf(stderr,
+            "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
+            "line %i.\n",
+            err, errorStr, file, line);
+    exit(EXIT_FAILURE);
+  }
+}
+#endif
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
+                             int device) {
+  checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
+}
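+
+// Illustrative usage (an editor's sketch, not part of the original header):
+//   int eccEnabled = 0;
+//   getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, 0);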
+#endif
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2CoresDRV(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM)
+  typedef struct {
+    int SM;  // 0xMm (hexadecimal notation), M = SM Major version, and m = SM
+             // minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
+      {0x80,  64},
+      {0x86, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default to the previous one so the
+  // sample can still run
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+// end of GPU Architecture definitions
+
+#ifdef __cuda_cuda_h__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
+  int cuDevice = 0;
+  int deviceCount = 0;
+  checkCudaErrors(cuInit(0));
+
+  checkCudaErrors(cuDeviceGetCount(&deviceCount));
+
+  if (deviceCount == 0) {
+    fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int dev = 0;
+  dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
+
+  if (dev < 0) {
+    dev = 0;
+  }
+
+  if (dev > deviceCount - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            deviceCount);
+    fprintf(stderr,
+            ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
+            dev);
+    fprintf(stderr, "\n");
+    return -dev;
+  }
+
+  checkCudaErrors(cuDeviceGet(&cuDevice, dev));
+  char name[100];
+  checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
+
+  int computeMode;
+  getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
+
+  if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
+    fprintf(stderr,
+            "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
+            "threads can use this CUDA Device.\n");
+    return -1;
+  }
+
+  if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
+    printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
+  }
+
+  return dev;
+}
+
+// This function returns the best GPU based on performance
+inline int gpuGetMaxGflopsDeviceIdDRV() {
+  CUdevice current_device = 0;
+  CUdevice max_perf_device = 0;
+  int device_count = 0;
+  int sm_per_multiproc = 0;
+  unsigned long long max_compute_perf = 0;
+  int major = 0;
+  int minor = 0;
+  int multiProcessorCount;
+  int clockRate;
+  int devices_prohibited = 0;
+
+  cuInit(0);
+  checkCudaErrors(cuDeviceGetCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    checkCudaErrors(cuDeviceGetAttribute(
+        &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+        current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+
+    int computeMode;
+    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
+                          current_device);
+
+    if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
+      if (major == 9999 && minor == 9999) {
+        sm_per_multiproc = 1;
+      } else {
+        sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
+      }
+
+      unsigned long long compute_perf =
+          (unsigned long long)(multiProcessorCount * sm_per_multiproc *
+                               clockRate);
+
+      if (compute_perf > max_compute_perf) {
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
+            "prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
+
+// General initialization call to pick the best CUDA Device
+inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
+  CUdevice cuDevice;
+  int devID = 0;
+
+  // If the command-line has a device number specified, use it
+  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
+    devID = gpuDeviceInitDRV(argc, argv);
+
+    if (devID < 0) {
+      printf("exiting...\n");
+      exit(EXIT_SUCCESS);
+    }
+  } else {
+    // Otherwise pick the device with highest Gflops/s
+    char name[100];
+    devID = gpuGetMaxGflopsDeviceIdDRV();
+    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
+    cuDeviceGetName(name, 100, cuDevice);
+    printf("> Using CUDA Device [%d]: %s\n", devID, name);
+  }
+
+  cuDeviceGet(&cuDevice, devID);
+
+  return cuDevice;
+}
+
+inline CUdevice findIntegratedGPUDrv() {
+  CUdevice current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+  int isIntegrated;
+
+  cuInit(0);
+  checkCudaErrors(cuDeviceGetCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1;
+    checkCudaErrors(cuDeviceGetAttribute(
+        &isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
+
+    // Use this GPU if it is integrated and its compute mode is not
+    // prohibited
+    if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
+      int major = 0, minor = 0;
+      char deviceName[256];
+      checkCudaErrors(cuDeviceGetAttribute(
+          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+          current_device));
+      checkCudaErrors(cuDeviceGetAttribute(
+          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+          current_device));
+      checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, deviceName, major, minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
+                                     int devID) {
+  CUdevice cuDevice;
+  char name[256];
+  int major = 0, minor = 0;
+
+  checkCudaErrors(cuDeviceGet(&cuDevice, devID));
+  checkCudaErrors(cuDeviceGetName(name, 256, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+
+  if ((major > major_version) ||
+      (major == major_version && minor >= minor_version)) {
+    printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
+           major, minor);
+    return true;
+  } else {
+    printf(
+        "No GPU device was found that can support CUDA compute capability "
+        "%d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+}
+#endif
+bool inline findFatbinPath(const char *module_file, std::string &module_path,
+                           char **argv, std::ostringstream &ostrm) {
+  char *actual_path = sdkFindFilePath(module_file, argv[0]);
+
+  if (actual_path) {
+    module_path = actual_path;
+  } else {
+    printf("> findModulePath file not found: <%s> \n", module_file);
+    return false;
+  }
+
+  if (module_path.empty()) {
+    printf("> findModulePath could not find file: <%s> \n", module_file);
+    return false;
+  }
+
+  printf("> findModulePath found file at <%s>\n", module_path.c_str());
+  if (module_path.rfind("fatbin") != std::string::npos) {
+    std::ifstream fileIn(module_path.c_str(), std::ios::binary);
+    ostrm << fileIn.rdbuf();
+    fileIn.close();
+  }
+  return true;
+}
+
+// end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_DRVAPI_H_
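
Editor's usage note: a minimal sketch (not part of this commit) of how the driver-API helpers above compose: pick a device with findCudaDeviceDRV(), create a context, then load a fatbin located by findFatbinPath() with cuModuleLoadData(). The fatbin filename is a hypothetical placeholder, and the Common/ directory is assumed to be on the include path.

    #include <cuda.h>
    #include <sstream>
    #include <string>
    #include "helper_cuda_drvapi.h"

    int main(int argc, char **argv) {
      // Honors -device=N on the command line, else picks the highest-GFLOPS GPU.
      CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);

      CUcontext ctx;
      checkCudaErrors(cuCtxCreate(&ctx, 0, dev));

      // Locate and slurp a fatbin near the executable (hypothetical filename).
      std::string modulePath;
      std::ostringstream fatbin;
      if (findFatbinPath("jacobi_kernels.fatbin", modulePath, argv, fatbin)) {
        CUmodule module;
        checkCudaErrors(cuModuleLoadData(&module, fatbin.str().c_str()));
        // ... cuModuleGetFunction() / cuLaunchKernel() would follow here ...
        checkCudaErrors(cuModuleUnload(module));
      }

      checkCudaErrors(cuCtxDestroy(ctx));
      return 0;
    }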
+

+ 166 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_cusolver.h

@@ -0,0 +1,166 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HELPER_CUSOLVER
+#define HELPER_CUSOLVER
+
+#include <ctype.h>
+#include <cuda_runtime.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cusparse.h"
+
+#define SWITCH_CHAR '-'
+
+struct testOpts {
+  char *sparse_mat_filename;  // by switch -F<filename>
+  const char *testFunc;       // by switch -R<name>
+  const char *reorder;        // by switch -P<name>
+  int lda;                    // by switch -lda<int>
+};
+
+double vec_norminf(int n, const double *x) {
+  double norminf = 0;
+  for (int j = 0; j < n; j++) {
+    double x_abs = fabs(x[j]);
+    norminf = (norminf > x_abs) ? norminf : x_abs;
+  }
+  return norminf;
+}
+
+/*
+ * |A|_inf = max_i sum_j |A(i,j)|, i.e. max { |A| * ones(n,1) }
+ */
+double mat_norminf(int m, int n, const double *A, int lda) {
+  double norminf = 0;
+  for (int i = 0; i < m; i++) {
+    double sum = 0.0;
+    for (int j = 0; j < n; j++) {
+      double A_abs = fabs(A[i + j * lda]);
+      sum += A_abs;
+    }
+    norminf = (norminf > sum) ? norminf : sum;
+  }
+  return norminf;
+}
+
+/*
+ * |A|_inf = max_i sum_j |A(i,j)|, i.e. max { |A| * ones(n,1) }
+ */
+double csr_mat_norminf(int m, int n, int nnzA, const cusparseMatDescr_t descrA,
+                       const double *csrValA, const int *csrRowPtrA,
+                       const int *csrColIndA) {
+  const int baseA =
+      (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;
+
+  double norminf = 0;
+  for (int i = 0; i < m; i++) {
+    double sum = 0.0;
+    const int start = csrRowPtrA[i] - baseA;
+    const int end = csrRowPtrA[i + 1] - baseA;
+    for (int colidx = start; colidx < end; colidx++) {
+      // const int j = csrColIndA[colidx] - baseA;
+      double A_abs = fabs(csrValA[colidx]);
+      sum += A_abs;
+    }
+    norminf = (norminf > sum) ? norminf : sum;
+  }
+  return norminf;
+}
+
+void display_matrix(int m, int n, int nnzA, const cusparseMatDescr_t descrA,
+                    const double *csrValA, const int *csrRowPtrA,
+                    const int *csrColIndA) {
+  const int baseA =
+      (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;
+
+  printf("m = %d, n = %d, nnz = %d, matlab base-1\n", m, n, nnzA);
+
+  for (int row = 0; row < m; row++) {
+    const int start = csrRowPtrA[row] - baseA;
+    const int end = csrRowPtrA[row + 1] - baseA;
+    for (int colidx = start; colidx < end; colidx++) {
+      const int col = csrColIndA[colidx] - baseA;
+      double Areg = csrValA[colidx];
+      printf("A(%d, %d) = %20.16E\n", row + 1, col + 1, Areg);
+    }
+  }
+}
+
+#if defined(_WIN32)
+#if !defined(WIN32_LEAN_AND_MEAN)
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+double second(void) {
+  LARGE_INTEGER t;
+  static double oofreq;
+  static int checkedForHighResTimer;
+  static BOOL hasHighResTimer;
+
+  if (!checkedForHighResTimer) {
+    hasHighResTimer = QueryPerformanceFrequency(&t);
+    oofreq = 1.0 / (double)t.QuadPart;
+    checkedForHighResTimer = 1;
+  }
+  if (hasHighResTimer) {
+    QueryPerformanceCounter(&t);
+    return (double)t.QuadPart * oofreq;
+  } else {
+    return (double)GetTickCount() / 1000.0;
+  }
+}
+
+#elif defined(__linux__) || defined(__QNX__)
+#include <stddef.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+double second(void) {
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+}
+
+#elif defined(__APPLE__)
+#include <stddef.h>
+#include <sys/resource.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+double second(void) {
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+}
+#else
+#error unsupported platform
+#endif
+
+#endif
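
Editor's usage note: a minimal host-only sketch (not part of this commit) exercising the norm helpers above on made-up dense and CSR matrices; cuSPARSE is linked only for the matrix descriptor.

    #include <stdio.h>
    #include "helper_cusolver.h"

    int main(void) {
      // Dense 2x2, column-major: rows {1,2} and {-3,4}; ||A||_inf = 7.
      double A[4] = {1.0, -3.0, 2.0, 4.0};
      double t0 = second();
      printf("dense ||A||_inf = %f (expect 7)\n", mat_norminf(2, 2, A, 2));
      printf("  elapsed: %.3f us\n", (second() - t0) * 1e6);

      // CSR 2x2 [[1, 2], [0, 3]], base-0 indexing; ||A||_inf = 3.
      double vals[3] = {1.0, 2.0, 3.0};
      int rowPtr[3] = {0, 2, 3};
      int colInd[3] = {0, 1, 1};
      cusparseMatDescr_t descr = NULL;
      cusparseCreateMatDescr(&descr);
      cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
      display_matrix(2, 2, 3, descr, vals, rowPtr, colInd);
      printf("csr   ||A||_inf = %f (expect 3)\n",
             csr_mat_norminf(2, 2, 3, descr, vals, rowPtr, colInd));
      cusparseDestroyMatDescr(descr);
      return 0;
    }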

+ 59 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_functions.h

@@ -0,0 +1,59 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include <helper_image.h>  // helper functions for image compare, dump, data comparisons
+#include <helper_string.h>  // helper functions for string parsing
+#include <helper_timer.h>   // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
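
Editor's usage note: a minimal sketch (not part of this commit) of the StopWatch timers that helper_functions.h pulls in via helper_timer.h; the measured workload is a stand-in, and Common/ is assumed to be on the include path.

    #include <stdio.h>
    #include <helper_functions.h>

    int main(void) {
      StopWatchInterface *timer = NULL;
      sdkCreateTimer(&timer);

      sdkStartTimer(&timer);
      // ... the code being measured would go here ...
      sdkStopTimer(&timer);

      printf("elapsed: %.3f ms\n", sdkGetTimerValue(&timer));
      sdkDeleteTimer(&timer);
      return 0;
    }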

File diff suppressed because it is too large
+ 267 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_gl.h


File diff suppressed because it is too large
+ 1001 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_image.h


File diff suppressed because it is too large
+ 1469 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_math.h


+ 543 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_multiprocess.cpp

@@ -0,0 +1,543 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "helper_multiprocess.h"
+#include <cstdlib>
+#include <string>
+
+int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info) {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  info->size = sz;
+  info->shmHandle = CreateFileMapping(INVALID_HANDLE_VALUE, NULL,
+                                      PAGE_READWRITE, 0, (DWORD)sz, name);
+  if (info->shmHandle == 0) {
+    return GetLastError();
+  }
+
+  info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz);
+  if (info->addr == NULL) {
+    return GetLastError();
+  }
+
+  return 0;
+#else
+  int status = 0;
+
+  info->size = sz;
+
+  info->shmFd = shm_open(name, O_RDWR | O_CREAT, 0777);
+  if (info->shmFd < 0) {
+    return errno;
+  }
+
+  status = ftruncate(info->shmFd, sz);
+  if (status != 0) {
+    return status;
+  }
+
+  info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0);
+  if (info->addr == MAP_FAILED) {
+    return errno;
+  }
+
+  return 0;
+#endif
+}
+
+int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info) {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  info->size = sz;
+
+  info->shmHandle = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, name);
+  if (info->shmHandle == 0) {
+    return GetLastError();
+  }
+
+  info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz);
+  if (info->addr == NULL) {
+    return GetLastError();
+  }
+
+  return 0;
+#else
+  info->size = sz;
+
+  info->shmFd = shm_open(name, O_RDWR, 0777);
+  if (info->shmFd < 0) {
+    return errno;
+  }
+
+  info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0);
+  if (info->addr == MAP_FAILED) {
+    return errno;
+  }
+
+  return 0;
+#endif
+}
+
+void sharedMemoryClose(sharedMemoryInfo *info) {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  if (info->addr) {
+    UnmapViewOfFile(info->addr);
+  }
+  if (info->shmHandle) {
+    CloseHandle(info->shmHandle);
+  }
+#else
+  if (info->addr) {
+    munmap(info->addr, info->size);
+  }
+  if (info->shmFd) {
+    close(info->shmFd);
+  }
+#endif
+}
+
+int spawnProcess(Process *process, const char *app, char *const *args) {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  STARTUPINFO si = {0};
+  BOOL status;
+  std::string arg_string;
+  memset(process, 0, sizeof(*process));
+
+  while (*args) {
+    arg_string.append(*args).append(1, ' ');
+    args++;
+  }
+
+  status = CreateProcess(app, LPSTR(arg_string.c_str()), NULL, NULL, FALSE, 0,
+                         NULL, NULL, &si, process);
+
+  return status ? 0 : GetLastError();
+#else
+  *process = fork();
+  if (*process == 0) {
+    if (0 > execvp(app, args)) {
+      return errno;
+    }
+  } else if (*process < 0) {
+    return errno;
+  }
+  return 0;
+#endif
+}
+
+int waitProcess(Process *process) {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  DWORD exitCode;
+  WaitForSingleObject(process->hProcess, INFINITE);
+  GetExitCodeProcess(process->hProcess, &exitCode);
+  CloseHandle(process->hProcess);
+  CloseHandle(process->hThread);
+  return (int)exitCode;
+#else
+  int status = 0;
+  do {
+    if (0 > waitpid(*process, &status, 0)) {
+      return errno;
+    }
+  } while (!WIFEXITED(status));
+  return WEXITSTATUS(status);
+#endif
+}
+
+#if defined(__linux__)
+int ipcCreateSocket(ipcHandle *&handle, const char *name,
+                    const std::vector<Process> &processes) {
+  int server_fd;
+  struct sockaddr_un servaddr;
+
+  handle = new ipcHandle;
+  memset(handle, 0, sizeof(*handle));
+  handle->socket = -1;
+  handle->socketName = NULL;
+
+  // Creating socket file descriptor
+  if ((server_fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
+    perror("IPC failure: Socket creation failed");
+    return -1;
+  }
+
+  unlink(name);
+  bzero(&servaddr, sizeof(servaddr));
+  servaddr.sun_family = AF_UNIX;
+
+  size_t len = strlen(name);
+  if (len > (sizeof(servaddr.sun_path) - 1)) {
+    perror("IPC failure: Cannot bind provided name to socket. Name too large");
+    return -1;
+  }
+
+  strncpy(servaddr.sun_path, name, len);
+
+  if (bind(server_fd, (struct sockaddr *)&servaddr, SUN_LEN(&servaddr)) < 0) {
+    perror("IPC failure: Binding socket failed");
+    return -1;
+  }
+
+  handle->socketName = new char[strlen(name) + 1];
+  strcpy(handle->socketName, name);
+  handle->socket = server_fd;
+  return 0;
+}
+
+int ipcOpenSocket(ipcHandle *&handle) {
+  int sock = 0;
+  struct sockaddr_un cliaddr;
+
+  handle = new ipcHandle;
+  memset(handle, 0, sizeof(*handle));
+
+  if ((sock = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
+    perror("IPC failure:Socket creation error");
+    return -1;
+  }
+
+  bzero(&cliaddr, sizeof(cliaddr));
+  cliaddr.sun_family = AF_UNIX;
+  char temp[10];
+
+  // Create unique name for the socket.
+  sprintf(temp, "%u", getpid());
+
+  strcpy(cliaddr.sun_path, temp);
+  if (bind(sock, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
+    perror("IPC failure: Binding socket failed");
+    return -1;
+  }
+
+  handle->socket = sock;
+  handle->socketName = new char[strlen(temp) + 1];
+  strcpy(handle->socketName, temp);
+
+  return 0;
+}
+
+int ipcCloseSocket(ipcHandle *handle) {
+  if (!handle) {
+    return -1;
+  }
+
+  if (handle->socketName) {
+    unlink(handle->socketName);
+    delete[] handle->socketName;
+  }
+  close(handle->socket);
+  delete handle;
+  return 0;
+}
+
+int ipcRecvShareableHandle(ipcHandle *handle, ShareableHandle *shHandle) {
+  struct msghdr msg = {0};
+  struct iovec iov[1];
+
+  // Union to guarantee alignment requirements for control array
+  union {
+    struct cmsghdr cm;
+    char control[CMSG_SPACE(sizeof(int))];
+  } control_un;
+
+  struct cmsghdr *cmptr;
+  ssize_t n;
+  int receivedfd;
+  char dummy_buffer[1];
+
+  msg.msg_control = control_un.control;
+  msg.msg_controllen = sizeof(control_un.control);
+
+  iov[0].iov_base = (void *)dummy_buffer;
+  iov[0].iov_len = sizeof(dummy_buffer);
+
+  msg.msg_iov = iov;
+  msg.msg_iovlen = 1;
+
+  if ((n = recvmsg(handle->socket, &msg, 0)) <= 0) {
+    perror("IPC failure: Receiving data over socket failed");
+    return -1;
+  }
+
+  if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) &&
+      (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
+    if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
+      return -1;
+    }
+
+    memmove(&receivedfd, CMSG_DATA(cmptr), sizeof(receivedfd));
+    *(int *)shHandle = receivedfd;
+  } else {
+    return -1;
+  }
+
+  return 0;
+}
+
+int ipcRecvDataFromClient(ipcHandle *serverHandle, void *data, size_t size) {
+  ssize_t readResult;
+  struct sockaddr_un cliaddr;
+  socklen_t len = sizeof(cliaddr);
+
+  readResult = recvfrom(serverHandle->socket, data, size, 0,
+                        (struct sockaddr *)&cliaddr, &len);
+  if (readResult == -1) {
+    perror("IPC failure: Receiving data over socket failed");
+    return -1;
+  }
+  return 0;
+}
+
+int ipcSendDataToServer(ipcHandle *handle, const char *serverName,
+                        const void *data, size_t size) {
+  ssize_t sendResult;
+  struct sockaddr_un serveraddr;
+
+  bzero(&serveraddr, sizeof(serveraddr));
+  serveraddr.sun_family = AF_UNIX;
+  strncpy(serveraddr.sun_path, serverName, sizeof(serveraddr.sun_path) - 1);
+
+  sendResult = sendto(handle->socket, data, size, 0,
+                      (struct sockaddr *)&serveraddr, sizeof(serveraddr));
+  if (sendResult <= 0) {
+    perror("IPC failure: Sending data over socket failed");
+    return -1;
+  }
+
+  return 0;
+}
+
+int ipcSendShareableHandle(ipcHandle *handle,
+                           const std::vector<ShareableHandle> &shareableHandles,
+                           Process process, int data) {
+  struct msghdr msg = {0};
+  struct iovec iov[1];
+
+  union {
+    struct cmsghdr cm;
+    char control[CMSG_SPACE(sizeof(int))];
+  } control_un;
+
+  struct cmsghdr *cmptr;
+  struct sockaddr_un cliaddr;
+
+  // Construct the client address to send this shareable handle to
+  bzero(&cliaddr, sizeof(cliaddr));
+  cliaddr.sun_family = AF_UNIX;
+  char temp[10];
+  sprintf(temp, "%u", process);
+  strcpy(cliaddr.sun_path, temp);
+
+  // Send corresponding shareable handle to the client
+  int sendfd = (int)shareableHandles[data];
+
+  msg.msg_control = control_un.control;
+  msg.msg_controllen = sizeof(control_un.control);
+
+  cmptr = CMSG_FIRSTHDR(&msg);
+  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
+  cmptr->cmsg_level = SOL_SOCKET;
+  cmptr->cmsg_type = SCM_RIGHTS;
+
+  memmove(CMSG_DATA(cmptr), &sendfd, sizeof(sendfd));
+
+  msg.msg_name = (void *)&cliaddr;
+  msg.msg_namelen = sizeof(struct sockaddr_un);
+
+  iov[0].iov_base = (void *)"";
+  iov[0].iov_len = 1;
+  msg.msg_iov = iov;
+  msg.msg_iovlen = 1;
+
+  ssize_t sendResult = sendmsg(handle->socket, &msg, 0);
+  if (sendResult <= 0) {
+    perror("IPC failure: Sending data over socket failed");
+    return -1;
+  }
+
+  return 0;
+}
+
+int ipcSendShareableHandles(
+    ipcHandle *handle, const std::vector<ShareableHandle> &shareableHandles,
+    const std::vector<Process> &processes) {
+  // Send all shareable handles to every single process.
+  for (int i = 0; i < shareableHandles.size(); i++) {
+    for (int j = 0; j < processes.size(); j++) {
+      checkIpcErrors(
+          ipcSendShareableHandle(handle, shareableHandles, processes[j], i));
+    }
+  }
+  return 0;
+}
+
+int ipcRecvShareableHandles(ipcHandle *handle,
+                            std::vector<ShareableHandle> &shareableHandles) {
+  for (int i = 0; i < shareableHandles.size(); i++) {
+    checkIpcErrors(ipcRecvShareableHandle(handle, &shareableHandles[i]));
+  }
+  return 0;
+}
+
+int ipcCloseShareableHandle(ShareableHandle shHandle) {
+  return close(shHandle);
+}
+
+#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// Generic name to build individual Mailslot names by appending process ids.
+LPTSTR SlotName = (LPTSTR)TEXT("\\\\.\\mailslot\\sample_mailslot_");
+
+int ipcCreateSocket(ipcHandle *&handle, const char *name,
+                    const std::vector<Process> &processes) {
+  handle = new ipcHandle;
+  handle->hMailslot.resize(processes.size());
+
+  // Open Mailslots of all clients and store respective handles.
+  for (int i = 0; i < handle->hMailslot.size(); ++i) {
+    std::basic_string<TCHAR> childSlotName(SlotName);
+    char tempBuf[20];
+    _itoa_s(processes[i].dwProcessId, tempBuf, 10);
+    childSlotName += TEXT(tempBuf);
+
+    HANDLE hFile =
+        CreateFile(TEXT(childSlotName.c_str()), GENERIC_WRITE, FILE_SHARE_READ,
+                   (LPSECURITY_ATTRIBUTES)NULL, OPEN_EXISTING,
+                   FILE_ATTRIBUTE_NORMAL, (HANDLE)NULL);
+    if (hFile == INVALID_HANDLE_VALUE) {
+      printf("IPC failure: Opening Mailslot by CreateFile failed with %d\n",
+             GetLastError());
+      return -1;
+    }
+    handle->hMailslot[i] = hFile;
+  }
+  return 0;
+}
+
+int ipcOpenSocket(ipcHandle *&handle) {
+  handle = new ipcHandle;
+  HANDLE hSlot;
+
+  std::basic_string<TCHAR> clientSlotName(SlotName);
+  char tempBuf[20];
+  _itoa_s(GetCurrentProcessId(), tempBuf, 10);
+  clientSlotName += TEXT(tempBuf);
+
+  hSlot = CreateMailslot((LPSTR)clientSlotName.c_str(), 0,
+                         MAILSLOT_WAIT_FOREVER, (LPSECURITY_ATTRIBUTES)NULL);
+  if (hSlot == INVALID_HANDLE_VALUE) {
+    printf("IPC failure: CreateMailslot failed for client with %d\n",
+           GetLastError());
+    return -1;
+  }
+
+  handle->hMailslot.push_back(hSlot);
+  return 0;
+}
+
+int ipcSendData(HANDLE mailslot, const void *data, size_t sz) {
+  BOOL result;
+  DWORD cbWritten;
+
+  result = WriteFile(mailslot, data, (DWORD)sz, &cbWritten, (LPOVERLAPPED)NULL);
+  if (!result) {
+    printf("IPC failure: WriteFile failed with %d.\n", GetLastError());
+    return -1;
+  }
+  return 0;
+}
+
+int ipcRecvData(ipcHandle *handle, void *data, size_t sz) {
+  DWORD cbRead = 0;
+
+  if (!ReadFile(handle->hMailslot[0], data, (DWORD)sz, &cbRead, NULL)) {
+    printf("IPC failure: ReadFile failed with %d.\n", GetLastError());
+    return -1;
+  }
+
+  if (sz != (size_t)cbRead) {
+    printf(
+        "IPC failure: ReadFile didn't receive the expected number of bytes\n");
+    return -1;
+  }
+
+  return 0;
+}
+
+int ipcSendShareableHandles(
+    ipcHandle *handle, const std::vector<ShareableHandle> &shareableHandles,
+    const std::vector<Process> &processes) {
+  // Send all shareable handles to every single process.
+  for (int i = 0; i < processes.size(); i++) {
+    HANDLE hProcess =
+        OpenProcess(PROCESS_DUP_HANDLE, FALSE, processes[i].dwProcessId);
+    if (hProcess == NULL) {
+      printf("IPC failure: OpenProcess failed (%d)\n", GetLastError());
+      return -1;
+    }
+
+    for (int j = 0; j < shareableHandles.size(); j++) {
+      HANDLE hDup = INVALID_HANDLE_VALUE;
+      // Duplicate the handle into the target process's space
+      if (!DuplicateHandle(GetCurrentProcess(), shareableHandles[j], hProcess,
+                           &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS)) {
+        printf("IPC failure: DuplicateHandle failed (%d)\n", GetLastError());
+        return -1;
+      }
+      checkIpcErrors(ipcSendData(handle->hMailslot[i], &hDup, sizeof(hDup)));
+    }
+    CloseHandle(hProcess);
+  }
+  return 0;
+}
+
+int ipcRecvShareableHandles(ipcHandle *handle,
+                            std::vector<ShareableHandle> &shareableHandles) {
+  for (int i = 0; i < shareableHandles.size(); i++) {
+    checkIpcErrors(
+        ipcRecvData(handle, &shareableHandles[i], sizeof(shareableHandles[i])));
+  }
+  return 0;
+}
+
+int ipcCloseSocket(ipcHandle *handle) {
+  for (int i = 0; i < handle->hMailslot.size(); i++) {
+    CloseHandle(handle->hMailslot[i]);
+  }
+  delete handle;
+  return 0;
+}
+
+int ipcCloseShareableHandle(ShareableHandle shHandle) {
+  CloseHandle(shHandle);
+  return 0;
+}
+
+#endif
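
Editor's usage note: a minimal POSIX-only sketch (not part of this commit) of the fd-passing flow above: the parent creates the named socket, spawns a child (re-execing itself), and sends it one shareable handle. A plain file descriptor stands in for a CUDA shareable handle, and the socket name is hypothetical; real samples synchronize via a shared-memory barrier rather than sleep().

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <vector>
    #include "helper_multiprocess.h"

    int main(int argc, char **argv) {
      if (argc > 1) {  // ---- child: receive one handle ----
        ipcHandle *handle = NULL;
        checkIpcErrors(ipcOpenSocket(handle));
        std::vector<ShareableHandle> handles(1);
        checkIpcErrors(ipcRecvShareableHandles(handle, handles));
        printf("child received fd %d\n", handles[0]);
        checkIpcErrors(ipcCloseShareableHandle(handles[0]));
        checkIpcErrors(ipcCloseSocket(handle));
        return 0;
      }

      // ---- parent: spawn a child, then send it a descriptor ----
      std::vector<Process> procs(1);
      char *args[] = {argv[0], (char *)"child", NULL};
      checkIpcErrors(spawnProcess(&procs[0], argv[0], args));
      sleep(1);  // crude: wait for the child's socket to exist

      ipcHandle *handle = NULL;
      checkIpcErrors(ipcCreateSocket(handle, "demo_socket", procs));

      // Any fd can be shared; a real app would pass a CUDA shareable handle.
      std::vector<ShareableHandle> handles(1, open("/dev/null", O_RDONLY));
      checkIpcErrors(ipcSendShareableHandles(handle, handles, procs));

      waitProcess(&procs[0]);
      checkIpcErrors(ipcCloseSocket(handle));
      return 0;
    }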

+ 120 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_multiprocess.h

@@ -0,0 +1,120 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HELPER_MULTIPROCESS_H
+#define HELPER_MULTIPROCESS_H
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#include <iostream>
+#include <stdio.h>
+#include <tchar.h>
+#include <strsafe.h>
+#include <sddl.h>
+#include <aclapi.h>
+#include <winternl.h>
+#else
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <memory.h>
+#include <sys/un.h>
+#endif
+#include <vector>
+
+typedef struct sharedMemoryInfo_st {
+    void *addr;
+    size_t size;
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    HANDLE shmHandle;
+#else
+    int shmFd;
+#endif
+} sharedMemoryInfo;
+
+int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info);
+
+int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info);
+
+void sharedMemoryClose(sharedMemoryInfo *info);
+
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+typedef PROCESS_INFORMATION Process;
+#else
+typedef pid_t Process;
+#endif
+
+int spawnProcess(Process *process, const char *app, char * const *args);
+
+int waitProcess(Process *process);
+
+#define checkIpcErrors(ipcFuncResult) \
+    if (ipcFuncResult == -1) { fprintf(stderr, "Failure at %u %s\n", __LINE__, __FILE__); exit(EXIT_FAILURE); }
+
+#if defined(__linux__)
+struct ipcHandle_st {
+    int socket;
+    char *socketName;
+};
+typedef int ShareableHandle;
+#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+struct ipcHandle_st {
+    std::vector<HANDLE> hMailslot; // 1 handle for a child; `num children` handles for the parent.
+};
+typedef HANDLE ShareableHandle;
+#endif
+
+typedef struct ipcHandle_st ipcHandle;
+
+int
+ipcCreateSocket(ipcHandle *&handle, const char *name, const std::vector<Process>& processes);
+
+int
+ipcOpenSocket(ipcHandle *&handle);
+
+int
+ipcCloseSocket(ipcHandle *handle);
+
+int
+ipcRecvShareableHandles(ipcHandle *handle, std::vector<ShareableHandle>& shareableHandles);
+
+int
+ipcSendShareableHandles(ipcHandle *handle, const std::vector<ShareableHandle>& shareableHandles, const std::vector<Process>& processes);
+
+int
+ipcCloseShareableHandle(ShareableHandle shHandle);
+
+#endif // HELPER_MULTIPROCESS_H
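
Editor's usage note: a minimal sketch (not part of this commit) of the shared-memory half of this API; run once to create and write, then again with any argument to open and read. The region name is hypothetical, the flow is POSIX-oriented (on Windows the mapping vanishes once the creator exits), and it is compiled together with helper_multiprocess.cpp.

    #include <stdio.h>
    #include <string.h>
    #include "helper_multiprocess.h"

    int main(int argc, char **argv) {
      sharedMemoryInfo info;
      const char *name = "demo_shm";  // hypothetical region name

      if (argc > 1) {  // reader
        if (sharedMemoryOpen(name, 64, &info) != 0) {
          fprintf(stderr, "sharedMemoryOpen failed\n");
          return 1;
        }
        printf("read: %s\n", (char *)info.addr);
      } else {  // writer
        if (sharedMemoryCreate(name, 64, &info) != 0) {
          fprintf(stderr, "sharedMemoryCreate failed\n");
          return 1;
        }
        strcpy((char *)info.addr, "hello from the writer");
        printf("wrote; rerun with an argument to read\n");
      }

      sharedMemoryClose(&info);
      return 0;
    }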

+ 428 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_nvJPEG.hxx

@@ -0,0 +1,428 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// This sample needs at least CUDA 10.1.
+// It demonstrates usages of the nvJPEG library
+
+#ifndef NV_JPEG_EXAMPLE
+#define NV_JPEG_EXAMPLE
+
+#ifdef _WIN64
+#include <windows.h>
+#endif
+
+#include "cuda_runtime.h"
+#include "nvjpeg.h"
+#include "helper_cuda.h"
+#include "helper_timer.h"
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <string.h>  // strcmpi
+#ifndef _WIN64
+#include <sys/time.h>  // timings
+
+#include <dirent.h>  // linux dir traverse
+#include <unistd.h>
+#endif
+#include <sys/stat.h>
+#include <sys/types.h>
+
+// write bmp, input - RGB, device
+int writeBMP(const char *filename, const unsigned char *d_chanR, int pitchR,
+             const unsigned char *d_chanG, int pitchG,
+             const unsigned char *d_chanB, int pitchB, int width, int height) {
+  unsigned int headers[13];
+  FILE *outfile;
+  int extrabytes;
+  int paddedsize;
+  int x;
+  int y;
+  int n;
+  int red, green, blue;
+
+  std::vector<unsigned char> vchanR(height * width);
+  std::vector<unsigned char> vchanG(height * width);
+  std::vector<unsigned char> vchanB(height * width);
+  unsigned char *chanR = vchanR.data();
+  unsigned char *chanG = vchanG.data();
+  unsigned char *chanB = vchanB.data();
+  checkCudaErrors(cudaMemcpy2D(chanR, (size_t)width, d_chanR, (size_t)pitchR,
+                               width, height, cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy2D(chanG, (size_t)width, d_chanG, (size_t)pitchG,
+                               width, height, cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy2D(chanB, (size_t)width, d_chanB, (size_t)pitchB,
+                               width, height, cudaMemcpyDeviceToHost));
+
+  extrabytes =
+      4 - ((width * 3) % 4);  // How many bytes of padding to add to each
+  // horizontal line - the size of which must
+  // be a multiple of 4 bytes.
+  if (extrabytes == 4) extrabytes = 0;
+
+  paddedsize = ((width * 3) + extrabytes) * height;
+
+  // Headers...
+  // Note that the "BM" identifier in bytes 0 and 1 is NOT included in these
+  // "headers".
+
+  headers[0] = paddedsize + 54;  // bfSize (whole file size)
+  headers[1] = 0;                // bfReserved (both)
+  headers[2] = 54;               // bfOffbits
+  headers[3] = 40;               // biSize
+  headers[4] = width;            // biWidth
+  headers[5] = height;           // biHeight
+
+  // Would have biPlanes and biBitCount in position 6, but they're shorts.
+  // It's easier to write them out separately (see below) than pretend
+  // they're a single int, especially with endian issues...
+
+  headers[7] = 0;           // biCompression
+  headers[8] = paddedsize;  // biSizeImage
+  headers[9] = 0;           // biXPelsPerMeter
+  headers[10] = 0;          // biYPelsPerMeter
+  headers[11] = 0;          // biClrUsed
+  headers[12] = 0;          // biClrImportant
+
+  if (!(outfile = fopen(filename, "wb"))) {
+    std::cerr << "Cannot open file: " << filename << std::endl;
+    return 1;
+  }
+
+  //
+  // Headers begin...
+  // When printing ints and shorts, we write out 1 character at a time to avoid
+  // endian issues.
+  //
+  fprintf(outfile, "BM");
+
+  for (n = 0; n <= 5; n++) {
+    fprintf(outfile, "%c", headers[n] & 0x000000FF);
+    fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
+    fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
+    fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
+  }
+
+  // These next 4 characters are for the biPlanes and biBitCount fields.
+
+  fprintf(outfile, "%c", 1);
+  fprintf(outfile, "%c", 0);
+  fprintf(outfile, "%c", 24);
+  fprintf(outfile, "%c", 0);
+
+  for (n = 7; n <= 12; n++) {
+    fprintf(outfile, "%c", headers[n] & 0x000000FF);
+    fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
+    fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
+    fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
+  }
+
+  //
+  // Headers done, now write the data...
+  //
+
+  for (y = height - 1; y >= 0;
+       y--)  // BMP image format is written from bottom to top...
+  {
+    for (x = 0; x <= width - 1; x++) {
+      red = chanR[y * width + x];
+      green = chanG[y * width + x];
+      blue = chanB[y * width + x];
+
+      if (red > 255) red = 255;
+      if (red < 0) red = 0;
+      if (green > 255) green = 255;
+      if (green < 0) green = 0;
+      if (blue > 255) blue = 255;
+      if (blue < 0) blue = 0;
+      // Also, it's written in (b,g,r) format...
+
+      fprintf(outfile, "%c", blue);
+      fprintf(outfile, "%c", green);
+      fprintf(outfile, "%c", red);
+    }
+    if (extrabytes)  // See above - BMP lines must be of lengths divisible by 4.
+    {
+      for (n = 1; n <= extrabytes; n++) {
+        fprintf(outfile, "%c", 0);
+      }
+    }
+  }
+
+  fclose(outfile);
+  return 0;
+}
+
+// write bmp, input - RGB, device
+int writeBMPi(const char *filename, const unsigned char *d_RGB, int pitch,
+              int width, int height) {
+  unsigned int headers[13];
+  FILE *outfile;
+  int extrabytes;
+  int paddedsize;
+  int x;
+  int y;
+  int n;
+  int red, green, blue;
+
+  std::vector<unsigned char> vchanRGB(height * width * 3);
+  unsigned char *chanRGB = vchanRGB.data();
+  checkCudaErrors(cudaMemcpy2D(chanRGB, (size_t)width * 3, d_RGB, (size_t)pitch,
+                               width * 3, height, cudaMemcpyDeviceToHost));
+
+  extrabytes =
+      4 - ((width * 3) % 4);  // How many bytes of padding to add to each
+  // horizontal line - the size of which must
+  // be a multiple of 4 bytes.
+  if (extrabytes == 4) extrabytes = 0;
+
+  paddedsize = ((width * 3) + extrabytes) * height;
+
+  // Headers...
+  // Note that the "BM" identifier in bytes 0 and 1 is NOT included in these
+  // "headers".
+  headers[0] = paddedsize + 54;  // bfSize (whole file size)
+  headers[1] = 0;                // bfReserved (both)
+  headers[2] = 54;               // bfOffbits
+  headers[3] = 40;               // biSize
+  headers[4] = width;            // biWidth
+  headers[5] = height;           // biHeight
+
+  // Would have biPlanes and biBitCount in position 6, but they're shorts.
+  // It's easier to write them out separately (see below) than pretend
+  // they're a single int, especially with endian issues...
+
+  headers[7] = 0;           // biCompression
+  headers[8] = paddedsize;  // biSizeImage
+  headers[9] = 0;           // biXPelsPerMeter
+  headers[10] = 0;          // biYPelsPerMeter
+  headers[11] = 0;          // biClrUsed
+  headers[12] = 0;          // biClrImportant
+
+  if (!(outfile = fopen(filename, "wb"))) {
+    std::cerr << "Cannot open file: " << filename << std::endl;
+    return 1;
+  }
+
+  //
+  // Headers begin...
+  // When printing ints and shorts, we write out 1 character at a time to avoid
+  // endian issues.
+  //
+
+  fprintf(outfile, "BM");
+
+  for (n = 0; n <= 5; n++) {
+    fprintf(outfile, "%c", headers[n] & 0x000000FF);
+    fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
+    fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
+    fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
+  }
+
+  // These next 4 characters are for the biPlanes and biBitCount fields.
+
+  fprintf(outfile, "%c", 1);
+  fprintf(outfile, "%c", 0);
+  fprintf(outfile, "%c", 24);
+  fprintf(outfile, "%c", 0);
+
+  for (n = 7; n <= 12; n++) {
+    fprintf(outfile, "%c", headers[n] & 0x000000FF);
+    fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
+    fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
+    fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
+  }
+
+  //
+  // Headers done, now write the data...
+  //
+  for (y = height - 1; y >= 0;
+       y--)  // BMP image format is written from bottom to top...
+  {
+    for (x = 0; x <= width - 1; x++) {
+      red = chanRGB[(y * width + x) * 3];
+      green = chanRGB[(y * width + x) * 3 + 1];
+      blue = chanRGB[(y * width + x) * 3 + 2];
+
+      if (red > 255) red = 255;
+      if (red < 0) red = 0;
+      if (green > 255) green = 255;
+      if (green < 0) green = 0;
+      if (blue > 255) blue = 255;
+      if (blue < 0) blue = 0;
+      // Also, it's written in (b,g,r) format...
+
+      fprintf(outfile, "%c", blue);
+      fprintf(outfile, "%c", green);
+      fprintf(outfile, "%c", red);
+    }
+    if (extrabytes)  // See above - BMP lines must be of lengths divisible by 4.
+    {
+      for (n = 1; n <= extrabytes; n++) {
+        fprintf(outfile, "%c", 0);
+      }
+    }
+  }
+
+  fclose(outfile);
+  return 0;
+}
+
+int inputDirExists(const char *pathname) {
+  struct stat info;
+  if (stat(pathname, &info) != 0) {
+    return 0;  // Directory does not exist
+  } else if (info.st_mode & S_IFDIR) {
+    // is a directory
+    return 1;
+  } else {
+    // is not a directory
+    return 0;
+  }
+}
+
+int readInput(const std::string &sInputPath,
+              std::vector<std::string> &filelist) {
+  int error_code = 1;
+#ifndef _WIN64
+  struct stat s;
+
+  if (stat(sInputPath.c_str(), &s) == 0) {
+    if (s.st_mode & S_IFREG) {
+      filelist.push_back(sInputPath);
+    } else if (s.st_mode & S_IFDIR) {
+      // processing each file in directory
+      DIR *dir_handle;
+      struct dirent *dir;
+      dir_handle = opendir(sInputPath.c_str());
+      std::vector<std::string> filenames;
+      if (dir_handle) {
+        error_code = 0;
+        while ((dir = readdir(dir_handle)) != NULL) {
+          std::string sFileName = sInputPath + dir->d_name;
+          if (inputDirExists(sFileName.c_str()))
+          {
+            std::string sname = dir->d_name;
+            if (sname != "." && sname != "..") {
+              readInput(sInputPath + sname + "/", filelist);
+            }
+          }
+          else
+          {
+            filelist.push_back(sFileName);
+          }
+        }
+        closedir(dir_handle);
+      } else {
+        std::cout << "Cannot open input directory: " << sInputPath << std::endl;
+        return error_code;
+      }
+    } else {
+      std::cout << "Cannot open input: " << sInputPath << std::endl;
+      return error_code;
+    }
+  } else {
+    std::cout << "Cannot find input path " << sInputPath << std::endl;
+    return error_code;
+  }
+#else
+  std::string search_path = sInputPath + "/*.*";
+  WIN32_FIND_DATA fd;
+  HANDLE hFind = ::FindFirstFile(search_path.c_str(), &fd);
+  if (hFind != INVALID_HANDLE_VALUE) {
+    do {
+      // Read all regular files in the current folder, skipping directories
+      // (including the default '.' and '..' entries)
+      if (!(fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
+        std::string temp(sInputPath + "\\" + fd.cFileName);
+        filelist.push_back(temp);
+      }
+    } while (::FindNextFile(hFind, &fd));
+    ::FindClose(hFind);
+  } else {
+    std::cout << "Cannot open input directory: " << sInputPath << std::endl;
+    return error_code;
+  }
+#endif
+  return 0;
+}
+
+
+int getInputDir(std::string &input_dir, const char *executable_path) {
+  int found = 0;
+  if (executable_path != 0) {
+    std::string executable_name = std::string(executable_path);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+#else
+    // Linux & OSX path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+
+    // Search in default paths for input images.
+    const char *searchPath[] = {
+        "./images", "../../../../Samples/<executable_name>/images",
+        "../../../Samples/<executable_name>/images",
+        "../../Samples/<executable_name>/images"};
+
+    for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+      std::string pathname(searchPath[i]);
+      size_t executable_name_pos = pathname.find("<executable_name>");
+
+      // If there is executable_name variable in the searchPath
+      // replace it with the value
+      if (executable_name_pos != std::string::npos) {
+        pathname.replace(executable_name_pos, strlen("<executable_name>"),
+                         executable_name);
+      }
+
+      if (inputDirExists(pathname.c_str())) {
+        input_dir = pathname + "/";
+        found = 1;
+        break;
+      }
+    }
+  }
+  return found;
+}
+
+#endif
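
Editor's usage note: a minimal sketch (not part of this commit) of writeBMPi() on a synthetic pitched device image; the output filename is arbitrary, and the nvJPEG headers must be on the include path since this header includes nvjpeg.h.

    #include <stdio.h>
    #include <cuda_runtime.h>
    #include "helper_nvJPEG.hxx"

    int main(void) {
      const int width = 64, height = 64;
      unsigned char *d_rgb = NULL;
      size_t pitch = 0;

      // Pitched interleaved RGB allocation (3 bytes per pixel), filled mid-gray.
      checkCudaErrors(cudaMallocPitch((void **)&d_rgb, &pitch, width * 3, height));
      checkCudaErrors(cudaMemset2D(d_rgb, pitch, 128, width * 3, height));

      if (writeBMPi("test_gray.bmp", d_rgb, (int)pitch, width, height) == 0) {
        printf("wrote test_gray.bmp\n");
      }

      checkCudaErrors(cudaFree(d_rgb));
      return 0;
    }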

+ 368 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_string.h

@@ -0,0 +1,368 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+  int string_start = 0;
+
+  while (string[string_start] == delimiter) {
+    string_start++;
+  }
+
+  if (string_start >= static_cast<int>(strlen(string) - 1)) {
+    return 0;
+  }
+
+  return string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension) {
+  int string_length = static_cast<int>(strlen(filename));
+
+  while (filename[string_length--] != '.') {
+    if (string_length == 0) break;
+  }
+
+  if (string_length > 0) string_length += 2;
+
+  if (string_length == 0)
+    *extension = NULL;
+  else
+    *extension = &filename[string_length];
+
+  return string_length;
+}
+
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+
+      const char *equal_pos = strchr(string_argv, '=');
+      int argv_length = static_cast<int>(
+          equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (length == argv_length &&
+          !STRNCASECMP(string_argv, string_ref, length)) {
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          *value = (T)atoi(&string_argv[length + auto_inc]);
+        }
+
+        bFound = true;
+        i = argc;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+  bool bFound = false;
+  int value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = atoi(&string_argv[length + auto_inc]);
+        } else {
+          value = 0;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+  bool bFound = false;
+  float value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = static_cast<float>(atof(&string_argv[length + auto_inc]));
+        } else {
+          value = 0.f;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      char *string_argv = const_cast<char *>(&argv[i][string_start]);
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        *string_retval = &string_argv[length + 1];
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (!bFound) {
+    *string_retval = NULL;
+  }
+
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+  // <executable_name> defines a variable that is replaced with the name of the
+  // executable
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files). The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc.
+  const char *searchPath[] = {
+      "./",                                          // same dir
+      "./data/",                                      // same dir
+      "../../../../Samples/<executable_name>/",       // up 4 in tree
+      "../../../Samples/<executable_name>/",          // up 3 in tree
+      "../../Samples/<executable_name>/",             // up 2 in tree
+      "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/<executable_name>/data/",        // up 2 in tree
+      "../../../../Common/data/",                     // up 4 in tree
+      "../../../Common/data/",                        // up 3 in tree
+      "../../Common/data/"                            // up 2 in tree
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+
+#else
+    // Linux & OSX path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find("<executable_name>");
+
+    // If there is executable_name variable in the searchPath
+    // replace it with the value
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path != 0) {
+        path.replace(executable_name_pos, strlen("<executable_name>"),
+                     executable_name);
+      } else {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists
+    path.append(filename);
+    FILE *fp;
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp != NULL) {
+      fclose(fp);
+      // File found
+      // returning an allocated array here for backwards compatibility reasons
+      char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+      STRCPY(file_path, path.length() + 1, path.c_str());
+      return file_path;
+    }
+
+    if (fp) {
+      fclose(fp);
+    }
+  }
+
+  // File not found
+  return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
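
To see how these parsers fit together, a minimal, hypothetical driver follows; the flag names (numElems, scale, input) are invented for illustration, and only helpers defined in this header are assumed:

    // Hypothetical driver for the command-line helpers above.
    // Invocation sketch:  ./app --numElems=1024 --scale=0.5 --input=data.txt
    #include <cstdio>
    #include <cstdlib>
    #include "helper_string.h"

    int main(int argc, char **argv) {
      int numElems = getCmdLineArgumentInt(argc, (const char **)argv, "numElems");
      float scale = getCmdLineArgumentFloat(argc, (const char **)argv, "scale");

      char *input = NULL;
      if (getCmdLineArgumentString(argc, (const char **)argv, "input", &input)) {
        // sdkFindFilePath returns a malloc'd copy (or NULL); the caller frees it.
        char *path = sdkFindFilePath(input, argv[0]);
        printf("numElems=%d scale=%.2f input=%s\n", numElems, scale,
               path ? path : "(not found)");
        free(path);
      }
      return 0;
    }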

+ 465 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/helper_timer.h

@@ -0,0 +1,465 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include <exception.h>
+
+// Definition of the StopWatch interface; this is used if we don't want to use
+// the CUT functions but rather a self-contained class interface
+class StopWatchInterface {
+ public:
+  StopWatchInterface() {}
+  virtual ~StopWatchInterface() {}
+
+ public:
+  //! Start time measurement
+  virtual void start() = 0;
+
+  //! Stop time measurement
+  virtual void stop() = 0;
+
+  //! Reset time counters to zero
+  virtual void reset() = 0;
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  virtual float getTime() = 0;
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface {
+ public:
+  //! Constructor, default
+  StopWatchWin()
+      : start_time(),
+        end_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0),
+        freq(0),
+        freq_set(false) {
+    if (!freq_set) {
+      // helper variable
+      LARGE_INTEGER temp;
+
+      // get the tick frequency from the OS
+      QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&temp));
+
+      // convert to type in which it is needed
+      freq = (static_cast<double>(temp.QuadPart)) / 1000.0;
+
+      // remember the query
+      freq_set = true;
+    }
+  }
+
+  // Destructor
+  ~StopWatchWin() {}
+
+ public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // member variables
+
+  //! Start of measurement
+  LARGE_INTEGER start_time;
+  //! End of measurement
+  LARGE_INTEGER end_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+
+  //! tick frequency
+  double freq;
+
+  //! flag if the frequency has been set
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and add the elapsed interval to the total_time
+//! summation variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&end_time));
+  diff_time = static_cast<float>(((static_cast<double>(end_time.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+
+  total_time += diff_time;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running) {
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running) {
+    LARGE_INTEGER temp;
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&temp));
+    retval += static_cast<float>(((static_cast<double>(temp.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <sys/time.h>
+#include <ctime>
+
+//! Linux/Mac OSX specific implementation of StopWatch
+class StopWatchLinux : public StopWatchInterface {
+ public:
+  //! Constructor, default
+  StopWatchLinux()
+      : start_time(),
+        diff_time(0.0),
+        total_time(0.0),
+        running(false),
+        clock_sessions(0) {}
+
+  // Destructor
+  virtual ~StopWatchLinux() {}
+
+ public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // helper functions
+
+  //! Get difference between start time and current time
+  inline float getDiffTime();
+
+ private:
+  // member variables
+
+  //! Start of measurement
+  struct timeval start_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+  gettimeofday(&start_time, 0);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and add the elapsed interval to the total_time
+//! summation variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+  diff_time = getDiffTime();
+  total_time += diff_time;
+  running = false;
+  clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running) {
+    gettimeofday(&start_time, 0);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running) {
+    retval += getDiffTime();
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+  struct timeval t_time;
+  gettimeofday(&t_time, 0);
+
+  // time difference in milli-seconds
+  return static_cast<float>(1000.0 * (t_time.tv_sec - start_time.tv_sec) +
+                            (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif  // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a timer has been created, otherwise false
+//! @param  timer_interface  receives the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
+// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  *timer_interface = reinterpret_cast<StopWatchInterface *>(new StopWatchWin());
+#else
+  *timer_interface =
+      reinterpret_cast<StopWatchInterface *>(new StopWatchLinux());
+#endif
+  return (*timer_interface != NULL) ? true : false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if the timer has been deleted, otherwise false
+//! @param  timer_interface  the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    delete *timer_interface;
+    *timer_interface = NULL;
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the timer
+//! @param  timer_interface  the timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->start();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the timer. Does not reset.
+//! @param  timer_interface  the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->stop();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param  timer_interface  the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
+  // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    (*timer_interface)->reset();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer divided by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param  timer_interface  the timer to return the average time of
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
+  //  printf("sdkGetAverageTimerValue called object %08x\n", (void
+  //  *)*timer_interface);
+  if (*timer_interface) {
+    return (*timer_interface)->getAverageTime();
+  } else {
+    return 0.0f;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param  timer_interface  the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
+  // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) {
+    return (*timer_interface)->getTime();
+  } else {
+    return 0.0f;
+  }
+}
+
+#endif  // COMMON_HELPER_TIMER_H_
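
A minimal sketch of the exported timer interface above; the three-run loop is illustrative and the workload is left as a placeholder:

    // Time a region of code with the sdk*Timer helpers defined above.
    #include <cstdio>
    #include "helper_timer.h"

    int main() {
      StopWatchInterface *timer = NULL;
      sdkCreateTimer(&timer);  // picks StopWatchWin or StopWatchLinux

      for (int run = 0; run < 3; run++) {
        sdkStartTimer(&timer);
        // ... workload to be timed goes here ...
        sdkStopTimer(&timer);  // each stop() finishes one averaging session
      }

      printf("total: %.3f ms, average per run: %.3f ms\n",
             sdkGetTimerValue(&timer), sdkGetAverageTimerValue(&timer));
      sdkDeleteTimer(&timer);
      return 0;
    }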

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/freeglut.lib


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/lib/x64/glew64.lib


+ 200 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/nvrtc_helper.h

@@ -0,0 +1,200 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_NVRTC_HELPER_H_
+
+#define COMMON_NVRTC_HELPER_H_ 1
+
+#include <cuda.h>
+#include <helper_cuda_drvapi.h>
+#include <nvrtc.h>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#define NVRTC_SAFE_CALL(Name, x)                                \
+  do {                                                          \
+    nvrtcResult result = x;                                     \
+    if (result != NVRTC_SUCCESS) {                              \
+      std::cerr << "\nerror: " << Name << " failed with error " \
+                << nvrtcGetErrorString(result);                 \
+      exit(1);                                                  \
+    }                                                           \
+  } while (0)
+
+void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResult,
+                      size_t *cubinResultSize, int requiresCGheaders) {
+  std::ifstream inputFile(filename,
+                          std::ios::in | std::ios::binary | std::ios::ate);
+
+  if (!inputFile.is_open()) {
+    std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
+    exit(1);
+  }
+
+  std::streampos pos = inputFile.tellg();
+  size_t inputSize = (size_t)pos;
+  char *memBlock = new char[inputSize + 1];
+
+  inputFile.seekg(0, std::ios::beg);
+  inputFile.read(memBlock, inputSize);
+  inputFile.close();
+  memBlock[inputSize] = '\x0';
+
+  int numCompileOptions = 0;
+
+  char *compileParams[2];
+
+  int major = 0, minor = 0;
+  char deviceName[256];
+
+  // Picks the best CUDA device available
+  CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
+
+  // get compute capabilities and the devicename
+  checkCudaErrors(cuDeviceGetAttribute(
+      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+  
+  {
+  // Compile the cubin for the GPU arch on which we are going to run the CUDA kernel.
+  std::string compileOptions;
+  compileOptions = "--gpu-architecture=sm_";
+
+  compileParams[numCompileOptions] = reinterpret_cast<char *>(
+                  malloc(sizeof(char) * (compileOptions.length() + 10)));
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  sprintf_s(compileParams[numCompileOptions], sizeof(char) * (compileOptions.length() + 10),
+            "%s%d%d", compileOptions.c_str(), major, minor);
+#else
+  snprintf(compileParams[numCompileOptions], compileOptions.size() + 10, "%s%d%d",
+           compileOptions.c_str(), major, minor);
+#endif
+  }
+
+  numCompileOptions++;
+
+  if (requiresCGheaders) {
+    std::string compileOptions;
+    char HeaderNames[256];
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
+#else
+    snprintf(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
+#endif
+
+    compileOptions = "--include-path=";
+
+    // sdkFindFilePath may return NULL; guard before constructing the string
+    char *header_dir = sdkFindFilePath(HeaderNames, argv[0]);
+    std::string path = header_dir ? header_dir : "";
+    free(header_dir);
+    if (!path.empty()) {
+      std::size_t found = path.find(HeaderNames);
+      path.erase(found);
+    } else {
+      printf(
+          "\nCooperativeGroups headers not found, please install them in the "
+          "%s sample directory.\nExiting...\n",
+          argv[0]);
+      exit(EXIT_FAILURE);  // the message announces an exit, so actually exit
+    }
+    compileOptions += path.c_str();
+    compileParams[numCompileOptions] = reinterpret_cast<char *>(
+        malloc(sizeof(char) * (compileOptions.length() + 1)));
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(compileParams[numCompileOptions], sizeof(char) * (compileOptions.length() + 1),
+              "%s", compileOptions.c_str());
+#else
+    // the buffer was allocated with length + 1, so leave room for the terminator
+    snprintf(compileParams[numCompileOptions], compileOptions.size() + 1, "%s",
+             compileOptions.c_str());
+#endif
+    numCompileOptions++;
+  }
+
+  // compile
+  nvrtcProgram prog;
+  NVRTC_SAFE_CALL("nvrtcCreateProgram",
+                  nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL));
+
+  nvrtcResult res = nvrtcCompileProgram(prog, numCompileOptions, compileParams);
+
+  // dump log
+  size_t logSize;
+  NVRTC_SAFE_CALL("nvrtcGetProgramLogSize",
+                  nvrtcGetProgramLogSize(prog, &logSize));
+  char *log = reinterpret_cast<char *>(malloc(sizeof(char) * logSize + 1));
+  NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));
+  log[logSize] = '\x0';
+
+  if (strlen(log) >= 2) {
+    std::cerr << "\n compilation log ---\n";
+    std::cerr << log;
+    std::cerr << "\n end log ---\n";
+  }
+
+  free(log);
+
+  NVRTC_SAFE_CALL("nvrtcCompileProgram", res);
+
+  size_t codeSize;
+  NVRTC_SAFE_CALL("nvrtcGetCUBINSize", nvrtcGetCUBINSize(prog, &codeSize));
+  char *code = new char[codeSize];
+  NVRTC_SAFE_CALL("nvrtcGetCUBIN", nvrtcGetCUBIN(prog, code));
+  *cubinResult = code;
+  *cubinResultSize = codeSize;
+
+  for (int i = 0; i < numCompileOptions; i++) {
+    free(compileParams[i]);
+  }
+}
+
+CUmodule loadCUBIN(char *cubin, int argc, char **argv) {
+  CUmodule module;
+  CUcontext context;
+  int major = 0, minor = 0;
+  char deviceName[256];
+
+  // Picks the best CUDA device available
+  CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
+
+  // get compute capabilities and the devicename
+  checkCudaErrors(cuDeviceGetAttribute(
+      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+  checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
+  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
+
+  checkCudaErrors(cuInit(0));
+  checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));
+
+  checkCudaErrors(cuModuleLoadData(&module, cubin));
+  delete[] cubin;  // allocated with new[] in compileFileToCUBIN, so use delete[]
+
+  return module;
+}
+
+#endif  // COMMON_NVRTC_HELPER_H_
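
A minimal sketch of the intended compile-then-load flow using only the two helpers above; the file name myKernel.cu and kernel name myKernel are placeholders:

    // Compile a .cu source with NVRTC and fetch a kernel from the cubin.
    #include <cuda.h>
    #include "nvrtc_helper.h"

    int main(int argc, char **argv) {
      char *cubin = NULL;
      size_t cubinSize = 0;

      // Compiles for the SM version of the best device found.
      compileFileToCUBIN((char *)"myKernel.cu", argc, argv, &cubin, &cubinSize,
                         /*requiresCGheaders=*/0);

      // loadCUBIN also creates the CUDA context and releases the cubin buffer.
      CUmodule module = loadCUBIN(cubin, argc, argv);

      CUfunction kernel;
      checkCudaErrors(cuModuleGetFunction(&kernel, module, "myKernel"));
      // ... set up arguments and launch with cuLaunchKernel ...
      return 0;
    }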

+ 124 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/rendercheck_d3d11.cpp

@@ -0,0 +1,124 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+//
+//  Utility funcs to wrap up saving a surface or the back buffer as a PPM file.
+//  In addition, wraps up a threshold comparison of two PPMs.
+//
+//  These functions are designed to be used to implement automated QA testing for SDK samples.
+//
+//  Author: Bryan Dudash
+//  Email: sdkfeedback@nvidia.com
+//
+// Copyright (c) NVIDIA Corporation. All rights reserved.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <helper_functions.h>
+#include <rendercheck_d3d11.h>
+
+HRESULT CheckRenderD3D11::ActiveRenderTargetToPPM(ID3D11Device *pDevice, const char *zFileName)
+{
+    ID3D11DeviceContext *pDeviceCtxt;
+    pDevice->GetImmediateContext(&pDeviceCtxt);
+    ID3D11RenderTargetView *pRTV = NULL;
+    pDeviceCtxt->OMGetRenderTargets(1,&pRTV,NULL);
+
+    ID3D11Resource *pSourceResource = NULL;
+    pRTV->GetResource(&pSourceResource);
+
+    return ResourceToPPM(pDevice,pSourceResource,zFileName);
+}
+
+HRESULT CheckRenderD3D11::ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName)
+{
+    ID3D11DeviceContext *pDeviceCtxt;
+    pDevice->GetImmediateContext(&pDeviceCtxt);
+    D3D11_RESOURCE_DIMENSION rType;
+    pResource->GetType(&rType);
+
+    if (rType != D3D11_RESOURCE_DIMENSION_TEXTURE2D)
+    {
+        printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n");
+        return E_FAIL;
+    }
+
+    ID3D11Texture2D *pSourceTexture = (ID3D11Texture2D *)pResource;
+    ID3D11Texture2D *pTargetTexture = NULL;
+
+    D3D11_TEXTURE2D_DESC desc;
+    pSourceTexture->GetDesc(&desc);
+    desc.BindFlags = 0;
+    desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+    desc.Usage = D3D11_USAGE_STAGING;
+
+    if (FAILED(pDevice->CreateTexture2D(&desc,NULL,&pTargetTexture)))
+    {
+        printf("SurfaceToPPM: Unable to create target Texture resoruce! Aborting... \n");
+        return E_FAIL;
+    }
+
+    pDeviceCtxt->CopyResource(pTargetTexture,pSourceTexture);
+
+    D3D11_MAPPED_SUBRESOURCE mappedTex2D;
+    pDeviceCtxt->Map(pTargetTexture, 0, D3D11_MAP_READ,0,&mappedTex2D);
+
+    // Need to convert from dx pitch to pitch=width
+    unsigned char *pPPMData = new unsigned char[desc.Width*desc.Height*4];
+
+    for (unsigned int iHeight = 0; iHeight<desc.Height; iHeight++)
+    {
+        memcpy(&(pPPMData[iHeight*desc.Width*4]),(unsigned char *)(mappedTex2D.pData)+iHeight*mappedTex2D.RowPitch,desc.Width*4);
+    }
+
+    pDeviceCtxt->Unmap(pTargetTexture, 0);
+
+    // Prepends the PPM header info and dumps the byte data afterwards
+    sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height);
+
+    delete [] pPPMData;
+    pTargetTexture->Release();
+
+    return S_OK;
+}
+
+bool CheckRenderD3D11::PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path,
+                                const float epsilon, const float threshold)
+{
+    char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+    if (ref_file_path == NULL)
+    {
+        printf("CheckRenderD3D11::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, exec_path);
+        printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file);
+        printf("Aborting comparison!\n");
+        printf("  FAILURE!\n");
+        return false;
+    }
+
+    return sdkComparePPM(src_file,ref_file_path,epsilon,threshold,true) == true;
+}

+ 52 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Common/rendercheck_d3d11.h

@@ -0,0 +1,52 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#ifndef _RENDERCHECK_D3D11_H_
+#define _RENDERCHECK_D3D11_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <d3d11.h>
+
+class CheckRenderD3D11
+{
+    public:
+
+        CheckRenderD3D11() {}
+
+        static HRESULT ActiveRenderTargetToPPM(ID3D11Device  *pDevice, const char *zFileName);
+        static HRESULT ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName);
+
+        static bool PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path,
+                             const float epsilon, const float threshold = 0.0f);
+};
+
+#endif
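
A minimal sketch (Windows/D3D11 builds only) of how these static helpers chain together for a regression check; the PPM file names are placeholders:

    // Dump the active render target and diff it against a stored reference.
    #include <rendercheck_d3d11.h>

    bool runRegressionCheck(ID3D11Device *device, const char *exePath) {
      if (FAILED(CheckRenderD3D11::ActiveRenderTargetToPPM(device, "output.ppm")))
        return false;

      // The reference image is located via sdkFindFilePath inside PPMvsPPM.
      return CheckRenderD3D11::PPMvsPPM("output.ppm", "reference.ppm", exePath,
                                        /*epsilon=*/0.30f, /*threshold=*/0.15f);
    }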

+ 337 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/Makefile

@@ -0,0 +1,337 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux Platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I./Common
+LIBRARIES :=
+
+################################################################################
+
+# Gencode arguments
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
+SMS ?= 70 72 75 80 86
+else
+SMS ?= 70 75 80 86
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ALL_CCFLAGS += --threads 0
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: p2pBandwidthLatencyTest
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+p2pBandwidthLatencyTest.o:p2pBandwidthLatencyTest.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+p2pBandwidthLatencyTest: p2pBandwidthLatencyTest.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./p2pBandwidthLatencyTest
+
+clean:
+	rm -f p2pBandwidthLatencyTest p2pBandwidthLatencyTest.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/p2pBandwidthLatencyTest
+
+clobber: clean

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest


+ 695 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu

@@ -0,0 +1,695 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cstdio>
+#include <vector>
+
+#include <helper_cuda.h>
+#include <helper_timer.h>
+
+using namespace std;
+
+const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test";
+
+typedef enum {
+  P2P_WRITE = 0,
+  P2P_READ = 1,
+} P2PDataTransfer;
+
+typedef enum {
+  CE = 0,
+  SM = 1,
+} P2PEngine;
+
+P2PEngine p2p_mechanism = CE;  // By default use Copy Engine
+
+// Macro for checking cuda errors following a cuda launch or api call
+#define cudaCheckError()                                       \
+  {                                                            \
+    cudaError_t e = cudaGetLastError();                        \
+    if (e != cudaSuccess) {                                    \
+      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
+             cudaGetErrorString(e));                           \
+      exit(EXIT_FAILURE);                                      \
+    }                                                          \
+  }
+__global__ void delay(volatile int *flag,
+                      unsigned long long timeout_clocks = 10000000) {
+  // Wait until the application notifies us that it has completed queuing up the
+  // experiment, or timeout and exit, allowing the application to make progress
+  long long int start_clock, sample_clock;
+  start_clock = clock64();
+
+  while (!*flag) {
+    sample_clock = clock64();
+
+    if (sample_clock - start_clock > timeout_clocks) {
+      break;
+    }
+  }
+}
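+
+// Host-side usage pattern (see outputBandwidthMatrix below): set *flag = 0,
+// launch delay<<<1, 1, 0, stream>>>(flag) to stall the stream, queue the timed
+// copies and timing events behind it, then set *flag = 1 to release the whole
+// batch at once so queuing overhead is excluded from the measured interval.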
+
+// This kernel is for demonstration purposes only, not a performant kernel for
+// p2p transfers.
+__global__ void copyp2p(int4 *__restrict__ dest, int4 const *__restrict__ src,
+                        size_t num_elems) {
+  size_t globalId = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t gridSize = blockDim.x * gridDim.x;
+
+#pragma unroll(5)
+  for (size_t i = globalId; i < num_elems; i += gridSize) {
+    dest[i] = src[i];
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Print help screen
+///////////////////////////////////////////////////////////////////////////
+void printHelp(void) {
+  printf("Usage:  p2pBandwidthLatencyTest [OPTION]...\n");
+  printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");
+  printf("\n");
+
+  printf("Options:\n");
+  printf("--help\t\tDisplay this help menu\n");
+  printf(
+      "--p2p_read\tUse P2P reads for data transfers between GPU pairs and show "
+      "corresponding results.\n \t\tDefault used is P2P write operation.\n");
+  printf("--sm_copy                      Use SM intiated p2p transfers instead of Copy Engine\n");
+  printf("--numElems=<NUM_OF_INT_ELEMS>  Number of integer elements to be used in p2p copy.\n");
+}
+
+void checkP2Paccess(int numGPUs) {
+  for (int i = 0; i < numGPUs; i++) {
+    cudaSetDevice(i);
+    cudaCheckError();
+
+    for (int j = 0; j < numGPUs; j++) {
+      int access;
+      if (i != j) {
+        cudaDeviceCanAccessPeer(&access, i, j);
+        cudaCheckError();
+        printf("Device=%d %s Access Peer Device=%d\n", i,
+               access ? "CAN" : "CANNOT", j);
+      }
+    }
+  }
+  printf(
+      "\n***NOTE: If a device doesn't have P2P access to another one, the "
+      "transfer falls back to the normal memcpy procedure.\nSo you may see "
+      "lower Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n");
+}
+
+void performP2PCopy(int *dest, int destDevice, int *src, int srcDevice,
+                    int num_elems, int repeat, bool p2paccess,
+                    cudaStream_t streamToRun) {
+  int blockSize = 0;
+  int numBlocks = 0;
+
+  cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p);
+  cudaCheckError();
+
+  if (p2p_mechanism == SM && p2paccess) {
+    for (int r = 0; r < repeat; r++) {
+      copyp2p<<<numBlocks, blockSize, 0, streamToRun>>>(
+          (int4 *)dest, (int4 *)src, num_elems / 4);
+    }
+  } else {
+    for (int r = 0; r < repeat; r++) {
+      cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice,
+                          sizeof(int) * num_elems, streamToRun);
+    }
+  }
+}
+
+void outputBandwidthMatrix(int numElems, int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
+  int repeat = 5;
+  volatile int *flag = NULL;
+  vector<int *> buffers(numGPUs);
+  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
+  vector<cudaEvent_t> start(numGPUs);
+  vector<cudaEvent_t> stop(numGPUs);
+  vector<cudaStream_t> stream(numGPUs);
+
+  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
+  cudaCheckError();
+
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
+    cudaMalloc(&buffers[d], numElems * sizeof(int));
+    cudaCheckError();
+    cudaMemset(buffers[d], 0, numElems * sizeof(int));
+    cudaCheckError();
+    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
+    cudaCheckError();
+    cudaMemset(buffersD2D[d], 0, numElems * sizeof(int));
+    cudaCheckError();
+    cudaEventCreate(&start[d]);
+    cudaCheckError();
+    cudaEventCreate(&stop[d]);
+    cudaCheckError();
+  }
+
+  vector<double> bandwidthMatrix(numGPUs * numGPUs);
+
+  for (int i = 0; i < numGPUs; i++) {
+    cudaSetDevice(i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      int access = 0;
+      if (p2p) {
+        cudaDeviceCanAccessPeer(&access, i, j);
+        if (access) {
+          cudaDeviceEnablePeerAccess(j, 0);
+          cudaCheckError();
+          cudaSetDevice(j);
+          cudaCheckError();
+          cudaDeviceEnablePeerAccess(i, 0);
+          cudaCheckError();
+          cudaSetDevice(i);
+          cudaCheckError();
+        }
+      }
+
+      cudaStreamSynchronize(stream[i]);
+      cudaCheckError();
+
+      // Block the stream until all the work is queued up
+      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
+      // room to push the operation, so keep the number of repetitions
+      // relatively low.  Higher repetition counts will cause the delay kernel
+      // to time out and lead to unstable results.
+      *flag = 0;
+      delay<<<1, 1, 0, stream[i]>>>(flag);
+      cudaCheckError();
+      cudaEventRecord(start[i], stream[i]);
+      cudaCheckError();
+
+      if (i == j) {
+        // Perform intra-GPU, D2D copies
+        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
+                       access, stream[i]);
+
+      } else {
+        if (p2p_method == P2P_WRITE) {
+          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
+                         stream[i]);
+        } else {
+          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
+                         stream[i]);
+        }
+      }
+
+      cudaEventRecord(stop[i], stream[i]);
+      cudaCheckError();
+
+      // Release the queued events
+      *flag = 1;
+      cudaStreamSynchronize(stream[i]);
+      cudaCheckError();
+
+      float time_ms;
+      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
+      double time_s = time_ms / 1e3;
+
+      double gb = numElems * sizeof(int) * repeat / (double)1e9;
+      if (i == j) {
+        gb *= 2;  // must count both the read and the write here
+      }
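+      // Worked example: with the default numElems = 40,000,000 ints, a
+      // single copy moves 0.16 GB, so repeat = 5 yields gb = 0.8 (1.6 for
+      // the intra-GPU case, where device memory is both read and written).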
+      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
+      if (p2p && access) {
+        cudaDeviceDisablePeerAccess(j);
+        cudaSetDevice(j);
+        cudaDeviceDisablePeerAccess(i);
+        cudaSetDevice(i);
+        cudaCheckError();
+      }
+    }
+  }
+
+  printf("   D\\D");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d ", j);
+  }
+
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d ", i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
+    }
+
+    printf("\n");
+  }
+
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaFree(buffers[d]);
+    cudaFree(buffersD2D[d]);
+    cudaCheckError();
+    cudaEventDestroy(start[d]);
+    cudaCheckError();
+    cudaEventDestroy(stop[d]);
+    cudaCheckError();
+    cudaStreamDestroy(stream[d]);
+    cudaCheckError();
+  }
+
+  cudaFreeHost((void *)flag);
+  cudaCheckError();
+}
+
+void outputBidirectionalBandwidthMatrix(int numElems, int numGPUs, bool p2p) {
+  int repeat = 5;
+  volatile int *flag = NULL;
+  vector<int *> buffers(numGPUs);
+  vector<int *> buffersD2D(numGPUs);
+  vector<cudaEvent_t> start(numGPUs);
+  vector<cudaEvent_t> stop(numGPUs);
+  vector<cudaStream_t> stream0(numGPUs);
+  vector<cudaStream_t> stream1(numGPUs);
+
+  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
+  cudaCheckError();
+
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaMalloc(&buffers[d], numElems * sizeof(int));
+    cudaMemset(buffers[d], 0, numElems * sizeof(int));
+    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
+    cudaMemset(buffersD2D[d], 0, numElems * sizeof(int));
+    cudaCheckError();
+    cudaEventCreate(&start[d]);
+    cudaCheckError();
+    cudaEventCreate(&stop[d]);
+    cudaCheckError();
+    cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking);
+    cudaCheckError();
+    cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking);
+    cudaCheckError();
+  }
+
+  vector<double> bandwidthMatrix(numGPUs * numGPUs);
+
+  for (int i = 0; i < numGPUs; i++) {
+    cudaSetDevice(i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      int access = 0;
+      if (p2p) {
+        cudaDeviceCanAccessPeer(&access, i, j);
+        if (access) {
+          cudaSetDevice(i);
+          cudaDeviceEnablePeerAccess(j, 0);
+          cudaCheckError();
+          cudaSetDevice(j);
+          cudaDeviceEnablePeerAccess(i, 0);
+          cudaCheckError();
+        }
+      }
+
+      cudaSetDevice(i);
+      cudaStreamSynchronize(stream0[i]);
+      cudaStreamSynchronize(stream1[j]);
+      cudaCheckError();
+
+      // Block the stream until all the work is queued up
+      // DANGER! - cudaMemcpy*Async may block indefinitely while waiting for
+      // room to push the operation, so keep the number of repetitions
+      // relatively low.  Too many repetitions will cause the delay kernel
+      // to time out and lead to unstable results.
+      *flag = 0;
+      cudaSetDevice(i);
+      // No need to block stream1 since it'll be blocked on stream0's event
+      delay<<<1, 1, 0, stream0[i]>>>(flag);
+      cudaCheckError();
+
+      // Force stream1 not to start until stream0 does, in order to ensure
+      // the events on stream0 fully encompass the time needed for all
+      // operations
+      cudaEventRecord(start[i], stream0[i]);
+      cudaStreamWaitEvent(stream1[j], start[i], 0);
+
+      if (i == j) {
+        // For intra-GPU perform 2 memcopies buffersD2D <-> buffers
+        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
+                       access, stream0[i]);
+        performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat,
+                       access, stream1[i]);
+      } else {
+        if (access && p2p_mechanism == SM) {
+          cudaSetDevice(j);
+        }
+        performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
+                       stream1[j]);
+        if (access && p2p_mechanism == SM) {
+          cudaSetDevice(i);
+        }
+        performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
+                       stream0[i]);
+      }
+
+      // Notify stream0 that stream1 is complete and record the time of
+      // the total transaction
+      cudaEventRecord(stop[j], stream1[j]);
+      cudaStreamWaitEvent(stream0[i], stop[j], 0);
+      cudaEventRecord(stop[i], stream0[i]);
+
+      // Release the queued operations
+      *flag = 1;
+      cudaStreamSynchronize(stream0[i]);
+      cudaStreamSynchronize(stream1[j]);
+      cudaCheckError();
+
+      float time_ms;
+      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
+      double time_s = time_ms / 1e3;
+
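+      // The leading 2.0 accounts for the two concurrent directions: stream0
+      // copies i -> j while stream1 copies j -> i, each moving numElems ints
+      // per repetition.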
+      double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9;
+      if (i == j) {
+        gb *= 2;  // must count both the read and the write here
+      }
+      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
+      if (p2p && access) {
+        cudaSetDevice(i);
+        cudaDeviceDisablePeerAccess(j);
+        cudaSetDevice(j);
+        cudaDeviceDisablePeerAccess(i);
+      }
+    }
+  }
+
+  printf("   D\\D");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d ", j);
+  }
+
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d ", i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
+    }
+
+    printf("\n");
+  }
+
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaFree(buffers[d]);
+    cudaFree(buffersD2D[d]);
+    cudaCheckError();
+    cudaEventDestroy(start[d]);
+    cudaCheckError();
+    cudaEventDestroy(stop[d]);
+    cudaCheckError();
+    cudaStreamDestroy(stream0[d]);
+    cudaCheckError();
+    cudaStreamDestroy(stream1[d]);
+    cudaCheckError();
+  }
+
+  cudaFreeHost((void *)flag);
+  cudaCheckError();
+}
+
+void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
+  int repeat = 100;
+  int numElems = 4;  // transfer a single int4
+  volatile int *flag = NULL;
+  StopWatchInterface *stopWatch = NULL;
+  vector<int *> buffers(numGPUs);
+  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
+  vector<cudaStream_t> stream(numGPUs);
+  vector<cudaEvent_t> start(numGPUs);
+  vector<cudaEvent_t> stop(numGPUs);
+
+  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
+  cudaCheckError();
+
+  if (!sdkCreateTimer(&stopWatch)) {
+    printf("Failed to create stop watch\n");
+    exit(EXIT_FAILURE);
+  }
+  sdkStartTimer(&stopWatch);
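+  // StopWatchInterface and the sdk*Timer helpers come from the CUDA samples'
+  // helper_timer.h; they provide the host-side (CPU) timing reported
+  // alongside the GPU event timing below.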
+
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
+    cudaMalloc(&buffers[d], sizeof(int) * numElems);
+    cudaMemset(buffers[d], 0, sizeof(int) * numElems);
+    cudaMalloc(&buffersD2D[d], sizeof(int) * numElems);
+    cudaMemset(buffersD2D[d], 0, sizeof(int) * numElems);
+    cudaCheckError();
+    cudaEventCreate(&start[d]);
+    cudaCheckError();
+    cudaEventCreate(&stop[d]);
+    cudaCheckError();
+  }
+
+  vector<double> gpuLatencyMatrix(numGPUs * numGPUs);
+  vector<double> cpuLatencyMatrix(numGPUs * numGPUs);
+
+  for (int i = 0; i < numGPUs; i++) {
+    cudaSetDevice(i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      int access = 0;
+      if (p2p) {
+        cudaDeviceCanAccessPeer(&access, i, j);
+        if (access) {
+          cudaDeviceEnablePeerAccess(j, 0);
+          cudaCheckError();
+          cudaSetDevice(j);
+          cudaDeviceEnablePeerAccess(i, 0);
+          cudaSetDevice(i);
+          cudaCheckError();
+        }
+      }
+      cudaStreamSynchronize(stream[i]);
+      cudaCheckError();
+
+      // Block the stream until all the work is queued up
+      // DANGER! - cudaMemcpy*Async may block indefinitely while waiting for
+      // room to push the operation, so keep the number of repetitions
+      // relatively low.  Too many repetitions will cause the delay kernel
+      // to time out and lead to unstable results.
+      *flag = 0;
+      delay<<<1, 1, 0, stream[i]>>>(flag);
+      cudaCheckError();
+      cudaEventRecord(start[i], stream[i]);
+
+      sdkResetTimer(&stopWatch);
+      if (i == j) {
+        // Perform intra-GPU, D2D copies
+        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
+                       access, stream[i]);
+      } else {
+        if (p2p_method == P2P_WRITE) {
+          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
+                         stream[i]);
+        } else {
+          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
+                         stream[i]);
+        }
+      }
+      float cpu_time_ms = sdkGetTimerValue(&stopWatch);
+
+      cudaEventRecord(stop[i], stream[i]);
+      // Now that the work has been queued up, release the stream
+      *flag = 1;
+      cudaStreamSynchronize(stream[i]);
+      cudaCheckError();
+
+      float gpu_time_ms;
+      cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]);
+
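+      // Average per-transfer latency in microseconds: elapsed_ms * 1e3 /
+      // repeat converts ms -> us and averages over the `repeat` copies.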
+      gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat;
+      cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat;
+      if (p2p && access) {
+        cudaDeviceDisablePeerAccess(j);
+        cudaSetDevice(j);
+        cudaDeviceDisablePeerAccess(i);
+        cudaSetDevice(i);
+        cudaCheckError();
+      }
+    }
+  }
+
+  printf("   GPU");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d ", j);
+  }
+
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d ", i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]);
+    }
+
+    printf("\n");
+  }
+
+  printf("\n   CPU");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d ", j);
+  }
+
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d ", i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]);
+    }
+
+    printf("\n");
+  }
+
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaFree(buffers[d]);
+    cudaFree(buffersD2D[d]);
+    cudaCheckError();
+    cudaEventDestroy(start[d]);
+    cudaCheckError();
+    cudaEventDestroy(stop[d]);
+    cudaCheckError();
+    cudaStreamDestroy(stream[d]);
+    cudaCheckError();
+  }
+
+  sdkDeleteTimer(&stopWatch);
+
+  cudaFreeHost((void *)flag);
+  cudaCheckError();
+}
+
+int main(int argc, char **argv) {
+  int numGPUs, numElems = 40000000;
+  P2PDataTransfer p2p_method = P2P_WRITE;
+
+  cudaGetDeviceCount(&numGPUs);
+  cudaCheckError();
+
+  // process command line args
+  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
+    printHelp();
+    return 0;
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) {
+    p2p_method = P2P_READ;
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) {
+    p2p_mechanism = SM;
+  }
+
+  // number of int elements to use in each copy.
+  if (checkCmdLineFlag(argc, (const char **)argv, "numElems")) {
+    numElems = getCmdLineArgumentInt(argc, (const char **)argv, "numElems");
+  }
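+
+  // Example invocation (binary name is illustrative; flag syntax follows the
+  // samples' helper_string.h parser):
+  //   ./p2pBandwidthLatencyTest --p2p_read --sm_copy --numElems=10000000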
+
+  printf("[%s]\n", sSampleName);
+
+  // output devices
+  for (int i = 0; i < numGPUs; i++) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, i);
+    cudaCheckError();
+    printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", i,
+           prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID);
+  }
+
+  checkP2Paccess(numGPUs);
+
+  // Check peer-to-peer connectivity
+  printf("P2P Connectivity Matrix\n");
+  printf("     D\\D");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d", j);
+  }
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d\t", i);
+    for (int j = 0; j < numGPUs; j++) {
+      if (i != j) {
+        int access;
+        cudaDeviceCanAccessPeer(&access, i, j);
+        cudaCheckError();
+        printf("%6d", (access) ? 1 : 0);
+      } else {
+        printf("%6d", 1);
+      }
+    }
+    printf("\n");
+  }
+
+  printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
+  outputBandwidthMatrix(numElems, numGPUs, false, P2P_WRITE);
+  printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n");
+  outputBandwidthMatrix(numElems, numGPUs, true, P2P_WRITE);
+  if (p2p_method == P2P_READ) {
+    printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n");
+    outputBandwidthMatrix(numElems, numGPUs, true, p2p_method);
+  }
+  printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
+  outputBidirectionalBandwidthMatrix(numElems, numGPUs, false);
+  printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n");
+  outputBidirectionalBandwidthMatrix(numElems, numGPUs, true);
+
+  printf("P2P=Disabled Latency Matrix (us)\n");
+  outputLatencyMatrix(numGPUs, false, P2P_WRITE);
+  printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n");
+  outputLatencyMatrix(numGPUs, true, P2P_WRITE);
+  if (p2p_method == P2P_READ) {
+    printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n");
+    outputLatencyMatrix(numGPUs, true, p2p_method);
+  }
+
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
+      "Results may vary when GPU Boost is enabled.\n");
+
+  exit(EXIT_SUCCESS);
+}

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.o


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi


+ 72 - 134
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi.cu

@@ -34,36 +34,10 @@
 #include <sstream>
 
 #include <omp.h>
-
-#ifdef HAVE_CUB
-#include <cub/block/block_reduce.cuh>
-#endif  // HAVE_CUB
-
-#ifdef USE_NVTX
 #include <nvToolsExt.h>
 
-const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff,
-                           0x0000ffff, 0x00ff0000, 0x00ffffff};
-const int num_colors = sizeof(colors) / sizeof(uint32_t);
-
-#define PUSH_RANGE(name, cid)                              \
-    {                                                      \
-        int color_id = cid;                                \
-        color_id = color_id % num_colors;                  \
-        nvtxEventAttributes_t eventAttrib = {0};           \
-        eventAttrib.version = NVTX_VERSION;                \
-        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;  \
-        eventAttrib.colorType = NVTX_COLOR_ARGB;           \
-        eventAttrib.color = colors[color_id];              \
-        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
-        eventAttrib.message.ascii = name;                  \
-        nvtxRangePushEx(&eventAttrib);                     \
-    }
-#define POP_RANGE nvtxRangePop();
-#else
-#define PUSH_RANGE(name, cid)
-#define POP_RANGE
-#endif
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
 
 #define CUDA_RT_CALL(call)                                                                  \
     {                                                                                       \
@@ -76,16 +50,14 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
                     #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
     }
 
-typedef float real;
-constexpr real tol = 1.0e-8;
+constexpr float tol = 1.0e-8;
 
-const real PI = 2.0 * std::asin(1.0);
+const float PI = 2.0 * std::asin(1.0);
 
-__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
-                                      const real pi, const int offset, const int nx,
-                                      const int my_ny, const int ny) {
+__global__ void initialize_boundaries(float* a_new, float* a, const float pi,
+                                      const int offset, const int nx,
+                                      const int my_ny, const int ny) {
     for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
-        const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
         a[iy * nx + 0] = y0;
         a[iy * nx + (nx - 1)] = y0;
         a_new[iy * nx + 0] = y0;
@@ -93,41 +65,40 @@ __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __re
     }
 }
 
-template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
-                              real* __restrict__ const l2_norm, const int iy_start,
-                              const int iy_end, const int nx, const bool calculate_norm) {
-#ifdef HAVE_CUB
-    typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
-        BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-#endif  // HAVE_CUB
+__global__ void jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
     int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
     int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
-    real local_l2_norm = 0.0;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;
 
     if (iy < iy_end && ix < (nx - 1)) {
-        const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
         a_new[iy * nx + ix] = new_val;
-        if (calculate_norm) {
-            real residue = new_val - a[iy * nx + ix];
-            local_l2_norm += residue * residue;
-        }
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    } else {
+        block_l2_sum[thread_index] = 0;
     }
-    if (calculate_norm) {
-#ifdef HAVE_CUB
-        real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm);
-        if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm);
-#else
-        atomicAdd(l2_norm, local_l2_norm);
-#endif  // HAVE_CUB
+    // Reduce L2 norm for the block in parallel
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if (thread_index % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
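+    // After log2(BLOCK_DIM_X*BLOCK_DIM_Y) doubling rounds, the per-thread
+    // residues have been folded into block_l2_sum[0].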
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
     }
 }
 
-template <typename T>
-T get_argval(char** begin, char** end, const std::string& arg, const T default_val) {
-    T argval = default_val;
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
     char** itr = std::find(begin, end, arg);
     if (itr != end && ++itr != end) {
         std::istringstream inbuf(*itr);
@@ -136,125 +107,92 @@ T get_argval(char** begin, char** end, const std::string& arg, const T default_v
     return argval;
 }
 
-bool get_arg(char** begin, char** end, const std::string& arg) {
-    char** itr = std::find(begin, end, arg);
-    if (itr != end) {
-        return true;
-    }
-    return false;
-}
-
-double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
-                  const int nccheck, const bool print);
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
 
 int main(int argc, char* argv[]) {
-    const int iter_max = get_argval<int>(argv, argv + argc, "-niter", 1000);
-    const int nccheck = get_argval<int>(argv, argv + argc, "-nccheck", 1);
-    const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
-    const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
-    const bool csv = get_arg(argv, argv + argc, "-csv");
-
-    if (nccheck != 1) {
-        fprintf(stderr, "Only nccheck = 1 is supported\n");
-        return -1;
-    }
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
 
     CUDA_RT_CALL(cudaSetDevice(0));
     CUDA_RT_CALL(cudaFree(0));
 
-    real* a_ref_h;
-    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
+    float* a_ref_h;
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
     
-    double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv);
+    double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
 
-    if (csv) {
-        printf("single_gpu, %d, %d, %d, %d, %f\n", nx, ny, iter_max, nccheck, runtime_serial);
-    } else {
-        printf("%dx%d: 1 GPU: %8.4f s\n", ny, nx, runtime_serial);
-    }
+    printf("%dx%d: 1 GPU: %8.4f s\n", ny, nx, runtime_serial);
 
     return 0;
 }
 
-double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
-                  const int nccheck, const bool print) {
-    real* a;
-    real* a_new;
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;
 
-    real* l2_norm_d;
-    real* l2_norm_h;
+    float* l2_norm_d;
+    float* l2_norm_h;
 
     int iy_start = 1;
     int iy_end = (ny - 1);
 
-    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real)));
-    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real)));
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
 
-    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real)));
-    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real)));
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));
 
     // Set Dirichlet boundary conditions on the left and right border
+    nvtxRangePush("Init boundaries");
     initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
     CUDA_RT_CALL(cudaGetLastError());
     CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();
 
-    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real)));
-    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real)));
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
 
     CUDA_RT_CALL(cudaDeviceSynchronize());
 
-    if (print)
-        printf(
-            "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with "
-            "norm "
-            "check every %d iterations\n",
-            iter_max, ny, nx, nccheck);
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, ny, nx);
 
-    constexpr int dim_block_x = 32;
-    constexpr int dim_block_y = 32;
-    dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
     int iter = 0;
-    bool calculate_norm;
-    real l2_norm = 1.0;
+    float l2_norm = 1.0;
 
     double start = omp_get_wtime();
-    PUSH_RANGE("Jacobi solve", 0)
+    nvtxRangePush("Jacobi Solve");
     while (l2_norm > tol && iter < iter_max) {
-        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(real)));
-
-        calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0));
-        jacobi_kernel<dim_block_x, dim_block_y>
-            <<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, 0>>>(
-                a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm);
-        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));
 
-        if (calculate_norm) {
-            CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost));
-        }
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
+        CUDA_RT_CALL(cudaGetLastError());
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));
 
         // Apply periodic boundary conditions
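+        // Row (iy_end - 1), the last interior row, wraps to halo row 0, and
+        // row iy_start wraps to halo row iy_end.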
 
-        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real),
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
                                      cudaMemcpyDeviceToDevice));
-        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real),
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
                                      cudaMemcpyDeviceToDevice));
 
-        if (calculate_norm) {
-	    CUDA_RT_CALL(cudaDeviceSynchronize());
-            //CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
-            l2_norm = *l2_norm_h;
-            l2_norm = std::sqrt(l2_norm);
-            if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
-        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);
 
-        std::swap(a_new, a);
         iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
     }
-    POP_RANGE
+    nvtxRangePop();
     double stop = omp_get_wtime();
 
-    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost));
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));
 
     CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
     CUDA_RT_CALL(cudaFree(l2_norm_d));

BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi_report.qdrep


BIN
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/jacobi_report.sqlite


+ 13 - 0
hpc/multi_gpu_nways/labs/CFD/English/C/source_code/single_gpu/temp

@@ -0,0 +1,13 @@
+Single GPU jacobi relaxation: 100 iterations on 8192 x 8192 mesh
+    0, 22.626005
+   10, 3.374940
+   20, 2.069380
+   30, 1.542849
+   40, 1.250118
+   50, 1.060773
+   60, 0.927187
+   70, 0.827260
+   80, 0.749264
+   90, 0.686587
+8192x8192: 1 GPU:  16.0760 s
+

+ 9 - 14
hpc/multi_gpu_nways/labs/CFD/English/introduction.ipynb

@@ -16,7 +16,7 @@
     "* Profiling the application using Nsight Systems and HPCToolkit\n",
     "* Applying optimizations like CUDA streams and overlapping compute and communication\n",
     "* Understanding GPUDirect technologies like P2P and RDMA\n",
-    "* Utilizing NVIDIA NCCL and NVSHMEM libraries"
+    "* Learning and using NVIDIA NCCL and NVSHMEM libraries"
    ]
   },
   {
@@ -38,21 +38,16 @@
     "\n",
     "We will take up the Jacobi Solver, an iterative technique for solving system of linear equations, in this tutorial. To begin, click on the first link below:\n",
     "\n",
-    "1. Overview of Jacobi Solver application\n",
-    "    * Review of single-GPU code\n",
-    "    * Parallelizing to multiple GPUs using cudaMemcpy\n",
-    "2. Profiling with NVTX and Nsight Systems\n",
-    "    * Profiling multi-GPU cudaMemcpy code\n",
-    "    * Using single-node CUDA-aware MPI\n",
-    "    * Optimizing CUDA-aware MPI with compute-copy overlap\n",
-    "3. Communication topology\n",
-    "    * Overview of intra-node and inter-node communication architecture\n",
-    "    * Benchmarking communication networks\n",
-    "4. Profiling with HPCToolkit\n",
-    "    * Analysis of GPUDirect P2P with single-node MPI\n",
-    "    * Analysis of GPUDirect RDMA with multi-node MPI\n",
+    "1. Single Node:\n",
+    "    * [Overview of single-GPU code and Nsight Systems Profiler](C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb)\n",
+    "    * [Multi-GPU using CUDA streams](C/jupyter_notebook/memcpy/streams.ipynb)\n",
+    "    * Multi-GPU using regular (non-CUDA-aware) and CUDA-aware MPI\n",
+    "2. Multi Node:\n",
+    "    * CUDA-aware MPI and introduction to HPCToolkit\n",
+    "    * Optimizations: computation-communication overlap\n",
     "3. NCCL Library\n",
     "4. NVSHMEM Library\n",
+    "5. Final remarks\n",
     "--- \n",
     "\n",
     "## Licensing \n",

+ 0 - 57
hpc/multi_gpu_nways/slurm-165592.out

@@ -1,57 +0,0 @@
-[I 21:04:51.121 NotebookApp] Authentication of /metrics is OFF, since other authentication is disabled.
-[W 21:04:51.412 NotebookApp] All authentication is disabled.  Anyone who can connect to this server will be able to run code.
-[I 21:04:51.414 NotebookApp] Serving notebooks from local directory: /home/anisaxena/multi_gpu_labs
-[I 21:04:51.414 NotebookApp] Jupyter Notebook 6.4.0 is running at:
-[I 21:04:51.414 NotebookApp] http://prm-dgx-05:8000/
-[I 21:04:51.414 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
-[I 21:05:36.430 NotebookApp] 302 GET / (10.33.12.57) 0.550000ms
-[W 21:05:53.513 NotebookApp] Notebook CFD/English/C/jupyter_notebook/jacobi/overview.ipynb is not trusted
-[I 21:05:54.029 NotebookApp] Kernel started: b7548e3a-38bd-48ee-9154-5e04bbfe6191, name: python3
-[I 21:05:56.940 NotebookApp] Kernel started: 205d247e-f2c4-466c-aa39-18b8184a3efa, name: python3
-[I 21:33:57.393 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 21:35:57.453 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 22:43:58.717 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 22:47:58.737 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:03:59.255 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:05:59.271 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:07:59.302 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:09:59.293 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:11:59.307 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:13:59.319 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:15:59.664 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:17:59.243 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:27:59.313 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:29:59.517 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:35:59.968 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:56:00.824 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 23:58:00.708 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:06:00.988 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:08:00.914 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:10:00.922 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:20:01.491 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:20:47.948 NotebookApp] New terminal with automatic name: 1
-[I 00:20:59.004 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:26:01.841 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:28:02.156 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:34:02.906 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:36:02.787 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:38:02.942 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:40:03.096 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:44:03.083 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 00:46:03.095 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 02:14:05.872 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 02:32:06.461 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 02:34:06.464 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 02:36:06.368 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 02:38:06.487 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 02:46:07.347 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 02:50:07.076 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 02:51:11.220 NotebookApp] Kernel interrupted: b7548e3a-38bd-48ee-9154-5e04bbfe6191
-[I 02:52:06.780 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 03:00:07.351 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 03:02:07.293 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 03:04:07.372 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 03:08:07.298 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 03:12:07.779 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 03:14:08.093 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb
-[I 03:16:07.857 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/jacobi/overview.ipynb

+ 72 - 0
hpc/multi_gpu_nways/slurm-171483.out

@@ -0,0 +1,72 @@
+[I 07:27:57.755 NotebookApp] Authentication of /metrics is OFF, since other authentication is disabled.
+[W 07:27:58.046 NotebookApp] All authentication is disabled.  Anyone who can connect to this server will be able to run code.
+[I 07:27:58.048 NotebookApp] Serving notebooks from local directory: /home/anisaxena/multi_gpu_labs
+[I 07:27:58.048 NotebookApp] Jupyter Notebook 6.4.0 is running at:
+[I 07:27:58.048 NotebookApp] http://prm-dgx-02:8000/
+[I 07:27:58.048 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
+[W 07:28:14.978 NotebookApp] Notebook CFD/English/C/jupyter_notebook/memcpy/streams.ipynb is not trusted
+[I 07:28:15.508 NotebookApp] 302 GET /notebooks/CFD/English/C/images/domain_decomposition.png (10.33.12.57) 2.790000ms
+[I 07:28:15.510 NotebookApp] 302 GET /notebooks/CFD/English/C/images/halo_exchange.png (10.33.12.57) 4.010000ms
+[I 07:28:15.792 NotebookApp] 302 GET /notebooks/CFD/English/C/images/nvidia_smi_p2p_gpu0.png (10.33.12.57) 1.610000ms
+[I 07:28:16.049 NotebookApp] 302 GET /notebooks/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png (10.33.12.57) 2.300000ms
+[I 07:28:16.050 NotebookApp] 302 GET /notebooks/CFD/English/C/images/open_terminal_session.png (10.33.12.57) 3.360000ms
+[I 07:28:16.052 NotebookApp] 302 GET /notebooks/CFD/English/C/images/nvidia_smi_topo_output.png (10.33.12.57) 4.480000ms
+[I 07:28:16.054 NotebookApp] 302 GET /notebooks/CFD/English/C/images/intra_node_topology_map.png (10.33.12.57) 1.630000ms
+[I 07:28:16.243 NotebookApp] Kernel started: 9a33717c-ba28-4b4b-8c87-b530192bd6ac, name: python3
+[I 07:28:18.294 NotebookApp] 302 GET /notebooks/CFD/English/C/images/memcpy_serialized.png (10.33.12.57) 1.630000ms
+[I 07:28:18.298 NotebookApp] 302 GET /notebooks/CFD/English/C/images/memcpy_host_staging.png (10.33.12.57) 2.040000ms
+[I 07:28:18.299 NotebookApp] 302 GET /notebooks/CFD/English/C/images/memcpyasync_parallel.png (10.33.12.57) 3.020000ms
+[I 07:28:18.303 NotebookApp] 302 GET /notebooks/CFD/English/C/images/memcpy_p2p_overview.png (10.33.12.57) 1.690000ms
+[I 07:28:18.593 NotebookApp] 302 GET /notebooks/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png (10.33.12.57) 1.820000ms
+[I 10:10:44.123 NotebookApp] Starting buffering for 9a33717c-ba28-4b4b-8c87-b530192bd6ac:9ce4d32b6afc44b2a01b996acfd6b876
+[I 10:10:49.409 NotebookApp] 302 GET /notebooks/CFD/English/C/images/gpu_programming_process.png (10.33.12.57) 1.900000ms
+[I 10:10:49.597 NotebookApp] 302 GET /notebooks/CFD/English/C/images/nsys_cli_sample_output.png (10.33.12.57) 1.840000ms
+[I 10:10:49.598 NotebookApp] 302 GET /notebooks/CFD/English/C/images/nsys_overview.png (10.33.12.57) 3.060000ms
+[I 10:10:49.669 NotebookApp] Kernel started: 353ea053-bd11-497d-984d-3acf0b5fa9d9, name: python3
+[I 10:10:49.832 NotebookApp] 302 GET /notebooks/CFD/English/C/images/nsys_single_gpu_analysis.png (10.33.12.57) 1.820000ms
+[W 10:10:55.797 NotebookApp] Notebook CFD/English/C/jupyter_notebook/memcpy/streams.ipynb is not trusted
+[I 10:10:56.047 NotebookApp] Kernel started: 693bd71d-d7bd-4ab7-a106-785f47b6815a, name: python3
+[I 10:10:56.294 NotebookApp] 302 GET /notebooks/CFD/English/C/images/domain_decomposition.png (10.33.12.57) 0.880000ms
+[I 10:10:56.298 NotebookApp] 302 GET /notebooks/CFD/English/C/images/halo_exchange.png (10.33.12.57) 0.960000ms
+[I 10:10:56.453 NotebookApp] 302 GET /notebooks/CFD/English/C/images/nvidia_smi_p2p_gpu0.png (10.33.12.57) 0.980000ms
+[I 10:10:56.602 NotebookApp] 302 GET /notebooks/CFD/English/C/images/open_terminal_session.png (10.33.12.57) 1.370000ms
+[I 10:10:56.603 NotebookApp] 302 GET /notebooks/CFD/English/C/images/nvidia_smi_topo_output.png (10.33.12.57) 1.990000ms
+[I 10:10:56.604 NotebookApp] 302 GET /notebooks/CFD/English/C/images/intra_node_topology_map.png (10.33.12.57) 1.260000ms
+[I 10:10:56.606 NotebookApp] 302 GET /notebooks/CFD/English/C/images/dgx1_8x_tesla_v100_topo.png (10.33.12.57) 2.130000ms
+[I 10:11:00.072 NotebookApp] 302 GET /notebooks/CFD/English/C/images/memcpy_serialized.png (10.33.12.57) 1.100000ms
+[I 10:11:00.076 NotebookApp] 302 GET /notebooks/CFD/English/C/images/memcpyasync_parallel.png (10.33.12.57) 0.960000ms
+[I 10:11:01.089 NotebookApp] 302 GET /notebooks/CFD/English/C/images/memcpy_p2p_overview.png (10.33.12.57) 1.770000ms
+[I 10:11:01.090 NotebookApp] 302 GET /notebooks/CFD/English/C/images/memcpy_host_staging.png (10.33.12.57) 2.430000ms
+[I 10:11:06.177 NotebookApp] 302 GET /notebooks/CFD/English/C/images/p2p_2_gpu_memcpy_nsys.png (10.33.12.57) 1.280000ms
+[I 10:12:57.183 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:14:05.943 NotebookApp] New terminal with automatic name: 1
+[I 10:14:57.651 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:18:58.217 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:20:58.794 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:31:00.416 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:33:00.633 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:37:00.660 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:41:00.683 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:43:00.637 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:45:00.647 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:47:01.029 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:53:01.992 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:55:02.004 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 10:59:02.238 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:01:02.352 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:03:02.263 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:05:02.282 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:07:02.488 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:09:03.016 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:11:03.133 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:13:02.939 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:15:02.546 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:17:02.454 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:19:02.570 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:21:02.281 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:23:02.287 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:24:32.755 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:24:36.811 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/single_gpu/single_gpu_overview.ipynb
+[I 11:24:38.215 NotebookApp] Saving file at /CFD/English/introduction.ipynb
+[I 11:27:02.398 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb
+[I 11:27:46.550 NotebookApp] Saving file at /CFD/English/C/jupyter_notebook/memcpy/streams.ipynb