Mozhgan K. Chimeh 4 anni fa
parent
commit
1b8d4e9727
22 ha cambiato i file con 130 aggiunte e 61 eliminazioni
  1. 2 2
      .gitignore
  2. 6 4
      hpc/nways/Dockerfile
  3. 8 4
      hpc/nways/Singularity
  4. 1 1
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/cudac/nways_cuda.ipynb
  5. BIN
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openacc_multicore_feedback.png
  6. BIN
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_feedback.png
  7. BIN
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_feedback_multicore.png
  8. BIN
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_gpu.png
  9. BIN
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_gpu_collapse.png
  10. BIN
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_multicore.png
  11. BIN
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/stdpar_gpu.png
  12. BIN
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/stdpar_multicore.png
  13. 20 8
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/openacc/nways_openacc.ipynb
  14. 63 18
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/openmp/nways_openmp.ipynb
  15. 0 7
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/serial/rdf_overview.ipynb
  16. 5 7
      hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/stdpar/nways_stdpar.ipynb
  17. 9 0
      hpc/nways/nways_labs/nways_MD/English/C/source_code/dataset.py
  18. 4 0
      hpc/nways/nways_labs/nways_MD/English/C/source_code/input/.gitignore
  19. 0 1
      hpc/nways/nways_labs/nways_MD/English/C/source_code/openacc/Makefile
  20. 1 1
      hpc/nways/nways_labs/nways_MD/English/C/source_code/serial/Makefile
  21. 5 4
      hpc/nways/nways_labs/nways_MD/English/nways_MD_start.ipynb
  22. 6 4
      hpc/nways/nways_labs/nways_start.ipynb

+ 2 - 2
.gitignore

@@ -1,4 +1,4 @@
 .ipynb_checkpoints
 */.ipynb_checkpoints/*
-**/input/
-input/
+alk.traj.dcd
+*.simg

+ 6 - 4
hpc/nways/Dockerfile

@@ -4,10 +4,10 @@
 # To run: $ sudo docker run --rm -it --runtime nvidia -p 8888:8888 openacc-labs:latest
 # Finally, open http://localhost:8888/
 
-FROM nvcr.io/nvidia/nvhpc:20.9-devel-ubuntu20.04
+FROM nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04
 
 RUN apt-get -y update && \
-        DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends python3-pip python3-setuptools nginx zip make build-essential && \
+        DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends python3-pip python3-setuptools nginx zip make build-essential libtbb-dev && \
         rm -rf /var/lib/apt/lists/* && \
         pip3 install --no-cache-dir jupyter &&\
         mkdir -p /home/openacc/labs
@@ -26,9 +26,11 @@ RUN apt-get update -y && \
 
 RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-2020.5.1 nsight-compute-2020.2.1 
 
+RUN python3 /labs/nways_MD/English/C/source_code/dataset.py
+
 #################################################
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/lib64/"
-ENV PATH="/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/include:$PATH"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/lib64/"
+ENV PATH="/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/include:$PATH"
 #################################################
 
 ADD nways_labs/ /labs

+ 8 - 4
hpc/nways/Singularity

@@ -1,13 +1,13 @@
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 
 Bootstrap: docker
-FROM: nvcr.io/nvidia/nvhpc:20.9-devel-ubuntu20.04
+FROM: nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04
 
 %environment
     export XDG_RUNTIME_DIR=
     export PATH="$PATH:/usr/local/bin:/opt/anaconda3/bin:/usr/bin"
     export PATH=/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:$PATH
-    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/lib64/"
+    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/lib64/"
 
 %post
     build_tmp=$(mktemp -d) && cd ${build_tmp}
@@ -18,14 +18,18 @@ FROM: nvcr.io/nvidia/nvhpc:20.9-devel-ubuntu20.04
 	    m4 vim-nox emacs-nox nano zip\
  	    python3-pip python3-setuptools git-core inotify-tools \
 	    curl git-lfs \
-	    build-essential
+	    build-essential libtbb-dev
     rm -rf /var/lib/apt/cache/* 
 
     pip3 install --upgrade pip
     pip3 install --no-cache-dir jupyter
+    pip3 install gdown
 
     apt-get install --no-install-recommends -y build-essential 
 
+    python3 /labs/nways_MD/English/C/source_code/dataset.py
+
+
# NVIDIA nsight-systems-2020.5.1, nsight-compute-2020.2.1
     apt-get update -y   
     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg wget
@@ -38,7 +42,7 @@ FROM: nvcr.io/nvidia/nvhpc:20.9-devel-ubuntu20.04
     wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 
     bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3 
     rm Miniconda3-latest-Linux-x86_64.sh 
-
+    
     cd /
     rm -rf ${build_tmp}
 

+ 1 - 1
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/cudac/nways_cuda.ipynb

@@ -248,7 +248,7 @@
    "outputs": [],
    "source": [
     "#compile for Tesla GPU\n",
-    "!cd ../../source_code/cudac && nvcc -O3 -w -ldl -o rdf rdf.cu -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/include"
+    "!cd ../../source_code/cudac && nvcc -o rdf rdf.cu"
    ]
   },
   {

BIN
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openacc_multicore_feedback.png


BIN
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_feedback.png


BIN
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_feedback_multicore.png


BIN
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_gpu.png


BIN
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_gpu_collapse.png


BIN
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/openmp_multicore.png


BIN
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/stdpar_gpu.png


BIN
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/images/stdpar_multicore.png


+ 20 - 8
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/openacc/nways_openacc.ipynb

@@ -150,14 +150,20 @@
    "outputs": [],
    "source": [
     "#Compile the code for multicore\n",
-    "!cd ../../source_code/openacc && nvc++ -acc -O3 -w -ta=multicore -Minfo=accel -o rdf rdf.cpp -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/include"
+    "!cd ../../source_code/openacc && nvc++ -acc -ta=multicore -Minfo=accel -o rdf rdf.cpp -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/include"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's run the executable and validate the output first. "
+    "Let's inspect part of the compiler feedback and see what it's telling us (your compiler feedback will be similar to the below screenshot).\n",
+    "\n",
+    "<img src=\"../images/openacc_multicore_feedback.png\">\n",
+    "\n",
+    "You can see from *Line 177* that the compiler is generating multicore code (`177, Generating Multicore code`). It is very important to inspect the feedback to make sure the compiler is doing what you have asked of it. \n",
+    "\n",
+    "Let's run the executable and validate the output first. Then, profile the code."
    ]
   },
   {
@@ -229,7 +235,7 @@
    "outputs": [],
    "source": [
     "#compile for Tesla GPU\n",
-    "!cd ../../source_code/openacc && nvc++ -acc -O3 -w -ldl -ta=tesla:managed,lineinfo  -Minfo=accel -o rdf rdf.cpp -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/include "
+    "!cd ../../source_code/openacc && nvc++ -acc -ta=tesla:managed,lineinfo  -Minfo=accel -o rdf rdf.cpp"
    ]
   },
   {
@@ -357,7 +363,7 @@
    "outputs": [],
    "source": [
     "#compile for Tesla GPU\n",
-    "!cd ../../source_code/openacc && nvc++ -acc -O3 -w -ldl -ta=tesla:managed,lineinfo  -Minfo=accel -o rdf rdf.cpp -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/include "
+    "!cd ../../source_code/openacc && nvc++ -acc -ta=tesla:managed,lineinfo  -Minfo=accel -o rdf rdf.cpp"
    ]
   },
   {
@@ -419,7 +425,7 @@
    "outputs": [],
    "source": [
     "#compile for Tesla GPU\n",
-    "!cd ../../source_code/openacc && nvc++ -acc -O3 -w -ldl -ta=tesla:managed,lineinfo  -Minfo=accel -o rdf rdf.cpp -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/include "
+    "!cd ../../source_code/openacc && nvc++ -acc -ta=tesla:managed,lineinfo -Minfo=accel -o rdf rdf.cpp"
    ]
   },
   {
@@ -544,7 +550,7 @@
    "outputs": [],
    "source": [
     "#compile for Tesla GPU without managed memory\n",
-    "!cd ../../source_code/openacc && make clean && make"
+    "!cd ../../source_code/openacc && nvc++ -acc -ta=tesla,lineinfo -Minfo=accel -o rdf rdf.cpp"
    ]
   },
   {
@@ -632,15 +638,21 @@
    "metadata": {},
    "source": [
     "**After** executing the above zip command, you should be able to download the zip file [here](../nways_files.zip). Let us now go back to parallelizing our code using other approaches.\n",
-    "\n",
+    "<!--\n",
     "**IMPORTANT**: If you would like to continue and optimize this application further with OpenACC, please click on the **NEXT** button, otherwise click on **HOME** to go back to the main notebook for *N ways of GPU programming for MD* code.\n",
+    "-->\n",
+    "\n",
+    "**IMPORTANT**: Please click on the **HOME** button to go back to the main notebook for *N ways of GPU programming for MD* code.\n",
     "\n",
     "-----\n",
     "\n",
-    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../../nways_MD_start.ipynb>HOME</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style=\"float:center\"> <a href=nways_openacc_opt.ipynb>NEXT</a></span> </p>\n",
+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../../nways_MD_start.ipynb>HOME</a></p>\n",
     "\n",
     "-----\n",
     "\n",
+    "<!-- <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../../nways_MD_start.ipynb>HOME</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style=\"float:center\"> <a href=nways_openacc_opt.ipynb>NEXT</a></span> </p>\n",
+    "-->\n",
+    "\n",
     "\n",
     "# Links and Resources\n",
     "[OpenACC API guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",

File diff suppressed because it is too large
+ 63 - 18
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/openmp/nways_openmp.ipynb


+ 0 - 7
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/serial/rdf_overview.ipynb

@@ -103,13 +103,6 @@
     "\n",
     "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

+ 5 - 7
hpc/nways/nways_labs/nways_MD/English/C/jupyter_notebook/stdpar/nways_stdpar.ipynb

@@ -134,8 +134,7 @@
     "\n",
     "`-stdpar` : This flag tells the compiler to enable Parallel STL for the respective target\n",
     "- `stdpar=multicore` will allow us to compile our code for a multicore\n",
-    "- `stdpar` will allow us to compile our code for a NVIDIA GPU (Default is NVIDIA)\n",
-    "          "
+    "- `stdpar` will allow us to compile our code for a NVIDIA GPU (Default is NVIDIA)"
    ]
   },
   {
@@ -145,7 +144,7 @@
    "outputs": [],
    "source": [
     "#Compile the code for multicore\n",
-    "! $NVCPPPATH -stdpar=multicore -std=c++11 -DUSE_COUNTING_ITERATOR -I/pgi/Linux_x86_64/cuda/10.2/include -L/pgi/Linux_x86_64/cuda/10.2/lib64 -lnvToolsExt rdf.cpp"
+    "!cd ../../source_code/stdpar && nvc++ -std=c++17 -stdpar=multicore -o rdf rdf.cpp -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/include -ltbb"
    ]
   },
   {
@@ -193,7 +192,7 @@
    "source": [
     "Let's checkout the profiler's report. [Download the profiler output](../../source_code/stdpar/rdf_stdpar_multicore.qdrep) and open it via the GUI. Have a look at the example expected profiler report below:\n",
     "\n",
-    "<img src=\"../images/.png\">\n",
+    "<img src=\"../images/stdpar_multicore.png\">\n",
     "\n",
     "\n",
     "### Compile and run for Nvidia GPU\n",
@@ -217,7 +216,7 @@
    "outputs": [],
    "source": [
     "#compile for Tesla GPU\n",
-    "!cd ../../source_code/stdpar && nvc++ -O3 -w -ldl -stdpar -std=c++11 -DUSE_COUNTING_ITERATOR -o rdf rdf.cpp "
+    "!cd ../../source_code/stdpar && nvc++ -std=c++17 -stdpar=gpu -o rdf rdf.cpp "
    ]
   },
   {
@@ -269,8 +268,7 @@
     "\n",
     "If you inspect the output of the profiler closer, you can see the usage of *Unified Memory* annotated with green rectangle which was explained in previous sections.\n",
     "\n",
-    "\n",
-    "<img src=\"../images/stdpar_um.png\">\n",
+    "Moreover, if you compare the NVTX marker `Pair_Calculation` (from the NVTX row) in both the multicore and GPU versions, you can see how much improvement you achieved. In the *example screenshot*, we were able to reduce that range from 1.52 seconds to 225.8 milliseconds.\n",
     "\n",
     "Feel free to checkout the [solution](../../source_code/stdpar/SOLUTION/rdf.cpp) to help you understand better or compare your implementation with the sample solution."
    ]

+ 9 - 0
hpc/nways/nways_labs/nways_MD/English/C/source_code/dataset.py

@@ -0,0 +1,9 @@
+# Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
+
+import gdown
+import os
+
+## alk.traj.dcd input file 
+url = 'https://drive.google.com/uc?id=1WZ0rtXZ-uMLfy7htT0gaU4EQ_Rq61QTF&export=download'
+output = '/labs/nways_MD/English/C/source_code/input/alk.traj.dcd'
+gdown.download(url, output, quiet=False,proxy=None)

+ 4 - 0
hpc/nways/nways_labs/nways_MD/English/C/source_code/input/.gitignore

@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore

+ 0 - 1
hpc/nways/nways_labs/nways_MD/English/C/source_code/openacc/Makefile

@@ -1,7 +1,6 @@
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 
 CC := nvc++
-CFLAGS := -O3 -w 
 ACCFLAGS := -ta=tesla:managed,lineinfo -Minfo=accel
 
 rdf: rdf.cpp

+ 1 - 1
hpc/nways/nways_labs/nways_MD/English/C/source_code/serial/Makefile

@@ -3,7 +3,7 @@
 CC := nvc++
 CFLAGS := -O3 -w -ldl
 ACCFLAGS := -Minfo=accel
-NVTXLIB := -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.9/cuda/11.0/include
+NVTXLIB := -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/include
 
 rdf: rdf.cpp
 	${CC} ${CFLAGS} ${ACCFLAGS} -o rdf rdf.cpp ${NVTXLIB} 

+ 5 - 4
hpc/nways/nways_labs/nways_MD/English/nways_MD_start.ipynb

@@ -29,7 +29,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**IMPORTANT**: Before we start please download the input file needed for this application from the [Google drive](https://drive.google.com/drive/folders/1aQ_MFyrjBIDMhCczse0S2GQ36MlR6Q_s?usp=sharing) and upload it to the input folder. From the top menu, click on *File*, and *Open* and navigate to `C/source_code/input` directory and copy paste the downloaded input file (`alk.traj.dcd`).\n",
+    "<!--**IMPORTANT**: Before we start please download the input file needed for this application from the [Google drive](https://drive.google.com/drive/folders/1aQ_MFyrjBIDMhCczse0S2GQ36MlR6Q_s?usp=sharing) and upload it to the input folder. From the top menu, click on *File*, and *Open* and navigate to `C/source_code/input` directory and copy paste the downloaded input file (`alk.traj.dcd`).-->\n",
     "\n",
     "\n",
     "### Tutorial Outline\n",
@@ -44,8 +44,8 @@
     "    - How to use NVTX APIs\n",
     "    - Introduction to Nsight Compute\n",
     "    - Optimization Steps to parallel programming \n",
-    "1. [OpenACC](C/jupyter_notebook/openacc/nways_openacc.ipynb) , [OpenACC Advanced](C/jupyter_notebook/openacc/nways_openacc_opt.ipynb)\n",
-    "2. [Kokkos](C/jupyter_notebook/kokkos/nways_kokkos.ipynb)\n",
+    "1. [OpenACC](C/jupyter_notebook/openacc/nways_openacc.ipynb)<!-- , [OpenACC Advanced](C/jupyter_notebook/openacc/nways_openacc_opt.ipynb)-->\n",
+    "<!--2. [Kokkos](C/jupyter_notebook/kokkos/nways_kokkos.ipynb)-->\n",
     "3. [stdpar](C/jupyter_notebook/stdpar/nways_stdpar.ipynb)\n",
     "4. [OpenMP](C/jupyter_notebook/openmp/nways_openmp.ipynb) \n",
     "5. [CUDA C](C/jupyter_notebook/cudac/nways_cuda.ipynb) "
@@ -64,8 +64,9 @@
     "Beginner, Intermediate\n",
     "\n",
     "### Target Audience and Prerequisites\n",
-    "The target audience for this lab is ..\n",
+    "The target audience for this lab is researchers/graduate students and developers who are interested in learning about various ways of programming GPUs to accelerate their scientific applications.\n",
     "\n",
+    "Basic experience with C/C++ programming is needed. No GPU programming knowledge is required.\n",
     "\n",
     "-----\n",
     "\n",

+ 6 - 4
hpc/nways/nways_labs/nways_start.ipynb

@@ -7,8 +7,8 @@
     "## N Ways to GPU Programming\n",
     "\n",
     "## Learning objectives\n",
-    "With the release of CUDA in 2007, different approaches to programming GPUs have evolved. Each approach has its own advantages and disadvantages. By the end of this bootcamp session, students will have a broader perspective on GPU programming approaches to help them select a programming model that better fits their applications' needs and constraints. The bootcamp will teach how to accelerate a popular algorithm of Radial Distribution Function (RDF) using the following methods:\n",
-    "* C++ Standard: pSTL\n",
+    "With the release of CUDA in 2007, different approaches to programming GPUs have evolved. Each approach has its own advantages and disadvantages. By the end of this bootcamp session, students will have a broader perspective on GPU programming approaches to help them select a programming model that better fits their applications' needs and constraints. The bootcamp will teach how to accelerate a real-world scientific application using the following methods:\n",
+    "* C++ Standard: stdpar\n",
     "* Directives: OpenACC, OpenMP\n",
     "* Frameworks: Kokkos\n",
     "* Lower level C Construct: CUDA C\n",
@@ -41,13 +41,15 @@
    "metadata": {},
    "source": [
     "### Tutorial Duration\n",
-    "The lab material will be presented in a 4hr session. Link to material is available for download at the end of the lab.\n",
+    "The lab material will be presented in an 8hr session. Link to material is available for download at the end of the lab.\n",
     "\n",
     "### Content Level\n",
     "Beginner, Intermediate\n",
     "\n",
     "### Target Audience and Prerequisites\n",
-    "The target audience for this lab is ..\n",
+    "The target audience for this lab is researchers/graduate students and developers who are interested in learning about various ways of programming GPUs to accelerate their scientific applications.\n",
+    "\n",
+    "Basic experience with C/C++ programming is needed. No GPU programming knowledge is required. \n",
     "\n",
     "--- \n",
     "\n",