@@ -143,7 +143,7 @@
"outputs": [],
"source": [
"#Compile the code for multicore\n",
- "!cd ../../source_code/openacc && nvfortran -acc -ta=multicore -Minfo=accel -o rdf nvtx.f90 rdf.f90 -I/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/include -L/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/lib64 -lnvToolsExt"
+ "!cd ../../source_code/openacc && nvfortran -acc -ta=multicore -Minfo=accel -o rdf nvtx.f90 rdf.f90 -I/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/include -L/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/lib64 -lnvToolsExt"
]
},
{
@@ -201,7 +201,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's checkout the profiler's report. [Download the profiler output](../../source_code/openacc/rdf_multicore.qdrep) and open it via the GUI. From the timeline view, checkout the NVTX markers displays as part of threads. **Why are we using NVTX?** Please see the section on [Using NVIDIA Tools Extension (NVTX)](../profiling-c.ipynb#Using-NVIDIA-Tools-Extension-(NVTX)).\n",
+ "Let's check out the profiler's report. [Download the profiler output](../../source_code/openacc/rdf_multicore.qdrep) and open it via the GUI. From the timeline view, check out the NVTX markers displayed as part of the threads. **Why are we using NVTX?** Please see the section on [Using NVIDIA Tools Extension (NVTX)](../../../../../profiler/English/jupyter_notebook/profiling.ipynb#Using-NVIDIA-Tools-Extension-(NVTX)).\n",
"\n",
- "From the timeline view, right click on the nvtx row and click the \"show in events view\". Now you can see the nvtx statistic at the bottom of the window which shows the duration of each range. \n",
+ "From the timeline view, right-click on the NVTX row and click \"show in events view\". Now you can see the NVTX statistics at the bottom of the window, which show the duration of each range.\n",
"\n",
@@ -233,7 +233,7 @@
"outputs": [],
"source": [
"#compile for Tesla GPU\n",
- "!cd ../../source_code/openacc && nvfortran -acc -ta=tesla:managed,lineinfo -Minfo=accel -o rdf nvtx.f90 rdf.f90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/lib64 -lnvToolsExt"
+ "!cd ../../source_code/openacc && nvfortran -acc -ta=tesla:managed,lineinfo -Minfo=accel -o rdf nvtx.f90 rdf.f90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/lib64 -lnvToolsExt"
]
},
{
@@ -256,8 +256,7 @@
|
|
|
"\n",
|
|
|
"- Using `-ta=tesla:managed`, instruct the compiler to build for an NVIDIA Tesla GPU using \"CUDA Managed Memory\"\n",
|
|
|
"- Using `-Minfo` command-line option, we will see all output from the compiler. In this example, we use `-Minfo=accel` to only see the output corresponding to the accelerator (in this case an NVIDIA GPU).\n",
|
|
|
- "- The first line of the output, `round(float)`, tells us which function the following information is in reference to.\n",
|
|
|
- "- The line starting with 177, shows we created a parallel OpenACC loop. This loop is made up of gangs (a grid of blocks in CUDA language) and vector parallelism (threads in CUDA language) with the vector size being 128 per gang. `99, acc loop gang, vector(128) /* blockIdx.x threadIdx.x */`\n",
|
|
|
+ "- The line starting with 97, shows we created a parallel OpenACC loop. This loop is made up of gangs (a grid of blocks in CUDA language) and vector parallelism (threads in CUDA language) with the vector size being 128 per gang. `99, acc loop gang, vector(128) /* blockIdx.x threadIdx.x */`\n",
|
|
|
"- The rest of the information concerns data movement. Compiler detected possible need to move data and handled it for us. We will get into this later in this lab.\n",
|
|
|
"\n",
|
|
|
"It is very important to inspect the feedback to make sure the compiler is doing what you have asked of it. Now, let's profile the code."
|
|
@@ -323,7 +322,7 @@
|
|
|
"\n",
|
|
|
"| Compiler | Latest Version | Maintained by | Full or Partial Support |\n",
|
|
|
"| --- | --- | --- | --- |\n",
|
|
|
- "| HPC SDK| 20.11 | NVIDIA HPC SDK | Full 2.5 spec |\n",
|
|
|
+ "| HPC SDK| 21.3 | NVIDIA HPC SDK | Full 2.5 spec |\n",
|
|
|
"| GCC | 10 | Mentor Graphics, SUSE | 2.0 spec, Limited Kernel directive support, No Unified Memory |\n",
|
|
|
"| CCE| latest | Cray | 2.0 Spec | \n"
|
|
|
]
|
|
@@ -365,7 +364,7 @@
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
"#compile for Tesla GPU\n",
|
|
|
- "!cd ../../source_code/openacc && nvfortran -acc -ta=tesla:managed,lineinfo -Minfo=accel -o rdf nvtx.f90 rdf.f90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/lib64 -lnvToolsExt"
|
|
|
+ "!cd ../../source_code/openacc && nvfortran -acc -ta=tesla:managed,lineinfo -Minfo=accel -o rdf nvtx.f90 rdf.f90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/lib64 -lnvToolsExt"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -398,9 +397,19 @@
|
|
|
"\n",
|
|
|
"If you only replaced the parallel directive with kernels (meaning only wrapping the loop with `!$acc kernels`), then the compiler feedback will look similar to below:\n",
|
|
|
"\n",
|
|
|
- "<img src=\"../images/kernel_feedback.png\">\n",
|
|
|
+ "```\n",
|
|
|
+ "rdf:\n",
|
|
|
+ " 97, Generating implicit copyin(y(iconf,:),z(iconf,:),x(iconf,:)) [if not already present]\n",
|
|
|
+ " Generating implicit copy(g(:)) [if not already present]\n",
|
|
|
+ " 99, Loop carried dependence due to exposed use of g(:) prevents parallelization\n",
|
|
|
+ " Accelerator serial kernel generated\n",
|
|
|
+ " Generating Tesla code\n",
|
|
|
+ " 99, !$acc loop seq\n",
|
|
|
+ " 101, !$acc loop seq\n",
|
|
|
+ " 101, Loop carried dependence due to exposed use of g(:) prevents parallelization\n",
|
|
|
+ "```\n",
|
|
|
"\n",
|
|
|
- "The line starting with 179, shows we created a serial kernel and the following loops will run in serial. When we use kernel directives, we let the compiler make decisions for us. In this case, the compiler thinks loop are not safe to parallelise due to dependency.\n",
|
|
|
+ "The line starting with 99, shows we created a serial kernel and the following loops will run in serial. When we use kernel directives, we let the compiler make decisions for us. In this case, the compiler thinks loop are not safe to parallelise due to dependency.\n",
|
|
|
"\n",
|
|
|
"### OpenACC Independent Clause\n",
|
|
|
"\n",
|
|
@@ -426,7 +435,7 @@
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
"#compile for Tesla GPU\n",
|
|
|
- "!cd ../../source_code/openacc && nvfortran -acc -ta=tesla:managed,lineinfo -Minfo=accel -o rdf nvtx.f90 rdf.f90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/lib64 -lnvToolsExt"
|
|
|
+ "!cd ../../source_code/openacc && nvfortran -acc -ta=tesla:managed,lineinfo -Minfo=accel -o rdf nvtx.f90 rdf.f90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/lib64 -lnvToolsExt"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -435,7 +444,16 @@
|
|
|
"source": [
|
|
|
"Let's inspect the compiler feedback and see if it does what we expect it to do now. You should get a compiler feedback similar to the below:\n",
|
|
|
"\n",
|
|
|
- "<img src=\"../images/kernel_indep_feedback.png\">\n",
|
|
|
+ "```\n",
|
|
|
+ "rdf:\n",
|
|
|
+ " 97, Generating implicit copyin(y(iconf,:),z(iconf,:),x(iconf,:)) [if not already present]\n",
|
|
|
+ " Generating implicit copy(g(:)) [if not already present]\n",
|
|
|
+ " 99, Loop is parallelizable\n",
|
|
|
+ " 101, Loop is parallelizable\n",
|
|
|
+ " Generating Tesla code\n",
|
|
|
+ " 99, !$acc loop gang, vector(128) collapse(2) ! blockidx%x threadidx%x\n",
|
|
|
+ " 101, ! blockidx%x threadidx%x auto-collapsed\n",
|
|
|
+ "```\n",
|
|
|
"\n",
|
|
|
"We can see that the compiler knows that the loop is parallelisable (`99, Loop is parallelizable`). Note that the loop is parallelized using vector(128) which that the compiler generated instructions for chunk of data of length 128 (vector size being 128 per gang) `99, acc loop gang, vector(128) /* blockIdx.x threadIdx.x */`\n",
|
|
|
"\n",
|
|
@@ -549,7 +567,7 @@
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
"#compile for Tesla GPU without managed memory\n",
|
|
|
- "!cd ../../source_code/openacc && nvfortran -acc -ta=tesla,lineinfo -Minfo=accel -o rdf nvtx.f90 rdf.f90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/20.11/cuda/11.0/lib64 -lnvToolsExt"
|
|
|
+ "!cd ../../source_code/openacc && nvfortran -acc -ta=tesla,lineinfo -Minfo=accel -o rdf nvtx.f90 rdf.f90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/lib64 -lnvToolsExt"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -558,9 +576,18 @@
|
|
|
"source": [
|
|
|
"Let us start inspecting the compiler feedback and see if it applied the optimizations. Below is the screenshot of expected compiler feedback after adding the `data` directives. \n",
|
|
|
"\n",
|
|
|
- "<img src=\"../images/data_feedback.png\">\n",
|
|
|
+ "```\n",
|
|
|
+ "rdf:\n",
|
|
|
+ " 95, Generating copy(g(:)) [if not already present]\n",
|
|
|
+ " Generating copyin(y(y$sd8:(y$sd8-1)+y$sd8,y$sd8:(y$sd8-1)+y$sd8),z(z$sd7:(z$sd7-1)+z$sd7,z$sd7:(z$sd7-1)+z$sd7),x(x$sd9:(x$sd9-1)+x$sd9,x$sd9:(x$sd9-1)+x$sd9)) [if not already present]\n",
|
|
|
+ " 98, Generating Tesla code\n",
|
|
|
+ " 99, !$acc loop gang, vector(128) ! blockidx%x threadidx%x\n",
|
|
|
+ " 100, !$acc loop seq\n",
|
|
|
+ " 100, Loop carried dependence of g prevents parallelization\n",
|
|
|
+ " Loop carried backward dependence of g prevents vectorization\n",
|
|
|
+ "```\n",
|
|
|
"\n",
|
|
|
- "You can see that on line 182, compiler is generating default present for `d_g2`, `d_x`,`d_z`, and `d_y` arrays. In other words, it is assuming that data is present on the GPU and it only copies data to the GPU only if the data do not exist.\n",
|
|
|
+ "You can see that on line 95, compiler is generating default present for `g2`, `x`,`z`, and `y` arrays. In other words, it is assuming that data is present on the GPU and it only copies data to the GPU only if the data do not exist.\n",
|
|
|
"\n",
|
|
|
"\n",
|
|
|
"Make sure to validate the output by running the executable and validate the output. "
|