|
@@ -0,0 +1,603 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
|
|
|
+ "\n",
|
|
|
+    "# Numba Lab 1: Numba For CUDA GPU\n",
|
|
|
+ "---\n",
|
|
|
+ "\n",
|
|
|
+ "## Learning Objectives\n",
|
|
|
+ "- **The goal of this lab is to:**\n",
|
|
|
+ " - enable you to quickly start using Numba (beginner to advanced level)\n",
|
|
|
+ " - teach you to apply the concepts of CUDA GPU programming to HPC field(s); and\n",
|
|
|
+ " - show you how to achieve computational speedup on GPUs to maximize the throughput of your HPC implementation.\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "Before we begin, let's execute the cell below to display information about the CUDA driver and GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by clicking on it with your mouse, and pressing Ctrl-Enter, or pressing the play button in the toolbar above. You should see some output returned below the grey cell."
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "!nvidia-smi"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ " \n",
|
|
|
+ "## Introduction\n",
|
|
|
+    "- Numba is a just-in-time (JIT) compiler for Python that works best on code that uses NumPy arrays, NumPy functions, and loops. Numba provides a set of decorators that are placed above user-defined functions to control how they are compiled (a minimal `@jit` sketch follows below). \n",
|
|
|
+    "- Numba supports the CUDA GPU programming model: a decorated function written in Python is compiled into a CUDA kernel to speed up execution. \n",
|
|
|
+    "- A kernel written with Numba has direct access to NumPy arrays, which are transferred automatically between the host (CPU) and the device (GPU). \n",
|
|
|
+ "\n",
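+    "For example, a minimal CPU-side sketch of the basic `@jit` decorator (for illustration only; the rest of this lab uses the CUDA decorators):\n",
+    "\n",
+    "```python\n",
+    "from numba import jit\n",
+    "import numpy as np\n",
+    "\n",
+    "@jit(nopython=True)   # compiled to machine code on the first call\n",
+    "def sum_array(a):\n",
+    "    total = 0.0\n",
+    "    for x in a:\n",
+    "        total += x\n",
+    "    return total\n",
+    "\n",
+    "print(sum_array(np.arange(10.0)))   # 45.0\n",
+    "```\n",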
|
|
|
+ "\n",
|
|
|
+ "### Definition of Terms\n",
|
|
|
+ "- The CPU is called a **Host**. \n",
|
|
|
+ "- The GPU is called a **Device**.\n",
|
|
|
+ "- A GPU function launched by the host and executed on the device is called a **Kernel**.\n",
|
|
|
+    "- A GPU function that executes on the device and can only be called from the device is called a **Device function**.\n",
|
|
|
+ "\n",
|
|
|
+ "### Note\n",
|
|
|
+    "- It is recommended to visit the official NVIDIA documentation and read through the [CUDA C Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide), because most CUDA programming features exposed by Numba map directly to CUDA C. \n",
|
|
|
+ "- Numba does not implement these CUDA features:\n",
|
|
|
+ " - dynamic parallelism\n",
|
|
|
+ " - texture memory\n",
|
|
|
+ "\n",
|
|
|
+ "## CUDA Kernel\n",
|
|
|
+    "- In CUDA, code is executed by hundreds or thousands of threads in a single run, so a solution is modeled around the following thread hierarchy: \n",
|
|
|
+    "  - **Grid**: A kernel is executed as a grid, which is a collection of thread blocks. \n",
|
|
|
+    "  - **Thread Block**: A collection of threads that can cooperate via shared memory. Each thread is executed by a core.\n",
|
|
|
+    "  - **Thread**: The single execution unit that runs the kernel code on the GPU.\n",
|
|
|
+ "- Numba exposes three kinds of GPU memory: \n",
|
|
|
+ " - global device memory \n",
|
|
|
+ " - shared memory \n",
|
|
|
+ " - local memory \n",
|
|
|
+    "- Memory access patterns should be chosen carefully to keep bandwidth contention to a minimum; a minimal sketch of declaring shared and local memory follows below.\n",
|
|
|
+ "\n",
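+    "The sketch below (an illustration added for clarity, not one of the lab exercises) shows how a kernel can declare shared and local memory; the shape of both must be a compile-time constant:\n",
+    "\n",
+    "```python\n",
+    "from numba import cuda, float32\n",
+    "\n",
+    "TPB = 128  # threads per block; shared-array shapes must be compile-time constants\n",
+    "\n",
+    "@cuda.jit\n",
+    "def smem_copy(array_in, array_out):\n",
+    "    tile = cuda.shared.array(shape=TPB, dtype=float32)    # shared: one tile per block\n",
+    "    scratch = cuda.local.array(shape=4, dtype=float32)    # local: private to each thread\n",
+    "\n",
+    "    tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
+    "    if tid < array_in.size:\n",
+    "        tile[cuda.threadIdx.x] = array_in[tid]\n",
+    "    cuda.syncthreads()   # wait until every thread in the block has filled its slot\n",
+    "    scratch[0] = 2.0\n",
+    "    if tid < array_out.size:\n",
+    "        array_out[tid] = scratch[0] * tile[cuda.threadIdx.x]\n",
+    "```\n",
+    "\n",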
|
|
|
+ " <img src=\"../images/thread_blocks.JPG\"/> <img src=\"../images/memory_architecture.png\"/> \n",
|
|
|
+ "\n",
|
|
|
+ "### Kernel Declaration\n",
|
|
|
+    "- A kernel function is a GPU function that is called from CPU code. Launching it requires specifying the number of blocks and the number of threads per block, and it cannot explicitly return a value; results are written to arrays passed in as arguments. \n",
|
|
|
+    "- Once compiled, a kernel can be launched multiple times with different numbers of blocks per grid and threads per block.\n",
|
|
|
+ "\n",
|
|
|
+ "Example:\n",
|
|
|
+ "\n",
|
|
|
+ "```python\n",
|
|
|
+ "@cuda.jit\n",
|
|
|
+ "def arrayAdd(array_A, array_B, array_out):\n",
|
|
|
+    "    ...  # kernel body goes here\n",
|
|
|
+ "```\n",
|
|
|
+ "###### Kernel Invocation\n",
|
|
|
+ "- A kernel is typically launched in the following way:\n",
|
|
|
+ "```python\n",
|
|
|
+ "threadsperblock = 128\n",
|
|
|
+ "N = array_out.size\n",
|
|
|
+    "blockspergrid = (N + (threadsperblock - 1)) // threadsperblock\n",
|
|
|
+ "arrayAdd[blockspergrid, threadsperblock](array_A, array_B, array_out)\n",
|
|
|
+ "```\n",
|
|
|
+ "\n",
|
|
|
+ "###### Choosing Block Size\n",
|
|
|
+ "- The block size determines how many threads share a given area of the shared memory.\n",
|
|
|
+    "- The block size should be a multiple of the warp size (32 threads) and large enough to keep the device's compute units busy; a small device-query sketch follows below. See more details [here](https://docs.nvidia.com/cuda/cuda-c-programming-guide/).\n",
|
|
|
+ "\n",
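+    "A short sketch (assuming the device-attribute names `WARP_SIZE` and `MAX_THREADS_PER_BLOCK` exposed by `cuda.get_current_device()`) of checking the hardware limits before picking a block size:\n",
+    "\n",
+    "```python\n",
+    "import numba.cuda as cuda\n",
+    "\n",
+    "device = cuda.get_current_device()\n",
+    "print(\"warp size            :\", device.WARP_SIZE)              # typically 32\n",
+    "print(\"max threads per block:\", device.MAX_THREADS_PER_BLOCK)  # typically 1024\n",
+    "\n",
+    "# a common choice: a multiple of the warp size, well below the maximum\n",
+    "threadsperblock = 128\n",
+    "```\n",
+    "\n",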
|
|
|
+ "### Thread Positioning \n",
|
|
|
+ "- When running a kernel, the kernel function’s code is executed by every thread once. Therefore, it is important to uniquely identify distinct threads.\n",
|
|
|
+    "- The basic way to determine a thread's position in the grid and block is to compute the corresponding array position manually:\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "<img src=\"../images/thread_position.png\"/>\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "```python\n",
|
|
|
+ "threadsperblock = 128\n",
|
|
|
+ "N = array_out.size\n",
|
|
|
+ "\n",
|
|
|
+ "@cuda.jit\n",
|
|
|
+ "def arrayAdd(array_A, array_B, array_out):\n",
|
|
|
+ " tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
|
|
|
+ " if tid < N: #Check array boundaries\n",
|
|
|
+ " array_out[tid] = array_A[tid] + array_B[tid]\n",
|
|
|
+ "\n",
|
|
|
+    "# Unless you are sure the total number of threads (blocks * threads per block) exactly matches your array size, you must check boundaries as shown above. \n",
|
|
|
+ "```\n",
|
|
|
+ "### Example 1: Addition on 1D-Arrays\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import numba.cuda as cuda\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "\n",
|
|
|
+ "N = 500000\n",
|
|
|
+ "threadsperblock = 1000\n",
|
|
|
+ "\n",
|
|
|
+ "@cuda.jit()\n",
|
|
|
+ "def arrayAdd(array_A, array_B, array_out):\n",
|
|
|
+ " tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
|
|
|
+ " if tid < N:\n",
|
|
|
+ " array_out[tid] = array_A[tid] + array_B[tid]\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ " \n",
|
|
|
+ "array_A = np.arange(N, dtype=np.int32)\n",
|
|
|
+ "array_B = np.arange(N, dtype=np.int32)\n",
|
|
|
+ "array_out = np.zeros(N, dtype=np.int32)\n",
|
|
|
+ "\n",
|
|
|
+    "blockpergrid = (N + (threadsperblock - 1)) // threadsperblock\n",
|
|
|
+ "\n",
|
|
|
+ "arrayAdd[blockpergrid, threadsperblock](array_A, array_B, array_out)\n",
|
|
|
+ "\n",
|
|
|
+ "print(\"result: {} \".format(array_out))"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "**From Example 1:** \n",
|
|
|
+    "> - N is the size of the arrays, and the number of threads in a single block is 1000.\n",
|
|
|
+    "> - The **cuda.jit()** decorator indicates that the function below it (arrayAdd) is a CUDA kernel that runs in parallel on the device. The **tid** is a unique index for each thread in the grid: \n",
|
|
|
+ ">> **tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x**.\n",
|
|
|
+    "> - **array_A** and **array_B** are the input data, while **array_out** is the output array and is preloaded with zeros.\n",
|
|
|
+    "> - The statement **blockpergrid = (N + (threadsperblock - 1)) // threadsperblock** computes the number of blocks per grid. This rounding-up division is commonly used as the default formula for estimating the number of blocks per grid in GPU programming documentation.\n",
|
|
|
+    "> - **arrayAdd[blockpergrid, threadsperblock](array_A, array_B, array_out)** launches the kernel **arrayAdd** with the number of blocks per grid and the number of threads per block in square brackets, while the kernel arguments are passed in parentheses.\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "### Matrix Multiplication on 2D Array \n",
|
|
|
+ "\n",
|
|
|
+ "<img src=\"../images/2d_array.png\"/>\n",
|
|
|
+ "\n",
|
|
|
+ "<img src=\"../images/2d_col_mult.png\"/>\n",
|
|
|
+ "\n",
|
|
|
+ "> **Note**\n",
|
|
|
+    "> - **Approach 2** would not be possible if the matrix size exceeded the maximum number of threads per block on the device, while **Approach 1** would continue to work. Current GPUs have a maximum of 1024 threads per thread block. \n",
|
|
|
+ "\n",
|
|
|
+ "### Example 2: Matrix multiplication "
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import numba.cuda as cuda\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "import math\n",
|
|
|
+ "\n",
|
|
|
+ "N = 4\n",
|
|
|
+ "@cuda.jit()\n",
|
|
|
+ "def MatrixMul2D(array_A, array_B, array_out):\n",
|
|
|
+ " row, col = cuda.grid(2)\n",
|
|
|
+ " if row < array_out.shape[0] and col < array_out.shape[1]:\n",
|
|
|
+ " for k in range(N):\n",
|
|
|
+    "            array_out[row, col] += array_A[row, k] * array_B[k, col]\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "array_A = np.array([[0,0,0,0],[1,1,1,1],[2,2,2,2],[3,3,3,3]], dtype=np.int32)\n",
|
|
|
+ "array_B = np.array([[0,1,2,3],[0,1,2,3],[0,1,2,3],[0,1,2,3]], dtype=np.int32)\n",
|
|
|
+ "array_out = np.zeros(N*N, dtype=np.int32).reshape(N, N)\n",
|
|
|
+ "\n",
|
|
|
+ "threadsperblock = (2,2)\n",
|
|
|
+ "blockpergrid_x = (math.ceil( N / threadsperblock[0]))\n",
|
|
|
+ "blockpergrid_y = (math.ceil( N / threadsperblock[1]))\n",
|
|
|
+ "blockpergrid = (blockpergrid_x, blockpergrid_y)\n",
|
|
|
+ "\n",
|
|
|
+ "MatrixMul2D[blockpergrid,threadsperblock](array_A, array_B, array_out)\n",
|
|
|
+ "\n",
|
|
|
+ "print(\"array_A:\\n {}\\n\".format(array_A))\n",
|
|
|
+ "print(\"array_B:\\n {}\\n\".format(array_B))\n",
|
|
|
+ "print(\"array_A * array_B:\\n {}\".format(array_out))\n",
|
|
|
+ "\n",
|
|
|
+ "#Note\n",
|
|
|
+    "# cuda.grid(2) returns the absolute thread position in the x and y (row & col) directions of the grid\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "### Example 3: A 225 × 225 Matrix Multiplication"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "import math\n",
+    "\n",
+    "N = 225\n",
|
|
|
+ "\n",
|
|
|
+ "@cuda.jit()\n",
|
|
|
+ "def MatrixMul2D(array_A, array_B, array_out):\n",
|
|
|
+ " x, y = cuda.grid(2)\n",
|
|
|
+ " if x < array_out.shape[0] and y < array_out.shape[1]:\n",
|
|
|
+ " for k in range(N):\n",
|
|
|
+    "            array_out[x, y] += array_A[x, k] * array_B[k, y]\n",
|
|
|
+ "\n",
|
|
|
+ "threadsperblock = (25,25)\n",
|
|
|
+    "array_A = np.arange((N*N), dtype=np.int64).reshape(N,N)  # int64: some of the products would overflow int32\n",
|
|
|
+    "array_B = np.arange((N*N), dtype=np.int64).reshape(N,N)\n",
|
|
|
+    "array_out = np.zeros((N*N), dtype=np.int64).reshape(N,N)\n",
|
|
|
+ "\n",
|
|
|
+ "blockpergrid_x = (math.ceil( N / threadsperblock[0]))\n",
|
|
|
+ "blockpergrid_y = (math.ceil( N / threadsperblock[1]))\n",
|
|
|
+ "blockpergrid = (blockpergrid_x, blockpergrid_y)\n",
|
|
|
+ "\n",
|
|
|
+ "MatrixMul2D[blockpergrid,threadsperblock](array_A, array_B, array_out)\n",
|
|
|
+ "\n",
|
|
|
+ "print(array_out)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "### Thread Reuse \n",
|
|
|
+ "\n",
|
|
|
+    "- It is possible to specify fewer threads than data elements and reuse each thread to process several elements. This is one of the approaches used when the data to be computed is larger than the number of threads that can be launched on the device. \n",
|
|
|
+    "- Inside a while loop, each thread strides through the data with the statement ***tid += cuda.blockDim.x * cuda.gridDim.x*** (a grid-stride loop).\n",
|
|
|
+    "- The example below illustrates thread reuse; a deliberately small number of threads is specified to show that the approach still covers the whole array. \n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "#### Example 4: "
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import numba.cuda as cuda\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "\n",
|
|
|
+ "N = 500000\n",
|
|
|
+ "threadsperblock = 1000\n",
|
|
|
+ "\n",
|
|
|
+ "@cuda.jit\n",
|
|
|
+ "def arrayAdd(array_A, array_B, array_out):\n",
|
|
|
+ " tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
|
|
|
+ " while tid < N:\n",
|
|
|
+ " array_out[tid] = array_A[tid] + array_B[tid]\n",
|
|
|
+ " tid += cuda.blockDim.x * cuda.gridDim.x\n",
|
|
|
+ "\n",
|
|
|
+ "array_A = np.arange(N, dtype=np.int32)\n",
|
|
|
+ "array_B = np.arange(N, dtype=np.int32)\n",
|
|
|
+ "array_out = np.zeros(N, dtype=np.int32)\n",
|
|
|
+ "\n",
|
|
|
+ "arrayAdd[1, threadsperblock](array_A, array_B, array_out)\n",
|
|
|
+ "\n",
|
|
|
+ "print(\"result: {} \".format(array_out))\n",
|
|
|
+ "\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "> **Note**\n",
|
|
|
+    "> - The task in **Example 4** is the same as in **Example 1**, but with a limited number of threads specified; the same result is still achieved. \n",
|
|
|
+    "> - Note that a launch can still create more threads than required; for instance, the rounding-up formula for blocks per grid can delegate up to one extra block of threads. The boundary check inside the loop keeps those extra threads from accessing memory out of range.\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## Memory Management\n",
|
|
|
+ "\n",
|
|
|
+ "### Data Transfer \n",
|
|
|
+    "- When a kernel is launched with NumPy arrays, Numba automatically transfers them to the device and copies the results back to the host.\n",
|
|
|
+    "- To avoid unnecessary transfers (for example, copying read-only input arrays back to the host), the following APIs can be used to control the transfers manually (see also the allocation sketch after the snippets below).\n",
|
|
|
+ "\n",
|
|
|
+ "##### 1. Copy host to device\n",
|
|
|
+ "```python\n",
|
|
|
+ "import numba.cuda as cuda\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "\n",
|
|
|
+ "N = 500000\n",
|
|
|
+    "h_A = np.arange(N, dtype=np.int32)\n",
|
|
|
+    "h_B = np.arange(N, dtype=np.int32)\n",
|
|
|
+    "h_C = np.zeros(N, dtype=np.int32)\n",
|
|
|
+ "\n",
|
|
|
+ "d_A = cuda.to_device(h_A)\n",
|
|
|
+ "d_B = cuda.to_device(h_B)\n",
|
|
|
+ "d_C = cuda.to_device(h_C)\n",
|
|
|
+ "```\n",
|
|
|
+ "##### 2. Enqueue the transfer to a stream\n",
|
|
|
+ "```python\n",
|
|
|
+    "h_A = np.arange(N, dtype=np.int32)\n",
|
|
|
+ "stream = cuda.stream()\n",
|
|
|
+ "d_A = cuda.to_device(h_A, stream=stream)\n",
|
|
|
+ "```\n",
|
|
|
+ "##### 3. Copy device to host / enqueue the transfer to a stream \n",
|
|
|
+ "```python\n",
|
|
|
+ "h_C = d_C.copy_to_host()\n",
|
|
|
+ "h_C = d_C.copy_to_host(stream=stream)\n",
|
|
|
+ "```\n",
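+    "An output buffer that is only written on the GPU does not need to be copied from the host at all. A short sketch (using `cuda.device_array_like` from `numba.cuda`) of allocating it directly on the device:\n",
+    "```python\n",
+    "# allocate d_C on the device with the same shape and dtype as h_C, without copying any data\n",
+    "d_C = cuda.device_array_like(h_C)\n",
+    "```\n",
+    "\n",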
|
|
|
+ "### Example 5: Data Movement "
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import numba.cuda as cuda\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "N = 200\n",
|
|
|
+ "threadsperblock = 25\n",
|
|
|
+ "\n",
|
|
|
+ "@cuda.jit\n",
|
|
|
+ "def arrayAdd(d_A, d_B, d_C):\n",
|
|
|
+ " tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
|
|
|
+ " if tid < N:\n",
|
|
|
+ " d_C[tid] = d_A[tid] + d_B[tid]\n",
|
|
|
+ " \n",
|
|
|
+ "h_A = np.arange(N, dtype=np.int32)\n",
|
|
|
+ "h_B = np.arange(N, dtype=np.int32)\n",
|
|
|
+ "h_C = np.zeros(N, dtype=np.int32)\n",
|
|
|
+ "\n",
|
|
|
+ "d_A = cuda.to_device(h_A)\n",
|
|
|
+ "d_B = cuda.to_device(h_B)\n",
|
|
|
+ "d_C = cuda.to_device(h_C)\n",
|
|
|
+ "\n",
|
|
|
+    "blockpergrid = (N + (threadsperblock - 1)) // threadsperblock\n",
|
|
|
+ "arrayAdd[blockpergrid, threadsperblock](d_A, d_B, d_C)\n",
|
|
|
+ "\n",
|
|
|
+ "h_C = d_C.copy_to_host()\n",
|
|
|
+ "print(h_C)\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## Atomic Operation\n",
|
|
|
+ "\n",
|
|
|
+    "- Atomic operations are required when multiple threads attempt to modify the same portion of memory. \n",
|
|
|
+    "- Typical examples include simultaneous withdrawals from one bank account through several ATMs, or a large number of threads modifying a particular index of an array based on certain condition(s).\n",
|
|
|
+    "- The atomic operations presently implemented in Numba include:\n",
|
|
|
+ "> **import numba.cuda as cuda**\n",
|
|
|
+ "> - cuda.atomic.add(array, index, value)\n",
|
|
|
+ "> - cuda.atomic.min(array, index, value)\n",
|
|
|
+ "> - cuda.atomic.max(array, index, value)\n",
|
|
|
+ "> - cuda.atomic.nanmax(array, index, value)\n",
|
|
|
+ "> - cuda.atomic.nanmin(array, index, value)\n",
|
|
|
+    "> - cuda.atomic.compare_and_swap(array, old_value, new_value)\n",
|
|
|
+ "> - cuda.atomic.sub(array, index, value)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Task ==> sum of an array: [1,2,3,4,5,6,7,8,9,10] in parallel\n",
|
|
|
+    "# Note that threads may execute in any order\n",
|
|
|
+ "\n",
|
|
|
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "\n",
+    "# atomic operation example \n",
|
|
|
+ "size = 10\n",
|
|
|
+ "nthread = 10\n",
|
|
|
+ "@cuda.jit()\n",
|
|
|
+ "def add_atomic(my_array, total):\n",
|
|
|
+ " tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
|
|
|
+    "    cuda.atomic.add(total, 0, my_array[tid])\n",
|
|
|
+ "\n",
|
|
|
+ "my_array = np.array([1,2,3,4,5,6,7,8,9,10], dtype=np.int32)\n",
|
|
|
+ "total = np.zeros(1, dtype=np.int32)\n",
|
|
|
+ "nblock = int(size / nthread)\n",
|
|
|
+ "add_atomic[nblock, nthread](my_array, total)\n",
|
|
|
+ "print(\"Atomic:\", total)\n",
|
|
|
+ "\n",
|
|
|
+ "######################################################################################\n",
|
|
|
+ "# Non-atomic operation example \n",
|
|
|
+ "size = 10\n",
|
|
|
+ "nthread = 10\n",
|
|
|
+ "@cuda.jit()\n",
|
|
|
+    "def add_nonatomic(my_array, total):\n",
|
|
|
+ " tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
|
|
|
+    "    total[0] += my_array[tid]   # unprotected read-modify-write: a race condition may produce a wrong result\n",
|
|
|
+ " \n",
|
|
|
+ "\n",
|
|
|
+ "my_array = np.array([1,2,3,4,5,6,7,8,9,10], dtype=np.int32)\n",
|
|
|
+ "total = np.zeros(1, dtype=np.int32)\n",
|
|
|
+ "nblock = int(size / nthread)\n",
|
|
|
+    "add_nonatomic[nblock, nthread](my_array, total)\n",
|
|
|
+ "print(\"Non atomic: \", total)\n",
|
|
|
+ "\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+    "### CUDA Ufuncs\n",
|
|
|
+ "\n",
|
|
|
+    "- CUDA ufuncs support passing device arrays (arrays already resident on the GPU) to reduce traffic over the PCI-Express bus; a device-array sketch follows the example below. \n",
|
|
|
+    "- They also support asynchronous execution through the stream keyword.\n",
|
|
|
+ "\n",
|
|
|
+ "<img src=\"../images/ufunc.png\"/>"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# example: c = (a - b) * (a + b)\n",
|
|
|
+    "# size of each array (A, B, C) is N = 10000\n",
|
|
|
+ "\n",
|
|
|
+ "from numba import vectorize\n",
|
|
|
+ "import numba.cuda as cuda\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "\n",
|
|
|
+ "@vectorize(['float32(float32, float32)'],target='cuda')\n",
|
|
|
+ "def compute(a, b):\n",
|
|
|
+ " return (a - b) * (a + b)\n",
|
|
|
+ "\n",
|
|
|
+ "N = 10000\n",
|
|
|
+ "A = np.arange(N , dtype=np.float32)\n",
|
|
|
+ "B = np.arange(N, dtype=np.float32)\n",
|
|
|
+ "C = compute(A, B)\n",
|
|
|
+ "\n",
|
|
|
+ "print(C.reshape(100,100))"
|
|
|
+ ]
|
|
|
+ },
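+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The cell below is a small follow-up sketch (added for illustration, not part of the original lab) of the device-array point above: the inputs are copied to the GPU once with `cuda.to_device`, the ufunc is called on the device arrays, and only the final result is copied back with `copy_to_host`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: calling the CUDA ufunc on device arrays (reuses compute, A and B from the cell above)\n",
+    "d_A = cuda.to_device(A)\n",
+    "d_B = cuda.to_device(B)\n",
+    "\n",
+    "d_C = compute(d_A, d_B)          # inputs stay on the GPU; the result is a device array\n",
+    "\n",
+    "print(d_C.copy_to_host()[:5])    # copy back only when the host needs the values\n"
+   ]
+  },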
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "### Device Function\n",
|
|
|
+ "\n",
|
|
|
+    "- CUDA device functions can only be invoked from device code (kernels or other device functions) and, unlike kernels, they can return a value like normal functions. A device function must be defined before the ufunc or kernel that calls it; otherwise the call will not be visible.\n",
|
|
|
+    "- The arguments <i>device=True</i> and <i>inline=True</i> indicate that <i>\"device_ufunc\"</i> is a device function (a short usage sketch follows the cell below)."
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "#example: c = sqrt((a - b) * (a + b))\n",
|
|
|
+ "\n",
|
|
|
+ "from numba import vectorize\n",
|
|
|
+ "import numba.cuda as cuda\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "import math\n",
|
|
|
+ "\n",
|
|
|
+ "@cuda.jit('float32(float32)', device=True, inline=True)\n",
|
|
|
+ "def device_ufunc(c):\n",
|
|
|
+ " return math.sqrt(c)\n",
|
|
|
+ "\n",
|
|
|
+ "@vectorize(['float32(float32, float32)'],target='cuda')\n",
|
|
|
+ "def compute(a, b):\n",
|
|
|
+ " c = (a - b) * (a + b)\n",
|
|
|
+ " return device_ufunc(c)\n"
|
|
|
+ ]
|
|
|
+ },
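+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The cell below is a short usage sketch (added for illustration, not part of the original lab). It calls the `compute` ufunc defined above on two hypothetical float32 arrays `X` and `Y`, so each output element is sqrt((x - y) * (x + y)); `N` and NumPy are reused from the earlier ufunc example.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Usage sketch: X and Y are hypothetical float32 inputs matching the ufunc signature\n",
+    "X = np.arange(N, dtype=np.float32) + 3.0\n",
+    "Y = np.arange(N, dtype=np.float32)\n",
+    "\n",
+    "C = compute(X, Y)   # element-wise sqrt((x - y) * (x + y)) computed on the GPU\n",
+    "print(C[:5])\n"
+   ]
+  },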
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## Summary\n",
|
|
|
+ "\n",
|
|
|
+ "<img src=\"../images/numba_summary1.png\"/>\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "---\n",
|
|
|
+ "\n",
|
|
|
+ "## Lab Task\n",
|
|
|
+ "\n",
|
|
|
+    "In this section, you are expected to click on the **Serial Code Lab Assignment** link and proceed to Lab 2, where you will find three serial Python functions. You are required to revise the **pair_gpu** function to run on the GPU and make a few corresponding modifications to the **main** function.\n",
|
|
|
+ "\n",
|
|
|
+ "## <div style=\"text-align:center; color:#FF0000; border:3px solid red;height:80px;\"> <b><br/> [Serial Code Lab Assignment](serial_RDF.ipynb) </b> </div>\n",
|
|
|
+ "\n",
|
|
|
+ "---\n",
|
|
|
+ "\n",
|
|
|
+ "## Post-Lab Summary\n",
|
|
|
+ "\n",
|
|
|
+ "If you would like to download this lab for later viewing, we recommend you go to your browser's File menu (not the Jupyter notebook file menu) and save the complete web page. This will ensure the images are copied as well. You can also execute the following cell block to create a zip-file of the files you've been working on and download it with the link below.\n",
|
|
|
+ "\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "%%bash\n",
|
|
|
+ "cd ..\n",
|
|
|
+ "rm -f nways_files.zip\n",
|
|
|
+ "zip -r nways_files.zip *"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "\n",
|
|
|
+ "**After** executing the above zip command, you should be able to download the zip file [here](../nways_files.zip).\n",
|
|
|
+ "\n",
|
|
|
+ "**IMPORTANT**: Please click on **HOME** to go back to the main notebook for *N ways of GPU programming for MD* code.\n",
|
|
|
+ "\n",
|
|
|
+ "---\n",
|
|
|
+ "\n",
|
|
|
+ "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000 ; padding: 1em\"> <a href=../../../nways_MD_start_python.ipynb>HOME</a></p>\n",
|
|
|
+ "\n",
|
|
|
+ "---\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "# Links and Resources\n",
|
|
|
+ "\n",
|
|
|
+    "[NVIDIA Nsight Systems](https://docs.nvidia.com/nsight-systems/)\n",
|
|
|
+ "\n",
|
|
|
+ "[NVIDIA CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)\n",
|
|
|
+ "\n",
|
|
|
+    "**NOTE**: To be able to view the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
|
|
|
+ "\n",
|
|
|
+ "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
|
|
|
+ "\n",
|
|
|
+ "---\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "## References\n",
|
|
|
+ "\n",
|
|
|
+ "- Numba Documentation, Release 0.52.0-py3.7-linux-x86_64.egg, Anaconda, Nov 30, 2020.\n",
|
|
|
+ "- Bhaumik Vaidya, Hands-On GPU-Accelerated Computer Vision with OpenCV and CUDA, Packt Publishing, 2018.\n",
|
|
|
+ "- https://docs.nvidia.com/cuda/cuda-c-programming-guide/\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "--- \n",
|
|
|
+ "\n",
|
|
|
+ "## Licensing \n",
|
|
|
+ "\n",
|
|
|
+ "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "celltoolbar": "Raw Cell Format",
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 3",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.8.5"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 4
|
|
|
+}
|