Tosin Akinwale Adesuyi 3 лет назад
Родитель
Сommit
fb5530db9c
56 измененных файлов с 3721 добавлено и 24 удалено
  1. 32 9
      hpc/nways/Dockerfile
  2. 23 0
      hpc/nways/nways_labs/nways_MD/English/Python/LICENSE
  3. 112 0
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/Final_Remarks.ipynb
  4. 323 0
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/cupy/cupy_RDF.ipynb
  5. 777 0
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/cupy/cupy_guide.ipynb
  6. 413 0
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/cupy/serial_RDF.ipynb
  7. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/2d_array.png
  8. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/2d_col_mult.png
  9. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cuda_cupy.png
  10. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy.JPG
  11. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_arch.png
  12. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_intro.png
  13. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_kernel_memory.png
  14. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_nsys1.png
  15. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_nsys2.png
  16. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_nsys3.png
  17. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_summary.png
  18. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/dcdfile.png
  19. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/matrix_block.png
  20. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/matrix_grid.png
  21. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/memory_architecture.png
  22. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_nsys1.png
  23. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_nsys2.png
  24. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_output_files.png
  25. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_summary.png
  26. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_summary1.png
  27. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/output_files.png
  28. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/pair_gpu.png
  29. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/pair_gpu_analysis.png
  30. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/rapids_package.png
  31. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/raw_kernel.png
  32. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/rdf.png
  33. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_cpu_rdf1.png
  34. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_cpu_rdf2.png
  35. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_cupy_profile.png
  36. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_numba_profile.png
  37. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_output1.png
  38. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_output_file.png
  39. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_profile.png
  40. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_profiler1.png
  41. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/thread_blocks.JPG
  42. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/thread_blocks.png
  43. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/thread_position.png
  44. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/ufunc.png
  45. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/workflow.png
  46. 312 0
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/numba/numba_RDF.ipynb
  47. 599 0
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/numba/numba_guide.ipynb
  48. 411 0
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/numba/serial_RDF.ipynb
  49. 130 0
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/serial/rdf_overview.ipynb
  50. BIN
      hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/serial/serial_cpu_rdf.qdrep
  51. 210 0
      hpc/nways/nways_labs/nways_MD/English/Python/source_code/cupy/cupy_rdf.py
  52. 9 0
      hpc/nways/nways_labs/nways_MD/English/Python/source_code/input/dataset.py
  53. 193 0
      hpc/nways/nways_labs/nways_MD/English/Python/source_code/numba/numba_rdf.py
  54. 157 0
      hpc/nways/nways_labs/nways_MD/English/Python/source_code/serial/nways_serial.py
  55. 17 12
      hpc/nways/nways_labs/nways_MD/English/nways_MD_start.ipynb
  56. 3 3
      hpc/nways/nways_labs/nways_start.ipynb

+ 32 - 9
hpc/nways/Dockerfile

@@ -1,3 +1,4 @@
+
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 
 # To build the docker container, run: $ sudo docker build -t nways-labs:latest .
@@ -5,14 +6,24 @@
 # Finally, open http://localhost:8888/
 
 #FROM nvcr.io/nvidia/nvhpc:20.11-devel-cuda_multi-ubuntu20.04
-FROM nvcr.io/nvidia/nvhpc:21.3-devel-cuda_multi-ubuntu20.04
+FROM nvidia/cuda:11.2.2-devel-ubuntu20.04
+#FROM nvcr.io/nvidia/nvhpc:21.3-devel-cuda_multi-ubuntu20.04
+
+
 
 RUN apt-get -y update && \
-        DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends python3-pip python3-setuptools nginx zip make build-essential libtbb-dev && \
-        rm -rf /var/lib/apt/lists/* && \
-        pip3 install --upgrade pip &&\
-        pip3 install --no-cache-dir jupyter &&\
-        pip3 install gdown
+        DEBIAN_FRONTEND=noninteractive apt-get -yq install --no-install-recommends \
+        python3-dev \
+        python3-pip python3-setuptools nginx zip make build-essential libtbb-dev && \
+        rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --no-cache-dir -U setuptools pip
+RUN pip3 install gdown
+RUN pip3 install --no-cache-dir jupyter
+RUN pip3 install --no-cache-dir "cupy-cuda112==8.6.0" \
+    numba numpy scipy 
+RUN pip3 install --no-cache-dir MDAnalysis       
+        
 
 ############################################
 # NVIDIA nsight-systems-2020.5.1 ,nsight-compute-2
@@ -25,6 +36,9 @@ RUN apt-get update -y && \
         apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys F60F4B3D7FA2AF80 && \
         echo "deb https://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64/ /" >> /etc/apt/sources.list.d/nsight.list &&\
         apt-get update -y
+        #wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh  && \
+        #bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3  && \
+        #rm Miniconda3-latest-Linux-x86_64.sh
 
 RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-2020.5.1 nsight-compute-2020.2.1 
 
@@ -32,12 +46,21 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ns
 COPY nways_labs/ /labs/
 
 RUN python3 /labs/nways_MD/English/C/source_code/dataset.py
-RUN python3 /labs/nways_MD/English/Fortran/source_code/dataset.py
+#RUN python3 /labs/nways_MD/English/Fortran/source_code/dataset.py
+RUN python3 /labs/nways_MD/English/Python/source_code/dataset.py
 
 #################################################
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.3/lib64/"
-ENV PATH="/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/include:/usr/local/bin:/opt/anaconda3/bin:/usr/bin:$PATH"
+#ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+#ENV PATH="/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/compilers/bin:/usr/local/bin:/opt/anaconda3/bin:/usr/bin${PATH:+:${PATH}}"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/lib64:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/lib64:/usr/local/lib/python3.8/dist-packages"
+ENV PATH="/opt/nvidia/nsight-systems/2020.5.1/bin:/opt/nvidia/nsight-compute/2020.2.1:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/11.2/include:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/cuda/bin:/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/compilers/bin:/usr/local/bin:/opt/anaconda3/bin:/opt/anaconda3/lib:/usr/bin:$PATH"
 #################################################
+#########################################
+ 
+     
+     
+########################################
 
 ADD nways_labs/ /labs
 WORKDIR /labs

+ 23 - 0
hpc/nways/nways_labs/nways_MD/English/Python/LICENSE

@@ -0,0 +1,23 @@
+Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Разница между файлами не показана из-за своего большого размера
+ 112 - 0
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/Final_Remarks.ipynb


+ 323 - 0
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/cupy/cupy_RDF.ipynb

@@ -0,0 +1,323 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# \n",
+    "\n",
+    "\n",
+    "# CuPy Lab 3:  Solution\n",
+    "---\n",
+    "\n",
+    "#### [<<--CuPy Lab 2](serial_RDF.ipynb)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import cupy as cp\n",
+    "import numpy as np\n",
+    "import math\n",
+    "import cupy.cuda.nvtx as nvtx\n",
+    "from MDAnalysis.lib.formats.libdcd import DCDFile\n",
+    "from timeit import default_timer as timer\n",
+    "\n",
+    "\n",
+    "\n",
+    "def dcdreadhead(infile):\n",
+    "    nconf   = infile.n_frames\n",
+    "    _infile = infile.header\n",
+    "    numatm  = _infile['natoms']\n",
+    "    return numatm, nconf\n",
+    "\n",
+    "def dcdreadframe(infile, numatm, nconf):\n",
+    "\n",
+    "    d_x = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "    d_y = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "    d_z = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "\n",
+    "    for i in range(nconf):\n",
+    "        data = infile.readframes(i, i+1)\n",
+    "        box = data[1]\n",
+    "        atomset = data[0][0]\n",
+    "        xbox = round(box[0][0], 8)\n",
+    "        ybox = round(box[0][2],8)\n",
+    "        zbox = round(box[0][5], 8)\n",
+    "\n",
+    "        for row in range(numatm):\n",
+    "            d_x[i * numatm + row] = round(atomset[row][0], 8) # 0 is column\n",
+    "            d_y[i * numatm + row] = round(atomset[row][1], 8)  # 1 is column\n",
+    "            d_z[i * numatm + row] = round(atomset[row][2], 8)  # 2 is column\n",
+    "\n",
+    "    return xbox, ybox, zbox, d_x, d_y, d_z"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The Raw Kernel  pair_gpu acceleration code "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_kernel = cp.RawKernel(r'''\n",
+    "extern \"C\"\n",
+    "__global__ void pair_gpu(\n",
+    "\t\tconst double* d_x, const double* d_y, const double* d_z, \n",
+    "\t\tunsigned long long int *d_g2, int numatm, int nconf, \n",
+    "\t\tconst double xbox,const double ybox,const double zbox,int d_bin,  unsigned long long int bl)\n",
+    "{\n",
+    "\tdouble r,cut,dx,dy,dz;\n",
+    "\tint ig2,id1,id2;\n",
+    "\tdouble box;\n",
+    "\tbox=min(xbox,ybox);\n",
+    "\tbox=min(box,zbox);\n",
+    "\n",
+    "\tdouble del=box/(2.0*d_bin);\n",
+    "\tcut=box*0.5;\n",
+    "\tint thisi;\n",
+    "\tdouble n;\n",
+    "\n",
+    "\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    "\tint maxi = min(int(0.5*numatm*(numatm-1)-(bl*65535*128)),(65535*128));\n",
+    "\n",
+    "\tif ( i < maxi ) {\n",
+    "\t\tthisi=bl*65535*128+i;\n",
+    "\n",
+    "\t\tn=(0.5)*(1+ ((double) sqrt (1.0+4.0*2.0*thisi)));\n",
+    "\t\tid1=int(n);\n",
+    "\t\tid2=thisi-(0.5*id1*(id1-1));\n",
+    "\n",
+    "\t\tfor (int frame=0;frame<nconf;frame++){\n",
+    "\t\t\tdx=d_x[frame*numatm+id1]-d_x[frame*numatm+id2];\n",
+    "\t\t\tdy=d_y[frame*numatm+id1]-d_y[frame*numatm+id2];\n",
+    "\t\t\tdz=d_z[frame*numatm+id1]-d_z[frame*numatm+id2];\n",
+    "\n",
+    "\t\t\tdx=dx-xbox*(round(dx/xbox));\n",
+    "\t\t\tdy=dy-ybox*(round(dy/ybox));\n",
+    "\t\t\tdz=dz-zbox*(round(dz/zbox));\n",
+    "\n",
+    "\t\t\tr=sqrtf(dx*dx+dy*dy+dz*dz);\n",
+    "\t\t\tif (r<cut) {\n",
+    "\t\t\t\tig2=(int)(r/del);\n",
+    "\t\t\t\tatomicAdd(&d_g2[ig2],2) ;\n",
+    "\t\t\t}\n",
+    "\t\t}\n",
+    "\t}\n",
+    "}\n",
+    "''', 'pair_gpu')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The Main Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from MDAnalysis.lib.formats.libdcd import DCDFile\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "def main():\n",
+    "   # start = timer()\n",
+    "    ########## Input Details ###########\n",
+    "    global xbox, ybox, zbox\n",
+    "    inconf = 10\n",
+    "    nbin   =np.int(2000)\n",
+    "    xbox   = np.float(0)\n",
+    "    ybox   =np.float(0)\n",
+    "    zbox   = np.float(0)\n",
+    "    \n",
+    "    fileDir = os.path.dirname(os.path.realpath('__file__'))\n",
+    "    dataRoot = Path(fileDir).parents[1]\n",
+    "    file = os.path.join(dataRoot, 'source_code/input/alk.traj.dcd')\n",
+    "    \n",
+    "    infile = DCDFile(file)\n",
+    "    pairfile = open(\"cupy_RDF.dat\", \"w+\")\n",
+    "    stwo = open(\"cupy_Pair_entropy.dat\", \"w+\")\n",
+    "\n",
+    "    numatm, nconf = dcdreadhead(infile)\n",
+    "    print(\"Dcd file has {} atoms and {} frames\".format(numatm, nconf))\n",
+    "    if inconf > nconf:\n",
+    "        print(\"nconf is reset to {}\".format(nconf))\n",
+    "    else:\n",
+    "        nconf = inconf\n",
+    "    print(\"Calculating RDF for {} frames\".format(nconf))\n",
+    "    #numatm = 100\n",
+    "    sizef =  nconf * numatm\n",
+    "    sizebin = nbin\n",
+    "\n",
+    "    ########### reading cordinates ##############\n",
+    "    nvtx.RangePush(\"Read_File\")\n",
+    "    xbox, ybox, zbox, d_x, d_y, d_z = dcdreadframe(infile, numatm, nconf)\n",
+    "    nvtx.RangePop()  # pop for reading file\n",
+    "    print(\"Reading of input file is completed\")\n",
+    "    ############# Stream from Host to Device #########################\n",
+    "    d_x = cp.asarray(d_x)\n",
+    "    d_y = cp.asarray(d_y)\n",
+    "    d_z = cp.asarray(d_z)\n",
+    "    d_g2 = cp.zeros(sizebin, dtype=cp.int64)\n",
+    "\n",
+    "    ############################## RAW KERNEL #################################################\n",
+    "    nthreads = 128;\n",
+    "    near2 = nthreads * (int(0.5 * numatm * (numatm - 1) / nthreads) + 1);\n",
+    "    nblock = (near2 / nthreads);\n",
+    "    print(\" Initial blocks are {} and now changing to\".format(nblock))\n",
+    "    maxblock = 65535\n",
+    "    blockloop = int(nblock / maxblock)\n",
+    "    if blockloop != 0:\n",
+    "        nblock = maxblock\n",
+    "    print(\"{} and will run over {} blockloops\".format(nblock, blockloop+1))\n",
+    "\n",
+    "    nvtx.RangePush(\"CuPy_Pair_gpu_Circulation\")\n",
+    "    #t1 = timer()\n",
+    "    for bl in range(blockloop+1):\n",
+    "        raw_kernel((nblock,),(nthreads,), (d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, nbin, bl)) ## cupy raw kernel\n",
+    "    \n",
+    "    cp.cuda.Device(0).synchronize()\n",
+    "    #print(\"Kernel compute time:\", timer() - t1)\n",
+    "    \n",
+    "    d_g2 = cp.asnumpy(d_g2)\n",
+    "    nvtx.RangePop()  # pop for Pair Calculation\n",
+    "    #############################################################################################\n",
+    "    pi = math.acos(np.long(-1.0))\n",
+    "    rho = (numatm) / (xbox * ybox * zbox)\n",
+    "    norm = (np.long(4.0) * pi * rho) / np.long(3.0)\n",
+    "    g2 = np.zeros(nbin, dtype=np.float32)\n",
+    "    s2 =np.long(0.0); s2bond = np.long(0.0)\n",
+    "    lngrbond = np.float(0.0)\n",
+    "    box = min(xbox, ybox)\n",
+    "    box = min(box, zbox)\n",
+    "    _del =box / (np.long(2.0) * nbin)\n",
+    "    gr = np.float(0.0)\n",
+    "    # loop to calculate entropy\n",
+    "    nvtx.RangePush(\"Entropy_Calculation\")\n",
+    "    for i in range(nbin):\n",
+    "        rl = (i) * _del\n",
+    "        ru = rl + _del\n",
+    "        nideal = norm * (ru * ru * ru - rl * rl * rl)\n",
+    "        g2[i] = d_g2[i] / (nconf * numatm * nideal)\n",
+    "        r = (i) * _del\n",
+    "        temp = (i + 0.5) * _del\n",
+    "        pairfile.write(str(temp) + \" \" + str(g2[i]) + \"\\n\")\n",
+    "\n",
+    "        if r < np.long(2.0):\n",
+    "            gr = np.long(0.0)\n",
+    "        else:\n",
+    "            gr = g2[i]\n",
+    "        if gr < 1e-5:\n",
+    "            lngr = np.long(0.0)\n",
+    "        else:\n",
+    "            lngr = math.log(gr)\n",
+    "        if g2[i] < 1e-6:\n",
+    "            lngrbond = np.long(0.0)\n",
+    "        else:\n",
+    "            lngrbond = math.log(g2[i])\n",
+    "        s2 = s2 - (np.long(2.0) * pi * rho * ((gr * lngr) - gr + np.long(1.0)) * _del * r * r)\n",
+    "        s2bond = s2bond - np.long(2.0) * pi * rho * ((g2[i] * lngrbond) - g2[i] + np.long(1.0)) * _del * r * r\n",
+    "\n",
+    "    nvtx.RangePop()  # pop for entropy Calculation\n",
+    "    stwo.writelines(\"s2 value is {}\\n\".format(s2))\n",
+    "    stwo.writelines(\"s2bond value is {}\".format(s2bond))\n",
+    "    \n",
+    "    print(\"\\n s2 value is {}\\n\".format(s2))\n",
+    "    print(\"s2bond value is {}\\n\".format(s2bond))\n",
+    "\n",
+    "    print(\"#Freeing Host memory\")\n",
+    "    del (d_x)\n",
+    "    del (d_y)\n",
+    "    del (d_z)\n",
+    "    del (d_g2)\n",
+    "    print(\"#Number of atoms processed: {}  \\n\".format(numatm))\n",
+    "    print(\"#number of confs processed: {} \\n\".format(nconf))\n",
+    "    #total_time = timer() - start\n",
+    "    #print(\"total time spent:\", total_time)\n",
+    "    \n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    main()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "### Output Files\n",
+    "\n",
+    "<img src=\"../images/output_files.png\"/>\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Profiling\n",
+    "\n",
+    "<img src=\"../images/cupy_nsys1.png\"/>\n",
+    "<img src=\"../images/cupy_nsys3.png\"/>\n",
+    "\n",
+    "\n",
+    "---\n",
+    "\n",
+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../../nways_MD_start.ipynb>HOME</a></p>\n",
+    "\n",
+    "---\n",
+    "\n",
+    "\n",
+    "## Links and Resources\n",
+    "\n",
+    "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
+    "\n",
+    "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
+    "\n",
+    "**NOTE**: To be able to see the Nsight System profiler output, please download Nsight System latest version from [here](https://developer.nvidia.com/nsight-systems).\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "--- \n",
+    "\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

Разница между файлами не показана из-за своего большого размера
+ 777 - 0
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/cupy/cupy_guide.ipynb


+ 413 - 0
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/cupy/serial_RDF.ipynb

@@ -0,0 +1,413 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# \n",
+    "\n",
+    "\n",
+    "# CuPy Lab 2:  Serial Code Lab Assignment\n",
+    "---\n",
+    "\n",
+    "#### [<<--CuPy Lab 1](cupy_guide.ipynb)\n",
+    "\n",
+    "\n",
+    "## A Recap on RDF\n",
+    "\n",
+    "- The radial distribution function (RDF) denoted in equations by g(r) defines the probability of finding a particle at a distance r from another tagged particle. \n",
+    "- The RDF is strongly dependent on the type of matter so will vary greatly for solids, gases and liquids.\n",
+    "- It is observed that the code complexity of the algorithm is 𝑁^2. Let us get into the details of the accelerated code analysis. \n",
+    "- The input data for the serial code is fetched from a DCD binary trajectory file.\n",
+    "\n",
+    "\n",
+    "### The Serial Code\n",
+    "- The Cell below consist of two functions namely **dcdreadhead** and **dcdreadframe**\n",
+    "- The **dcdreadhead** function computes the total number of frames and atoms from the DCDFile **(input/alk.traj.dcd)**, while the **dcdreadframe** function reads 10 frames and 6720 atoms (note: each frame contains 6720 atoms) using the MDAnalysis library. \n",
+    "- Both functions run on the Host (CPU) and are being called from the function **main()**.\n",
+    "### <u>Cell 1</u>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import cupy as cp\n",
+    "import numpy as np\n",
+    "import math\n",
+    "import cupy.cuda.nvtx as nvtx\n",
+    "from MDAnalysis.lib.formats.libdcd import DCDFile\n",
+    "from timeit import default_timer as timer\n",
+    "\n",
+    "\n",
+    "def dcdreadhead(infile):\n",
+    "    nconf   = infile.n_frames\n",
+    "    _infile = infile.header\n",
+    "    numatm  = _infile['natoms']\n",
+    "    return numatm, nconf\n",
+    "\n",
+    "def dcdreadframe(infile, numatm, nconf):\n",
+    "\n",
+    "    d_x = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "    d_y = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "    d_z = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "\n",
+    "    for i in range(nconf):\n",
+    "        data = infile.readframes(i, i+1)\n",
+    "        box = data[1]\n",
+    "        atomset = data[0][0]\n",
+    "        xbox = round(box[0][0], 8)\n",
+    "        ybox = round(box[0][2],8)\n",
+    "        zbox = round(box[0][5], 8)\n",
+    "\n",
+    "        for row in range(numatm):\n",
+    "            d_x[i * numatm + row] = round(atomset[row][0], 8) # 0 is column\n",
+    "            d_y[i * numatm + row] = round(atomset[row][1], 8)  # 1 is column\n",
+    "            d_z[i * numatm + row] = round(atomset[row][2], 8)  # 2 is column\n",
+    "\n",
+    "    return xbox, ybox, zbox, d_x, d_y, d_z"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "##  pair_gpu function\n",
+    "\n",
+    "- The pair_gpu is the function where the main task of the RDF serial implementation is being executed. The function computes differences in xyz DCD frames.\n",
+    "- The essence of the njit (just-in-time) decorator is to get the pair_gpu function to compile in no-Python mode, and this is really important for good performance. \n",
+    "- The decorator **@njit** or **@jit(nopython=True)** ensures that an exception is raised when compilation fails, as a way to alert you that a bug has been found within the decorated function. You can read more [here](https://numba.pydata.org/numba-doc/latest/user/performance-tips.html).\n",
+    "\n",
+    "### <u>Cell 2</u>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba import njit\n",
+    "\n",
+    "@njit()\n",
+    "def pair_gpu(d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, d_bin):\n",
+    "    box = min(xbox, ybox)\n",
+    "    box = min(box, zbox)\n",
+    "    _del = box / (2.0 * d_bin)\n",
+    "    cut = box * 0.5\n",
+    "\n",
+    "    for frame in range(nconf):\n",
+    "       # print(\"\\n {}\".format(frame))\n",
+    "        for id1 in range(numatm):\n",
+    "            for id2 in range(numatm):\n",
+    "                dx = d_x[frame * numatm + id1] - d_x[frame * numatm + id2]\n",
+    "                dy = d_y[frame * numatm + id1] - d_y[frame * numatm + id2]\n",
+    "                dz = d_z[frame * numatm + id1] - d_z[frame * numatm + id2 ]\n",
+    "                dx = dx - xbox * (round(dx / xbox))\n",
+    "                dy = dy - ybox * (round(dy / ybox))\n",
+    "                dz = dz - zbox * (round(dz / zbox))\n",
+    "\n",
+    "                r = math.sqrt(dx * dx + dy * dy + dz * dz)\n",
+    "                if r < cut :\n",
+    "                    ig2  = int((r/_del))\n",
+    "                    d_g2[ig2] = d_g2[ig2] + 1\n",
+    "\n",
+    "    return d_g2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Brief Analysis on internal task performed within pair_gpu function\n",
+    "- The graphic below identifies the various operations executed in the pair_gpu function. This function executes three nested-loops using tricky indexing manipulation within the arrays.\n",
+    "\n",
+    "\n",
+    "<img src=\"../images/pair_gpu.png\" width=\"80%\"/>\n",
+    "\n",
+    "- The indexing flow for operation 1 is simulated using the graphic below. Each green box simulates the subtraction operation within the two inner loops (id1 & id2), while the indexes written in blue signify the outer-most loop (frame), which iterates 10 times. \n",
+    "\n",
+    "<img src=\"../images/pair_gpu_analysis.png\" width=\"80%\"/>\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "### The Main Function\n",
+    "- This is the entry point of the program, where every other function, including the **pair_gpu** function, is called. The output of the main function is written into two files. An image version of the output files (\"**RDF.dat**\" & \"**Pair_entropy.dat**\") is displayed below the code cell.\n",
+    "\n",
+    "### <u>Cell 3</u>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from MDAnalysis.lib.formats.libdcd import DCDFile\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "def main():\n",
+    "    start = timer()\n",
+    "    ########## Input Details ###########\n",
+    "    inconf = 10\n",
+    "    nbin   = 2000\n",
+    "    global xbox, ybox, zbox\n",
+    "    \n",
+    "    fileDir = os.path.dirname(os.path.realpath('__file__'))\n",
+    "    dataRoot = Path(fileDir).parents[1]\n",
+    "    file = os.path.join(dataRoot, 'source_code/input/alk.traj.dcd')\n",
+    "    \n",
+    "    infile = DCDFile(file)\n",
+    "    pairfile = open(\"RDF.dat\", \"w+\")\n",
+    "    stwo     = open(\"Pair_entropy.dat\", \"w+\")\n",
+    "\n",
+    "    numatm, nconf = dcdreadhead(infile)\n",
+    "    print(\"Dcd file has {} atoms and {} frames\".format(numatm, nconf))\n",
+    "    if inconf > nconf:\n",
+    "        print(\"nconf is reset to {}\".format(nconf))\n",
+    "    else:\n",
+    "        nconf = inconf\n",
+    "    print(\"Calculating RDF for {} frames\".format(nconf))\n",
+    "    #numatm = 50\n",
+    "    sizef =  nconf * numatm\n",
+    "    sizebin = nbin\n",
+    "    ########### reading cordinates ##############\n",
+    "    nvtx.RangePush(\"Read_File\")\n",
+    "    xbox, ybox, zbox, h_x, h_y, h_z = dcdreadframe(infile, numatm, nconf)\n",
+    "    nvtx.RangePop() # pop for reading file\n",
+    "\n",
+    "    h_g2 = np.zeros(sizebin, dtype=np.longlong)\n",
+    "    print(\"Reading of input file is completed\")\n",
+    "   \n",
+    "    print(\"\\n {} {}\".format(nconf, numatm))\n",
+    "    ############# This where we will concentrate #########################\n",
+    "    nvtx.RangePush(\"Pair_Circulation\")\n",
+    "    h_g2 = pair_gpu(h_x, h_y, h_z, h_g2, numatm, nconf, xbox, ybox, zbox, nbin)\n",
+    "    nvtx.RangePop() #pop for Pair Calculation\n",
+    "    ######################################################################\n",
+    "    \n",
+    "    pi = math.acos(np.long(-1.0))\n",
+    "    rho = (numatm) / (xbox * ybox * zbox)\n",
+    "    norm = (np.long(4.0) * pi * rho) / np.long(3.0)\n",
+    "    g2 = np.zeros(nbin, dtype=np.float32)\n",
+    "    s2 = np.long(0.0);\n",
+    "    s2bond = np.long(0.0)\n",
+    "    lngrbond = np.float(0.0)\n",
+    "    box = min(xbox, ybox)\n",
+    "    box = min(box, zbox)\n",
+    "    _del = box / (np.long(2.0) * nbin)\n",
+    "    gr = np.float(0.0)\n",
+    "    # loop to calculate entropy\n",
+    "    nvtx.RangePush(\"Entropy_Calculation\")\n",
+    "    for i in range(nbin):\n",
+    "        rl = (i) * _del\n",
+    "        ru = rl + _del\n",
+    "        nideal = norm * (ru * ru * ru - rl * rl * rl)\n",
+    "        g2[i] = h_g2[i] / (nconf * numatm * nideal)\n",
+    "        r = (i) * _del\n",
+    "        temp = (i + 0.5) * _del\n",
+    "        \n",
+    "        #writing to file\n",
+    "        pairfile.write(str(temp) + \" \" + str(g2[i]) + \"\\n\")\n",
+    "\n",
+    "        if r < np.long(2.0):\n",
+    "            gr = np.long(0.0)\n",
+    "        else:\n",
+    "            gr = g2[i]\n",
+    "        if gr < 1e-5:\n",
+    "            lngr = np.long(0.0)\n",
+    "        else:\n",
+    "            lngr = math.log(gr)\n",
+    "        if g2[i] < 1e-6:\n",
+    "            lngrbond = np.long(0.0)\n",
+    "        else:\n",
+    "            lngrbond = math.log(g2[i])\n",
+    "        s2 = s2 - (np.long(2.0) * pi * rho * ((gr * lngr) - gr + np.long(1.0)) * _del * r * r)\n",
+    "        s2bond = s2bond - np.long(2.0) * pi * rho * ((g2[i] * lngrbond) - g2[i] + np.long(1.0)) * _del * r * r\n",
+    "\n",
+    "    nvtx.RangePop() # pop for entropy Calculation\n",
+    "    \n",
+    "    #writing s2 and s2bond to file\n",
+    "    stwo.writelines(\"s2 value is {}\\n\".format(s2))\n",
+    "    stwo.writelines(\"s2bond value is {}\".format(s2bond))\n",
+    "    \n",
+    "    # printing s2 and s2bond to jupyter output\n",
+    "    print(\"\\n s2 value is {}\\n\".format(s2))\n",
+    "    print(\"s2bond value is {}\\n\".format(s2bond))\n",
+    "\n",
+    "    print(\"#Freeing Host memory\")\n",
+    "    del(h_x)\n",
+    "    del(h_y)\n",
+    "    del(h_z)\n",
+    "    del(h_g2)\n",
+    "    print(\"#Number of atoms processed: {}  \\n\".format(numatm))\n",
+    "    print(\"#number of confs processed: {} \\n\".format(nconf))\n",
+    "    \n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    #main()  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "###  Output Files\n",
+    "<table>\n",
+    "    <tr>\n",
+    "    <td>\n",
+    "         <img src=\"../images/serial_output_file.png\" width=\"95%\" />\n",
+    "    </td>\n",
+    "    </tr>\n",
+    "</table>\n",
+    "\n",
+    "\n",
+    "---\n",
+    "\n",
+    "# Lab Task \n",
+    "\n",
+    "1. **Run the serial code from cell 1, 2, & 3**\n",
+    "    - Remove the **\"#\"** behind the **main()** before running the cell 3:\n",
+    "    ```python\n",
+    "       if __name__ == \"__main__\":\n",
+    "                main()\n",
+    "    ```\n",
+    "2. **Now, let's start modifying the original code to CuPy code constructs.**\n",
+    "> From the top menu, click on File, and Open **nways_serial.py** from the current directory at **Python/source_code/cupy** directory. Remember to SAVE your code after changes, before running below cells. \n",
+    "> Hints: focus on the **pair_gpu** function and you may as well need to modify few lines in the **main** function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%run ../../source_code/serial/nways_serial.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output should be the following:\n",
+    "\n",
+    "```\n",
+    "s2 value is -2.43191\n",
+    "s2bond value is -3.87014\n",
+    "```\n",
+    "\n",
+    "3. **Profile the code by running the cell below** "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/serial&& nsys profile --stats=true --force-overwrite true -o serial_cpu_rdf python3 nways_serial.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To view the profiler report, you would need to [Download the profiler output](../../source_code/serial/serial_cpu_rdf.qdrep) and open it via the GUI. A sample of the expected profile report is shown below.\n",
+    "\n",
+    "<img src=\"../images/cupy_nsys1.png\"/>\n",
+    "<img src=\"../images/cupy_nsys3.png\"/>\n",
+    "\n",
+    "From the profile report, we can see that the pair_gpu function now takes milliseconds to run as compared to the serial version which takes more than 3 seconds as shown [here](../serial/rdf_overview.ipynb). \n",
+    " \n",
+    "\n",
+    "---\n",
+    "### [View Solution](../../source_code/cupy/cupy_rdf.py)\n",
+    "---\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Post-Lab Summary\n",
+    "\n",
+    "If you would like to download this lab for later viewing, it is recommended that you go to your browser's File menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip-file of the files you've been working on, and download it with the link below.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "cd ..\n",
+    "rm -f nways_files.zip\n",
+    "zip -r nways_files.zip *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**After** executing the above zip command, you should be able to download the zip file [here](../nways_files.zip).\n",
+    "\n",
+    "**IMPORTANT**: Please click on **HOME** to go back to the main notebook for *N ways of GPU programming for MD* code.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../../nways_MD_start.ipynb>HOME</a></p>\n",
+    "\n",
+    "---\n",
+    "\n",
+    "\n",
+    "## Links and Resources\n",
+    "\n",
+    "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
+    "\n",
+    "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
+    "\n",
+    "**NOTE**: To be able to see the Nsight System profiler output, please download Nsight System latest version from [here](https://developer.nvidia.com/nsight-systems).\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "--- \n",
+    "\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/2d_array.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/2d_col_mult.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cuda_cupy.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy.JPG


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_arch.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_intro.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_kernel_memory.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_nsys1.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_nsys2.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_nsys3.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/cupy_summary.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/dcdfile.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/matrix_block.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/matrix_grid.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/memory_architecture.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_nsys1.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_nsys2.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_output_files.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_summary.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/numba_summary1.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/output_files.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/pair_gpu.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/pair_gpu_analysis.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/rapids_package.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/raw_kernel.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/rdf.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_cpu_rdf1.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_cpu_rdf2.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_cupy_profile.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_numba_profile.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_output1.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_output_file.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_profile.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/serial_profiler1.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/thread_blocks.JPG


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/thread_blocks.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/thread_position.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/ufunc.png


BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/images/workflow.png


+ 312 - 0
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/numba/numba_RDF.ipynb

@@ -0,0 +1,312 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# \n",
+    "\n",
+    "\n",
+    "# Numba Lab 3: Solution\n",
+    "---\n",
+    "\n",
+    "#### [<<-- Numba Lab 2](serial_RDF.ipynb)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import math\n",
+    "import cupy.cuda.nvtx as nvtx\n",
+    "from MDAnalysis.lib.formats.libdcd import DCDFile\n",
+    "from timeit import default_timer as timer\n",
+    "import numba.cuda as cuda\n",
+    "\n",
+    "\n",
+    "def dcdreadhead(infile):\n",
+    "    nconf   = infile.n_frames\n",
+    "    _infile = infile.header\n",
+    "    numatm  = _infile['natoms']\n",
+    "    return numatm, nconf\n",
+    "\n",
+    "def dcdreadframe(infile, numatm, nconf):\n",
+    "\n",
+    "    d_x = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "    d_y = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "    d_z = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "\n",
+    "    for i in range(nconf):\n",
+    "        data = infile.readframes(i, i+1)\n",
+    "        box = data[1]\n",
+    "        atomset = data[0][0]\n",
+    "        xbox = round(box[0][0], 8)\n",
+    "        ybox = round(box[0][2],8)\n",
+    "        zbox = round(box[0][5], 8)\n",
+    "\n",
+    "        for row in range(numatm):\n",
+    "            d_x[i * numatm + row] = round(atomset[row][0], 8) # 0 is column\n",
+    "            d_y[i * numatm + row] = round(atomset[row][1], 8)  # 1 is column\n",
+    "            d_z[i * numatm + row] = round(atomset[row][2], 8)  # 2 is column\n",
+    "\n",
+    "    return xbox, ybox, zbox, d_x, d_y, d_z\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "### The Numba CUDA-jit  pair_gpu  acceleration code "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cuda.jit\n",
+    "def pair_gpu_kernel(d_x, d_y,d_z, d_g2, numatm, nconf, xbox, ybox,zbox,d_bin, bl):\n",
+    "    box = min(xbox, ybox)\n",
+    "    box = min(box, zbox)\n",
+    "    _del= box / (2.0 * d_bin)\n",
+    "    cut = box * 0.5;\n",
+    "    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x\n",
+    "    maxi = min(int(0.5 * numatm * (numatm - 1) - (bl * 65535 * 128)), (65535 * 128))\n",
+    "\n",
+    "    if i < maxi:\n",
+    "        thisi=bl * 65535 * 128+i\n",
+    "        n = (0.5) * (1+ ( math.sqrt (1.0+4.0 * 2.0 * thisi)))\n",
+    "        id1 = int(n)\n",
+    "        id2 = thisi-(0.5 * id1 * (id1-1))\n",
+    "        for frame in range(0, nconf):\n",
+    "            t1 = int(frame * numatm+id1)\n",
+    "            t2 = int(frame * numatm+id2)\n",
+    "            dx = d_x[t1] - d_x[t2]\n",
+    "            dy = d_y[t1] - d_y[t2]\n",
+    "            dz = d_z[t1] - d_z[t2]\n",
+    "            dx = dx - xbox * (round(dx / xbox))\n",
+    "            dy = dy - ybox * (round(dy / ybox))\n",
+    "            dz = dz - zbox * (round(dz / zbox))\n",
+    "\n",
+    "            r= math.sqrt(dx * dx+dy * dy+dz * dz)\n",
+    "            if r < cut:\n",
+    "                ig2=(int)(r / _del )\n",
+    "                cuda.atomic.add(d_g2, ig2, 2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "### The Main Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from MDAnalysis.lib.formats.libdcd import DCDFile\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "def main():\n",
+    "    #start = timer()\n",
+    "    ########## Input Details ###########\n",
+    "    global xbox, ybox, zbox\n",
+    "    inconf = 10\n",
+    "    nbin   =np.int(2000)\n",
+    "    xbox   = np.float(0)\n",
+    "    ybox   =np.float(0)\n",
+    "    zbox   = np.float(0)\n",
+    "    \n",
+    "    fileDir = os.path.dirname(os.path.realpath('__file__'))\n",
+    "    dataRoot = Path(fileDir).parents[1]\n",
+    "    file = os.path.join(dataRoot, 'source_code/input/alk.traj.dcd')\n",
+    "    \n",
+    "    infile = DCDFile(file)\n",
+    "\n",
+    "    pairfile = open(\"numba_RDF.dat\", \"w+\")\n",
+    "    stwo = open(\"numba_Pair_entropy.dat\", \"w+\")\n",
+    "\n",
+    "    numatm, nconf = dcdreadhead(infile)\n",
+    "\n",
+    "    print(\"Dcd file has {} atoms and {} frames\".format(numatm, nconf))\n",
+    "    if inconf > nconf:\n",
+    "        print(\"nconf is reset to {}\".format(nconf))\n",
+    "    else:\n",
+    "        nconf = inconf\n",
+    "    print(\"Calculating RDF for {} frames\".format(nconf))\n",
+    "\n",
+    "    #numatm = 100\n",
+    "    sizef =  nconf * numatm\n",
+    "    sizebin = nbin\n",
+    "\n",
+    "    ########### reading cordinates ##############\n",
+    "    nvtx.RangePush(\"Read_File\")\n",
+    "    xbox, ybox, zbox, d_x, d_y, d_z = dcdreadframe(infile, numatm, nconf)\n",
+    "    nvtx.RangePop()  # pop for reading file\n",
+    "    print(\"Reading of input file is completed\")\n",
+    "\n",
+    "    ############################## Numba KERNEL #################################################\n",
+    "\n",
+    "    nthreads = 128;\n",
+    "    near2 = nthreads * (int(0.5 * numatm * (numatm - 1) / nthreads) + 1);\n",
+    "    nblock = (near2 / nthreads);\n",
+    "    print(\" Initial blocks are {} and now changing to\".format(nblock))\n",
+    "    maxblock = 65535\n",
+    "    blockloop = int(nblock / maxblock)\n",
+    "    if blockloop != 0:\n",
+    "        nblock = maxblock\n",
+    "    print(\"{} and will run over {} blockloops \\n\".format(nblock, blockloop+1))\n",
+    "\n",
+    "    # cp.cuda.runtime.memset(d_g2,0,sizebin)\n",
+    "    d_g2 = np.zeros(sizebin, dtype=np.int64)\n",
+    "    d_g2 = cuda.to_device(d_g2) #numba copy to device\n",
+    "\n",
+    "    nvtx.RangePush(\"Pair_Circulation_Numba\")\n",
+    "    #t1 = timer()\n",
+    "    for bl in range(blockloop+1):\n",
+    "        pair_gpu_kernel[nblock,nthreads ](d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, nbin, bl)  ## numba jit kernel\n",
+    "    \n",
+    "    cuda.synchronize()\n",
+    "   # print(\"Kernel compute time:\", timer() - t1)\n",
+    "    d_g2  = d_g2.copy_to_host() ## numba copy to host\n",
+    "    nvtx.RangePop()  # pop for Pair Calculation\n",
+    "\n",
+    "    pi = math.acos(np.long(-1.0))\n",
+    "    rho = (numatm) / (xbox * ybox * zbox)\n",
+    "    norm = (np.long(4.0) * pi * rho) / np.long(3.0)\n",
+    "    g2 = np.zeros(nbin, dtype=np.float32)\n",
+    "    s2 =np.long(0.0); s2bond = np.long(0.0)\n",
+    "    lngrbond = np.float(0.0)\n",
+    "    box = min(xbox, ybox)\n",
+    "    box = min(box, zbox)\n",
+    "    _del =box / (np.long(2.0) * nbin)\n",
+    "    gr = np.float(0.0)\n",
+    "    # loop to calculate entropy\n",
+    "    nvtx.RangePush(\"Entropy_Calculation\")\n",
+    "    for i in range(nbin):\n",
+    "        rl = (i) * _del\n",
+    "        ru = rl + _del\n",
+    "        nideal = norm * (ru * ru * ru - rl * rl * rl)\n",
+    "        g2[i] = d_g2[i] / (nconf * numatm * nideal)\n",
+    "        r = (i) * _del\n",
+    "        temp = (i + 0.5) * _del\n",
+    "        pairfile.write(str(temp) + \" \" + str(g2[i]) + \"\\n\")\n",
+    "\n",
+    "        if r < np.long(2.0):\n",
+    "            gr = np.long(0.0)\n",
+    "        else:\n",
+    "            gr = g2[i]\n",
+    "        if gr < 1e-5:\n",
+    "            lngr = np.long(0.0)\n",
+    "        else:\n",
+    "            lngr = math.log(gr)\n",
+    "        if g2[i] < 1e-6:\n",
+    "            lngrbond = np.long(0.0)\n",
+    "        else:\n",
+    "            lngrbond = math.log(g2[i])\n",
+    "        s2 = s2 - (np.long(2.0) * pi * rho * ((gr * lngr) - gr + np.long(1.0)) * _del * r * r)\n",
+    "        s2bond = s2bond - np.long(2.0) * pi * rho * ((g2[i] * lngrbond) - g2[i] + np.long(1.0)) * _del * r * r\n",
+    "\n",
+    "    nvtx.RangePop()  # pop for entropy Calculation\n",
+    "    stwo.writelines(\"s2 value is {}\\n\".format(s2))\n",
+    "    stwo.writelines(\"s2bond value is {}\".format(s2bond))\n",
+    "    \n",
+    "    print(\"\\n s2 value is {}\\n\".format(s2))\n",
+    "    print(\"s2bond value is {}\\n\".format(s2bond))\n",
+    "     \n",
+    "    print(\"#Freeing Host memory\")\n",
+    "    del (d_x)\n",
+    "    del (d_y)\n",
+    "    del (d_z)\n",
+    "    del (d_g2)\n",
+    "    print(\"#Number of atoms processed: {}  \\n\".format(numatm))\n",
+    "    print(\"#number of confs processed: {} \\n\".format(nconf))\n",
+    "    #total_time = timer() - start\n",
+    "    #print(\"total time spent:\", total_time)\n",
+    "    \n",
+    "if __name__ == \"__main__\":\n",
+    "    main()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Output Files\n",
+    "\n",
+    "<img src=\"../images/numba_output_files.png\"/>\n",
+    "\n",
+    "### Profiling Sample\n",
+    "\n",
+    "<img src=\"../images/numba_nsys1.png\"/>\n",
+    "<img src=\"../images/numba_nsys2.png\"/>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "**IMPORTANT**: Please click on **HOME** to go back to the main notebook for *N ways of GPU programming for MD* code.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../../nways_MD_start.ipynb>HOME</a></p>\n",
+    "\n",
+    "---\n",
+    "\n",
+    "\n",
+    "# Links and Resources\n",
+    "\n",
+    "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
+    "\n",
+    "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
+    "\n",
+    "**NOTE**: To be able to see the Nsight System profiler output, please download Nsight System latest version from [here](https://developer.nvidia.com/nsight-systems).\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

+ 599 - 0
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/numba/numba_guide.ipynb

@@ -0,0 +1,599 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "# \n",
+    "\n",
+    "#  Numba Lab1: Numba For CUDA GPU\n",
+    "---\n",
+    "\n",
+    "Before we begin, let's execute the cell below to display information about the CUDA driver and GPUs running on the server by running the `nvidia-smi` command. To do this, execute the cell block below by giving it focus (clicking on it with your mouse), and hitting Ctrl-Enter, or pressing the play button in the toolbar above. If all goes well, you should see some output returned below the grey cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "### Learning Objectives\n",
+    "- **The goal of this lab is to:**\n",
+    "    -   quickly get you started with Numba from beginner to advanced level\n",
+    "    -   teach you application of CUDA GPU programming concept in HPC field(s)\n",
+    "    -   show you how to maximize the throughput of your HPC implementation through computational speedup on the GPU.  \n",
+    "     \n",
+    "\n",
+    "\n",
+    "##  Introduction\n",
+    "- Numba is a just-in-time (jit) compiler for Python that works best on code that uses NumPy arrays, functions, and loops. Numba has set of decorators that can be specified before user-defined functions to determine how they are compiled.  \n",
+    "- A decorated function written in python is compiled into CUDA kernel to speed up execution rate, thus, Numba supports CUDA GPU programming model. \n",
+    "- A kernel written in Numba automatically has direct access to NumPy arrays. This implies great support for data visibility between the host (CPU) and the device (GPU). \n",
+    "\n",
+    "\n",
+    "###  Definition of Terms\n",
+    "- The CPU is called a **Host**.  \n",
+    "- The GPU is called a **Device**.\n",
+    "- A GPU function launched by the host and executed on the device is called a **Kernel**.\n",
+    "- A GPU function executed on the device which can only be called from the device is called a **Device function**.\n",
+    "\n",
+    "### Note\n",
+    "- It is recommended to visit the NVIDIA official documentation web page and read through the [CUDA C programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide), because most CUDA programming features exposed by Numba map directly to the CUDA C language offered by NVIDIA. \n",
+    "- Numba does not implement these CUDA features:\n",
+    "     - dynamic parallelism\n",
+    "     - texture memory\n",
+    "\n",
+    "## CUDA Kernel\n",
+    "- In CUDA, written code can be executed by hundreds or thousands of threads at a single run, hence, a solution is modeled after the following thread hierarchy: \n",
+    "    - **Grid**: A kernel executed as a collection of blocks \n",
+    "    - **Thread Block**: Collection of threads that can communicate via a shared memory. Each thread is executed by a core.\n",
+    "    - **Thread**: Single execution units that run kernels on GPU.\n",
+    "- Numba exposes three kinds of GPU memory: \n",
+    "    - global device memory  \n",
+    "    - shared memory \n",
+    "    - local memory. \n",
+    "- Memory access should be carefully considered in order to keep bandwidth contention at minimal.\n",
+    "\n",
+    " <img src=\"../images/thread_blocks.JPG\"/> <img src=\"../images/memory_architecture.png\"/> \n",
+    "\n",
+    "### Kernel Declaration\n",
+    "- A kernel function is a GPU function that is called from a CPU code by specifying the number of block threads and threads per block, and can not explicitly return a value except through a passed array. \n",
+    "- A kernel can be called multiple times with varying number of blocks per grid and threads per block after its has been compiled once.\n",
+    "\n",
+    "Example:\n",
+    "\n",
+    "```python\n",
+    "@cuda.jit\n",
+    "def arrayAdd(array_A, array_B, array_out):\n",
+    "    #...code body ...\n",
+    "```\n",
+    "###### Kernel Invocation\n",
+    "- A kernel is typically launched in the following way:\n",
+    "```python\n",
+    "threadsperblock = 128\n",
+    "N = array_out.size\n",
+    "blockspergrid = ( N + (threadsperblock - 1))// threadsperblock\n",
+    "arrayAdd[blockspergrid, threadsperblock](array_A, array_B, array_out)\n",
+    "```\n",
+    "\n",
+    "###### Choosing Block Size\n",
+    "- The block size determines how many threads share a given area of shared memory.\n",
+    "- The block size must be large enough to accommodate all computation units. See more details [here](https://docs.nvidia.com/cuda/cuda-c-programming-guide/).\n",
+    "\n",
+    "### Thread Positioning \n",
+    "- When running a kernel, the kernel function’s code is executed by every thread once. Hence it is important to uniquely identify distinct threads.\n",
+    "- The default way to determine a thread position in a grid and block is to manually compute the corresponding array position:\n",
+    "\n",
+    "\n",
+    "<img src=\"../images/thread_position.png\"/>\n",
+    "\n",
+    "\n",
+    "```python\n",
+    "threadsperblock = 128\n",
+    "N = array_out.size\n",
+    "\n",
+    "@cuda.jit\n",
+    "def arrayAdd(array_A, array_B, array_out):\n",
+    "    tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
+    "    if tid < N: #Check array boundaries\n",
+    "        array_out[tid] =  array_A[tid] + array_B[tid]\n",
+    "\n",
+    "#Unless you are sure the block size and grid size are a divisor of your array size, you must check boundaries as shown in the code block above. \n",
+    "```\n",
+    "### Example 1: Addition on 1D-Arrays\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "\n",
+    "N = 500000\n",
+    "threadsperblock = 1000\n",
+    "\n",
+    "@cuda.jit()\n",
+    "def arrayAdd(array_A, array_B, array_out):\n",
+    "    tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
+    "    if tid < N:\n",
+    "        array_out[tid] = array_A[tid] + array_B[tid]\n",
+    "\n",
+    "\n",
+    "        \n",
+    "array_A = np.arange(N, dtype=np.int)\n",
+    "array_B = np.arange(N, dtype=np.int)\n",
+    "array_out = np.zeros(N, dtype=np.int)\n",
+    "\n",
+    "blockpergrid  = (N + (threadsperblock - 1)) // threadsperblock\n",
+    "\n",
+    "arrayAdd[blockpergrid, threadsperblock](array_A, array_B, array_out)\n",
+    "\n",
+    "print(\"result: {} \".format(array_out))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**From Example 1:** \n",
+    "> - N is the size of the array and the number of threads in a single block is 1000.\n",
+    "> - The **cuda.jit()** decorator indicates that the function (arrayAdd) below is a device kernel and should run in parallel. The **tid** is the unique index computed for each thread in the device memory grid: \n",
+    ">> **tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x**.\n",
+    "> - **array_A** and **array_B** are input data, while **array_out** is output array and is already preload with zeros.\n",
+    "> - The statement **blockpergrid  = (N + (threadsperblock - 1)) // threadsperblock** computes the number of blocks per grid. This line of code is commonly used as the default formula to estimate the number of blocks per grid in several GPU programming guides.\n",
+    "> - **arrayAdd[blockpergrid, threadsperblock](array_A, array_B, array_out)** indicates a call to the kernel function **arrayAdd**, with the number of blocks per grid and the number of threads per block in square brackets, while kernel arguments are in round brackets.\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "###  Matrix multiplication on 2D Array \n",
+    "\n",
+    "<img src=\"../images/2d_array.png\"/>\n",
+    "\n",
+    "<img src=\"../images/2d_col_mult.png\"/>\n",
+    "\n",
+    "> **Note**\n",
+    "> - **Approach 2** would not be possible if the matrix size exceeds the maximum number of threads per block on the device, while **Approach 1** would continue to execute. Most recent GPUs have a maximum of 1024 threads per thread block. \n",
+    "\n",
+    "### Example 2:  Matrix multiplication "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "import math\n",
+    "\n",
+    "N = 4\n",
+    "@cuda.jit()\n",
+    "def MatrixMul2D(array_A, array_B, array_out):\n",
+    "   row, col = cuda.grid(2)\n",
+    "   if row < array_out.shape[0] and col < array_out.shape[1]:\n",
+    "      for k in range(N):\n",
+    "         array_out[row][col]+= array_A[row][k] * array_B[k][col]\n",
+    "\n",
+    "\n",
+    "array_A   = np.array([[0,0,0,0],[1,1,1,1],[2,2,2,2],[3,3,3,3]], dtype=np.int32)\n",
+    "array_B   = np.array([[0,1,2,3],[0,1,2,3],[0,1,2,3],[0,1,2,3]], dtype=np.int32)\n",
+    "array_out = np.zeros(N*N, dtype=np.int32).reshape(N, N)\n",
+    "\n",
+    "threadsperblock = (2,2)\n",
+    "blockpergrid_x  = (math.ceil( N / threadsperblock[0]))\n",
+    "blockpergrid_y  = (math.ceil( N / threadsperblock[1]))\n",
+    "blockpergrid    = (blockpergrid_x, blockpergrid_y)\n",
+    "\n",
+    "MatrixMul2D[blockpergrid,threadsperblock](array_A, array_B, array_out)\n",
+    "\n",
+    "print(\"array_A:\\n {}\\n\".format(array_A))\n",
+    "print(\"array_B:\\n {}\\n\".format(array_B))\n",
+    "print(\"array_A * array_B:\\n {}\".format(array_out))\n",
+    "\n",
+    "#Note\n",
+    "#The cuda.grid() returns the thread ID in X and Y (row & col) direction of the memory grid\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example 3: A 225 × 225 Matrix Multiplication"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "N = 225\n",
+    "\n",
+    "@cuda.jit()\n",
+    "def MatrixMul2D(array_A, array_B, array_out):\n",
+    "   x, y = cuda.grid(2)\n",
+    "   if x < array_out.shape[0] and y < array_out.shape[1]:\n",
+    "      for k in range(N):\n",
+    "         array_out[x][y] += array_A[x][k] * array_B[k][y]\n",
+    "\n",
+    "threadsperblock = (25,25)\n",
+    "array_A = np.arange((N*N), dtype=np.int32).reshape(N,N)\n",
+    "array_B = np.arange((N*N), dtype=np.int32).reshape(N,N)\n",
+    "array_out = np.zeros((N*N), dtype=np.int32).reshape(N,N)\n",
+    "\n",
+    "blockpergrid_x  = (math.ceil( N / threadsperblock[0]))\n",
+    "blockpergrid_y  = (math.ceil( N / threadsperblock[1]))\n",
+    "blockpergrid    = (blockpergrid_x, blockpergrid_y)\n",
+    "\n",
+    "MatrixMul2D[blockpergrid,threadsperblock](array_A, array_B, array_out)\n",
+    "\n",
+    "print(array_out)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Thread reuse \n",
+    "\n",
+    "- It is possible to specify fewer threads than the data size, such that threads are reused to complete the computation over the entire data. This is one of the approaches used when the data to be computed is larger than the maximum number of threads available on a device. \n",
+    "- This statement is used in a while loop: ***tid += cuda.blockDim.x * cuda.gridDim.x***\n",
+    "- An example is given below to illustrate thread reuse. In the example, a small number of threads is specified on purpose in order to show the possibility of this approach. \n",
+    "\n",
+    "\n",
+    "#### Example 4: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "\n",
+    "N = 500000\n",
+    "threadsperblock = 1000\n",
+    "\n",
+    "@cuda.jit\n",
+    "def arrayAdd(array_A, array_B, array_out):\n",
+    "   tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
+    "   while tid < N:\n",
+    "      array_out[tid] = array_A[tid] + array_B[tid]\n",
+    "      tid += cuda.blockDim.x * cuda.gridDim.x\n",
+    "\n",
+    "array_A = np.arange(N, dtype=np.int32)\n",
+    "array_B = np.arange(N, dtype=np.int32)\n",
+    "array_out = np.zeros(N, dtype=np.int32)\n",
+    "\n",
+    "arrayAdd[1, threadsperblock](array_A, array_B, array_out)\n",
+    "\n",
+    "print(\"result: {} \".format(array_out))\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> **Note**\n",
+    "> - The task in **example 4** is the same as in **example 1** but with a limited number of threads specified; nevertheless, the same result was achieved. \n",
+    "> - Note that this approach may delegate more threads than required. In the code above, an excess of 1 block of threads may be delegated.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Memory Management\n",
+    "\n",
+    "### Data Transfer \n",
+    "- When a kernel is executed, Numba automatically transfers NumPy arrays to the device and vice versa.\n",
+    "- In order to avoid the unnecessary transfer for read-only arrays, the following APIs can be used to manually control the transfer.\n",
+    "\n",
+    "##### 1.  Copy host to device\n",
+    "```python\n",
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "\n",
+    "N = 500000\n",
+    "h_A = np.arange(N, dtype=np.int64)\n",
+    "h_B = np.arange(N, dtype=np.int64)\n",
+    "h_C = np.zeros(N, dtype=np.int64)\n",
+    "\n",
+    "d_A = cuda.to_device(h_A)\n",
+    "d_B = cuda.to_device(h_B)\n",
+    "d_C = cuda.to_device(h_C)\n",
+    "```\n",
+    "##### 2.  Enqueue the transfer to a stream\n",
+    "```python\n",
+    "h_A    = np.arange(N, dtype=np.int64)\n",
+    "stream = cuda.stream()\n",
+    "d_A    = cuda.to_device(h_A, stream=stream)\n",
+    "```\n",
+    "##### 3.  Copy device to host / enqueue the transfer to a stream \n",
+    "```python\n",
+    "h_C = d_C.copy_to_host()\n",
+    "h_C = d_C.copy_to_host(stream=stream)\n",
+    "```\n",
+    "### Example 5:  data movement "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "N = 200\n",
+    "threadsperblock = 25\n",
+    "\n",
+    "@cuda.jit\n",
+    "def arrayAdd(d_A, d_B, d_C):\n",
+    "   tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
+    "   if tid < N:\n",
+    "      d_C[tid] = d_A[tid] + d_B[tid]\n",
+    "      \n",
+    "h_A = np.arange(N, dtype=np.int32)\n",
+    "h_B = np.arange(N, dtype=np.int32)\n",
+    "h_C = np.zeros(N, dtype=np.int32)\n",
+    "\n",
+    "d_A = cuda.to_device(h_A)\n",
+    "d_B = cuda.to_device(h_B)\n",
+    "d_C = cuda.to_device(h_C)\n",
+    "\n",
+    "blockpergrid  = (N + threadsperblock - 1) // threadsperblock\n",
+    "arrayAdd[blockpergrid, threadsperblock](d_A, d_B, d_C)\n",
+    "\n",
+    "h_C = d_C.copy_to_host()\n",
+    "print(h_C)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Atomic Operation\n",
+    "\n",
+    "- Atomic operation is required in a situation where multiple threads attempt to modify a common portion of the memory. \n",
+    "- Typical examples include: simultaneous withdrawal from a bank account through an ATM, or a large number of threads modifying a particular index of an array based on certain condition(s)\n",
+    "- List of presently implemented atomic operations supported by Numba are:\n",
+    "> **import numba.cuda as cuda**\n",
+    "> - cuda.atomic.add(array, index, value)\n",
+    "> - cuda.atomic.min(array, index, value)\n",
+    "> - cuda.atomic.max(array, index, value)\n",
+    "> - cuda.atomic.nanmax(array, index, value)\n",
+    "> - cuda.atomic.nanmin(array, index, value)\n",
+    "> - cuda.atomic.compare_and_swap(array, old_value, current_value)\n",
+    "> - cuda.atomic.sub(array, index, value)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task ==> sum of an array: [1,2,3,4,5,6,7,8,9,10] in parallel\n",
+    "# Note that threads are executed randomly\n",
+    "\n",
+    "# atomic operation example \n",
+    "size = 10\n",
+    "nthread = 10\n",
+    "@cuda.jit()\n",
+    "def add_atomic(my_array, total):\n",
+    "   tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
+    "   cuda.atomic.add(total,0, my_array[tid])\n",
+    "\n",
+    "my_array = np.array([1,2,3,4,5,6,7,8,9,10], dtype=np.int32)\n",
+    "total = np.zeros(1, dtype=np.int32)\n",
+    "nblock = int(size / nthread)\n",
+    "add_atomic[nblock, nthread](my_array, total)\n",
+    "print(\"Atomic:\", total)\n",
+    "\n",
+    "######################################################################################\n",
+    "# Non-atomic operation example  \n",
+    "size = 10\n",
+    "nthread = 10\n",
+    "@cuda.jit()\n",
+    "def add_atomic(my_array, total):\n",
+    "   tid = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x\n",
+    "   total[0] += my_array[tid]\n",
+    "   \n",
+    "\n",
+    "my_array = np.array([1,2,3,4,5,6,7,8,9,10], dtype=np.int32)\n",
+    "total = np.zeros(1, dtype=np.int32)\n",
+    "nblock = int(size / nthread)\n",
+    "add_atomic[nblock, nthread](my_array, total)\n",
+    "print(\"Non atomic: \", total)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 7. CUDA Ufuncs\n",
+    "\n",
+    "- The CUDA ufunc supports passing intra-device arrays to reduce traffic over the PCI-express bus. \n",
+    "- It also supports asynchronous mode by using the stream keyword.\n",
+    "\n",
+    "<img src=\"../images/ufunc.png\"/>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba import vectorize\n",
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "\n",
+    "@vectorize(['float32(float32, float32)'],target='cuda')\n",
+    "def compute(a, b):\n",
+    "    return (a - b) * (a + b)\n",
+    "\n",
+    "N = 10000\n",
+    "A = np.arange(N , dtype=np.float32)\n",
+    "B = np.arange(N, dtype=np.float32)\n",
+    "C = compute(A, B)\n",
+    "\n",
+    "print(C.reshape(100,100))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Device function\n",
+    "\n",
+    "- The CUDA device functions can only be invoked from within the device and can return a value like normal functions. The device function is usually placed before the CUDA ufunc kernel otherwise a call to the device function may not be visible inside the ufunc kernel."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba import vectorize\n",
+    "import numba.cuda as cuda\n",
+    "import numpy as np\n",
+    "import math\n",
+    "\n",
+    "@cuda.jit('float32(float32)', device=True, inline=True)\n",
+    "def device_ufunc(c):\n",
+    "   return math.sqrt(c)\n",
+    "\n",
+    "@vectorize(['float32(float32, float32)'],target='cuda')\n",
+    "def compute(a, b):\n",
+    "    c = (a - b) * (a + b)\n",
+    "    return device_ufunc(c)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "<img src=\"../images/numba_summary1.png\"/>\n",
+    "\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Lab Task\n",
+    "\n",
+    "In this section, you are expected to click on the **Serial code Lab Assignment** link and proceed to Lab 2. In this lab you will find three python serial code functions. You are required to revise the **pair_gpu** function and make it run on the GPU, and likewise do a few modifications on the **main** function.\n",
+    "\n",
+    "## <div style=\"text-align:center; color:#FF0000; border:3px solid red;height:80px;\"> <b><br/> [Serial Code Lab Assignment](serial_RDF.ipynb) </b> </div>\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Post-Lab Summary\n",
+    "\n",
+    "If you would like to download this lab for later viewing, it is recommend you go to your browsers File menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip-file of the files you've been working on, and download it with the link below.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "cd ..\n",
+    "rm -f nways_files.zip\n",
+    "zip -r nways_files.zip *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "**After** executing the above zip command, you should be able to download the zip file [here](../nways_files.zip).\n",
+    "\n",
+    "**IMPORTANT**: Please click on **HOME** to go back to the main notebook for *N ways of GPU programming for MD* code.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../../nways_MD_start.ipynb>HOME</a></p>\n",
+    "\n",
+    "---\n",
+    "\n",
+    "\n",
+    "# Links and Resources\n",
+    "\n",
+    "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
+    "\n",
+    "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
+    "\n",
+    "**NOTE**: To be able to see the Nsight System profiler output, please download Nsight System latest version from [here](https://developer.nvidia.com/nsight-systems).\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "\n",
+    "## References\n",
+    "\n",
+    "- Numba Documentation, Release 0.52.0-py3.7-linux-x86_64.egg, Anaconda, Nov 30, 2020.\n",
+    "- Bhaumik Vaidya, Hands-On GPU-Accelerated Computer Vision with OpenCV and CUDA, Packt Publishing, 2018.\n",
+    "- https://docs.nvidia.com/cuda/cuda-c-programming-guide/\n",
+    "\n",
+    "\n",
+    "--- \n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "celltoolbar": "Raw Cell Format",
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

+ 411 - 0
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/numba/serial_RDF.ipynb

@@ -0,0 +1,411 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# \n",
+    "\n",
+    "\n",
+    "# Numba Lab 2: HPC Approach with Serial Code\n",
+    "---\n",
+    "\n",
+    "#### [<<--Numba Lab 1](numba_guide.ipynb)\n",
+    "\n",
+    "\n",
+    "## A Recap on RDF\n",
+    "\n",
+    "- The radial distribution function (RDF) denoted in equations by g(r) defines the probability of finding a particle at a distance r from another tagged particle. \n",
+    "- The RDF is strongly dependent on the type of matter so will vary greatly for solids, gases and liquids.\n",
+    "- As you might have observed, the time complexity of the algorithm is 𝑁^2. Let us get into the details of the code analysis. \n",
+    "- The input data for the serial code is fetched from a DCD binary trajectory file.\n",
+    "\n",
+    "\n",
+    "### The Serial Code\n",
+    "- The Cell below consist of two functions namely **dcdreadhead** and **dcdreadframe**\n",
+    "- The **dcdreadhead** function computes the total number of frames and atoms from the DCDFile **(input/alk.traj.dcd)**, while the **dcdreadframe** function reads 10 frames and 6720 atoms (note: each frame contains 6720 atoms) using the MDAnalysis library. \n",
+    "- Both functions run on the Host (CPU) and are being called from the function **main()**.\n",
+    "\n",
+    "### <u>Cell 1</u>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import cupy as cp\n",
+    "import numpy as np\n",
+    "import math\n",
+    "import cupy.cuda.nvtx as nvtx\n",
+    "from MDAnalysis.lib.formats.libdcd import DCDFile\n",
+    "from timeit import default_timer as timer\n",
+    "\n",
+    "\n",
+    "def dcdreadhead(infile):\n",
+    "    nconf   = infile.n_frames\n",
+    "    _infile = infile.header\n",
+    "    numatm  = _infile['natoms']\n",
+    "    return numatm, nconf\n",
+    "\n",
+    "def dcdreadframe(infile, numatm, nconf):\n",
+    "\n",
+    "    d_x = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "    d_y = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "    d_z = np.zeros(numatm * nconf, dtype=np.float64)\n",
+    "\n",
+    "    for i in range(nconf):\n",
+    "        data = infile.readframes(i, i+1)\n",
+    "        box = data[1]\n",
+    "        atomset = data[0][0]\n",
+    "        xbox = round(box[0][0], 8)\n",
+    "        ybox = round(box[0][2],8)\n",
+    "        zbox = round(box[0][5], 8)\n",
+    "\n",
+    "        for row in range(numatm):\n",
+    "            d_x[i * numatm + row] = round(atomset[row][0], 8) # 0 is column\n",
+    "            d_y[i * numatm + row] = round(atomset[row][1], 8)  # 1 is column\n",
+    "            d_z[i * numatm + row] = round(atomset[row][2], 8)  # 2 is column\n",
+    "\n",
+    "    return xbox, ybox, zbox, d_x, d_y, d_z"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  pair_gpu function\n",
+    "\n",
+    "- The pair_gpu is the function where the main task of the RDF serial implementation is being executed. The function computes differences in xyz DCD frames. \n",
+    "- The essence of the **njit** (just-in-time) decorator is to get the pair_gpu function to compile in nopython mode, and this is really important for good performance. \n",
+    "- The decorator **@njit** or **@jit(nopython=True)** ensures that an exception is raised when compilation fails, as a way to alert that a bug is found within the decorated function. You can read more [here](https://numba.pydata.org/numba-doc/latest/user/performance-tips.html).\n",
+    "\n",
+    "### <u>Cell 2</u>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba import njit\n",
+    "\n",
+    "@njit()\n",
+    "def pair_gpu(d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, d_bin):\n",
+    "    box = min(xbox, ybox)\n",
+    "    box = min(box, zbox)\n",
+    "    _del = box / (2.0 * d_bin)\n",
+    "    cut = box * 0.5\n",
+    "    #print(\"\\n {} {}\".format(nconf, numatm))\n",
+    "\n",
+    "    for frame in range(nconf):\n",
+    "        #print(\"\\n {}\".format(frame))\n",
+    "        for id1 in range(numatm):\n",
+    "            for id2 in range(numatm):\n",
+    "                dx = d_x[frame * numatm + id1] - d_x[frame * numatm + id2]\n",
+    "                dy = d_y[frame * numatm + id1] - d_y[frame * numatm + id2]\n",
+    "                dz = d_z[frame * numatm + id1] - d_z[frame * numatm + id2 ]\n",
+    "                dx = dx - xbox * (round(dx / xbox))\n",
+    "                dy = dy - ybox * (round(dy / ybox))\n",
+    "                dz = dz - zbox * (round(dz / zbox))\n",
+    "\n",
+    "                r = math.sqrt(dx * dx + dy * dy + dz * dz)\n",
+    "                if r < cut :\n",
+    "                    ig2  = int((r/_del))\n",
+    "                    d_g2[ig2] = d_g2[ig2] + 1\n",
+    "\n",
+    "    return d_g2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Brief Analysis on internal task performed within pair_gpu function\n",
+    "- The graphic below identifies the various operations executed in the pair_gpu function. This function executes three nested-loops using tricky indexing manipulation within the arrays.\n",
+    "\n",
+    "\n",
+    "<img src=\"../images/pair_gpu.png\" width=\"80%\"/>\n",
+    "\n",
+    "- The indexing flow for operation 1 is simulated using the graphic below. Each green box simulates the subtraction operation within the two inner loops (id1 & id2), while the indexes written in blue signify the outer-most loop (frame), which iterates 10 times. \n",
+    "\n",
+    "<img src=\"../images/pair_gpu_analysis.png\" width=\"80%\"/>\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "### The Main Function\n",
+    "- This is the entry point of the program where every other function, including the **pair_gpu** function, is called. The output of the main function is written into two files. An image version of the output files (\"**RDF.dat**\" & \"**Pair_entropy.dat**\") is displayed below the code cell.\n",
+    "\n",
+    "\n",
+    "### <u>Cell 3</u>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from MDAnalysis.lib.formats.libdcd import DCDFile\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "def main():\n",
+    "    \n",
+    "    ########## Input Details ###########\n",
+    "    inconf = 10\n",
+    "    nbin   = 2000\n",
+    "    global xbox, ybox, zbox\n",
+    "    \n",
+    "    fileDir = os.path.dirname(os.path.realpath('__file__'))\n",
+    "    dataRoot = Path(fileDir).parents[1]\n",
+    "    file = os.path.join(dataRoot, 'source_code/input/alk.traj.dcd')\n",
+    "    \n",
+    "    infile = DCDFile(file)\n",
+    "    pairfile = open(\"RDF.dat\", \"w+\")\n",
+    "    stwo     = open(\"Pair_entropy.dat\", \"w+\")\n",
+    "\n",
+    "    numatm, nconf = dcdreadhead(infile)\n",
+    "    print(\"Dcd file has {} atoms and {} frames\".format(numatm, nconf))\n",
+    "    if inconf > nconf:\n",
+    "        print(\"nconf is reset to {}\".format(nconf))\n",
+    "    else:\n",
+    "        nconf = inconf\n",
+    "    print(\"Calculating RDF for {} frames\".format(nconf))\n",
+    "    #numatm = 50\n",
+    "    sizef =  nconf * numatm\n",
+    "    sizebin = nbin\n",
+    "    ########### reading cordinates ##############\n",
+    "    nvtx.RangePush(\"Read_File\")\n",
+    "    xbox, ybox, zbox, h_x, h_y, h_z = dcdreadframe(infile, numatm, nconf)\n",
+    "    nvtx.RangePop() # pop for reading file\n",
+    "\n",
+    "    h_g2 = np.zeros(sizebin, dtype=np.longlong)\n",
+    "    print(\"Reading of input file is completed\")\n",
+    "    print(\"\\n {} {}\".format(nconf, numatm))\n",
+    "    ############# This where we will concentrate #########################\n",
+    "    nvtx.RangePush(\"Pair_Circulation\")\n",
+    "    h_g2 = pair_gpu(h_x, h_y, h_z, h_g2, numatm, nconf, xbox, ybox, zbox, nbin)\n",
+    "    nvtx.RangePop() #pop for Pair Calculation\n",
+    "    ######################################################################\n",
+    "    \n",
+    "    pi = math.acos(-1.0)\n",
+    "    rho = (numatm) / (xbox * ybox * zbox)\n",
+    "    norm = (4.0 * pi * rho) / 3.0\n",
+    "    g2 = np.zeros(nbin, dtype=np.float32)\n",
+    "    s2 = 0.0\n",
+    "    s2bond = 0.0\n",
+    "    lngrbond = 0.0\n",
+    "    box = min(xbox, ybox)\n",
+    "    box = min(box, zbox)\n",
+    "    _del = box / (2.0 * nbin)\n",
+    "    gr = 0.0\n",
+    "    # loop to calculate entropy\n",
+    "    nvtx.RangePush(\"Entropy_Calculation\")\n",
+    "    for i in range(nbin):\n",
+    "        rl = (i) * _del\n",
+    "        ru = rl + _del\n",
+    "        nideal = norm * (ru * ru * ru - rl * rl * rl)\n",
+    "        g2[i] = h_g2[i] / (nconf * numatm * nideal)\n",
+    "        r = (i) * _del\n",
+    "        temp = (i + 0.5) * _del\n",
+    "        \n",
+    "        #writing to file\n",
+    "        pairfile.write(str(temp) + \" \" + str(g2[i]) + \"\\n\")\n",
+    "\n",
+    "        if r < 2.0:\n",
+    "            gr = 0.0\n",
+    "        else:\n",
+    "            gr = g2[i]\n",
+    "        if gr < 1e-5:\n",
+    "            lngr = 0.0\n",
+    "        else:\n",
+    "            lngr = math.log(gr)\n",
+    "        if g2[i] < 1e-6:\n",
+    "            lngrbond = 0.0\n",
+    "        else:\n",
+    "            lngrbond = math.log(g2[i])\n",
+    "        s2 = s2 - (2.0 * pi * rho * ((gr * lngr) - gr + 1.0) * _del * r * r)\n",
+    "        s2bond = s2bond - 2.0 * pi * rho * ((g2[i] * lngrbond) - g2[i] + 1.0) * _del * r * r\n",
+    "\n",
+    "    nvtx.RangePop() # pop for entropy Calculation\n",
+    "    \n",
+    "    #writing s2 and s2bond to file\n",
+    "    stwo.writelines(\"s2 value is {}\\n\".format(s2))\n",
+    "    stwo.writelines(\"s2bond value is {}\".format(s2bond))\n",
+    "    \n",
+    "    # printing s2 and s2bond to jupyter output\n",
+    "    print(\"\\n s2 value is {}\\n\".format(s2))\n",
+    "    print(\"s2bond value is {}\\n\".format(s2bond))\n",
+    "\n",
+    "    print(\"#Freeing Host memory\")\n",
+    "    del(h_x)\n",
+    "    del(h_y)\n",
+    "    del(h_z)\n",
+    "    del(h_g2)\n",
+    "    print(\"#Number of atoms processed: {}  \\n\".format(numatm))\n",
+    "    print(\"#number of confs processed: {} \\n\".format(nconf))\n",
+    "    \n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    main()  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "### Console Output and Output Files\n",
+    "<table>\n",
+    "    <tr>\n",
+    "    <td>\n",
+    "         <img src=\"../images/serial_output_file.png\" width=\"95%\" />\n",
+    "    </td>\n",
+    "    </tr>\n",
+    "</table>\n",
+    "\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Lab Task \n",
+    "\n",
+    "1. **Run the serial code from cells 1, 2, & 3**\n",
+    "    - Remove the **\"#\"** behind the **main()** before running the cell 3:\n",
+    "    ```python\n",
+    "       if __name__ == \"__main__\":\n",
+    "                main()\n",
+    "    ```\n",
+    "2. **Now, lets start modifying the original code to Numba code constructs.**\n",
+    "> From the top menu, click on File, and Open **nways_serial.py** from the current directory at **Python/source_code/numba** directory. Remember to SAVE your code after changes, before running below cells. \n",
+    "> Hints: focus on the **pair_gpu** function and you may as well need to modify few lines in the **main** function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%run ../../source_code/serial/nways_serial.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output should be the following:\n",
+    "\n",
+    "```\n",
+    "s2 value is -2.43191\n",
+    "s2bond value is -3.87014\n",
+    "```\n",
+    "\n",
+    "3. **Profile the code by running the cell below** "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/serial&& nsys profile --stats=true --force-overwrite true -o serial_cpu_rdf python3 nways_serial.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To view the profiler report, you would need to [Download the profiler output](../../source_code/serial/serial_cpu_rdf.qdrep) and open it via the GUI. A sample expected profile report is given below.\n",
+    "\n",
+    "<img src=\"../images/numba_nsys1.png\"/>\n",
+    "<img src=\"../images/numba_nsys2.png\"/>\n",
+    "\n",
+    "From the profile report, we can see that the pair_gpu function now takes milliseconds to run, as compared to the serial version, which takes more than 3 seconds, as shown [here](../serial/rdf_overview.ipynb). \n",
+    "\n",
+    "---\n",
+    "### [View Solution](../../source_code/numba/numba_rdf.py)\n",
+    "--- \n",
+    "\n",
+    "## Post-Lab Summary\n",
+    "\n",
+    "If you would like to download this lab for later viewing, it is recommend you go to your browsers File menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip-file of the files you've been working on, and download it with the link below.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "cd ..\n",
+    "rm -f nways_files.zip\n",
+    "zip -r nways_files.zip *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "**After** executing the above zip command, you should be able to download the zip file [here](../nways_files.zip).\n",
+    "\n",
+    "**IMPORTANT**: Please click on **HOME** to go back to the main notebook for *N ways of GPU programming for MD* code.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../../nways_MD_start.ipynb>HOME</a></p>\n",
+    "\n",
+    "---\n",
+    "\n",
+    "\n",
+    "# Links and Resources\n",
+    "\n",
+    "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
+    "\n",
+    "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
+    "\n",
+    "**NOTE**: To be able to see the Nsight System profiler output, please download Nsight System latest version from [here](https://developer.nvidia.com/nsight-systems).\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

+ 130 - 0
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/serial/rdf_overview.ipynb

@@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## RDF\n",
+    "The radial distribution function (RDF) denoted in equations by g(r) defines the probability of finding a particle at a distance r from another tagged particle. The RDF is strongly dependent on the type of matter so will vary greatly for solids, gases and liquids.\n",
+    "<img src=\"../images/rdf.png\" width=\"50%\" height=\"50%\">\n",
+    " The radial distribution function (RDF) denoted in equations by g(r) defines the probability of finding a particle at a distance r from another tagged particle. \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As you might have observed, the code complexity of the algorithm is $O(N^{2})$. Let us get into the details of the sequential code. **Understand and analyze** the code present at:\n",
+    "\n",
+    "[RDF Serial Code](../../source_code/serial/nways_serial.py)\n",
+    "\n",
+    "\n",
+    "Open the downloaded file for inspection."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%run ../../source_code/serial/nways_serial.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We plan to follow the typical optimization cycle that every code needs to go through\n",
+    "<img src=\"../images/workflow.png\" width=\"70%\" height=\"70%\">\n",
+    "\n",
+    "In order to analyze the application, we will make use of the profiler \"nsys\" and add \"nvtx\" markings into the code to get more information out of the serial code. Before running the below cells, let's first start by diving into the profiler lab to learn more about the tools. Using the profiler gives us the hotspots and helps us understand which function is important to make parallel.\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "# <div style=\"text-align: center ;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\">[Profiling lab](../../../../../profiler/English/jupyter_notebook/profiling-c.ipynb)</div> \n",
+    "\n",
+    "-----\n",
+    "\n",
+    "Now, that we are familiar with the Nsight Profiler and know how to [NVTX](../../../../../profiler/English/jupyter_notebook/profiling-c.ipynb#nvtx), let's profile the serial code and checkout the output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../../source_code/serial&& nsys profile --stats=true --force-overwrite true -o serial_cpu_rdf python3 nways_serial.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you run the above cell, you should see the following in the terminal.\n",
+    "\n",
+    "<img src=\"../images/serial_cpu_rdf1.png\" width=\"700px\" height=\"600px\"/>\n",
+    "<img src=\"../images/serial_cpu_rdf2.png\" width=\"700px\" height=\"400px\"/>\n",
+    "\n",
+    "To view the profiler report, you would need to [Download the profiler output](../../source_code/serial/serial_cpu_rdf.qdrep) and open it via the GUI. For more information on how to open the report via the GUI, please checkout the section on [How to view the report](../../../../../profiler/English/jupyter_notebook/profiling-c.ipynb#gui-report). \n",
+    "\n",
+    "From the timeline view, right click on the nvtx row and click the \"show in events view\". Now you can see the nvtx statistic at the bottom of the window which shows the duration of each range. In the following labs, we will look in to the profiler report in more detail. \n",
+    "\n",
+    "<img src=\"../images/serial_profile.png\" width=\"100%\" height=\"100%\"/>\n",
+    "\n",
+    "The obvious next step is to make the **Pair Calculation** algorithm parallel using different approaches to GPU programming. Please follow the below link and choose one of the approaches to parallelise the serial code.\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "# <div style=\"text-align: center ;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\">[HOME](../../../nways_MD_start.ipynb)</div> \n",
+    "-----\n",
+    "\n",
+    "\n",
+    "# Links and Resources\n",
+    "<!--[OpenACC API guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)-->\n",
+    "\n",
+    "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
+    "\n",
+    "<!--[NVIDIA Nsight Compute](https://developer.nvidia.com/nsight-compute)-->\n",
+    "\n",
+    "<!--[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)-->\n",
+    "\n",
+    "[Profiling timelines with NVTX](https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/)\n",
+    "\n",
+    "**NOTE**: To be able to see the Nsight System profiler output, please download Nsight System latest version from [here](https://developer.nvidia.com/nsight-systems).\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "--- \n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

BIN
hpc/nways/nways_labs/nways_MD/English/Python/jupyter_notebook/serial/serial_cpu_rdf.qdrep


+ 210 - 0
hpc/nways/nways_labs/nways_MD/English/Python/source_code/cupy/cupy_rdf.py

@@ -0,0 +1,210 @@
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
+
+import cupy as cp
+import numpy as np
+import math
+import cupy.cuda.nvtx as nvtx
+from MDAnalysis.lib.formats.libdcd import DCDFile
+from timeit import default_timer as timer
+import os
+from pathlib import Path
+
+#pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
+#cp.cuda.set_allocator(pool.malloc)
+
+
def dcdreadhead(infile):
    """Return (number of atoms, number of frames) from an open DCD file."""
    # The frame count is exposed directly on the handle; the atom count
    # lives in the header mapping under 'natoms'.
    frame_count = infile.n_frames
    atom_count = infile.header['natoms']
    return atom_count, frame_count
+
def dcdreadframe(infile, numatm, nconf):
    """Read `nconf` frames of `numatm` atoms from an open DCD trajectory.

    Returns (xbox, ybox, zbox, d_x, d_y, d_z) where d_x/d_y/d_z are flat
    float64 arrays of length numatm * nconf laid out frame-major
    (index = frame * numatm + atom).  The box lengths come from the LAST
    frame read; all values are rounded to 8 decimal places.
    """
    d_x = np.zeros(numatm * nconf, dtype=np.float64)
    d_y = np.zeros(numatm * nconf, dtype=np.float64)
    d_z = np.zeros(numatm * nconf, dtype=np.float64)

    # BUGFIX: initialize the box lengths so an empty trajectory
    # (nconf == 0) returns zeros instead of raising NameError.
    xbox = ybox = zbox = 0.0

    for i in range(nconf):
        data = infile.readframes(i, i + 1)
        box = data[1]          # unit-cell record for this frame
        atomset = data[0][0]   # coordinate rows for this frame
        # The unit cell is a 6-element record; elements 0, 2 and 5 hold
        # the box edge lengths (as indexed by the original code).
        xbox = round(box[0][0], 8)
        ybox = round(box[0][2], 8)
        zbox = round(box[0][5], 8)

        for row in range(numatm):
            d_x[i * numatm + row] = round(atomset[row][0], 8)  # x column
            d_y[i * numatm + row] = round(atomset[row][1], 8)  # y column
            d_z[i * numatm + row] = round(atomset[row][2], 8)  # z column

    return xbox, ybox, zbox, d_x, d_y, d_z
+
+
def main():
    """Driver: read the DCD trajectory, histogram pair distances on the GPU
    with a CuPy raw kernel, then write the RDF (cupy_RDF.dat) and pair
    entropy (cupy_Pair_entropy.dat)."""
    start = timer()
    ########## Input Details ###########
    global xbox, ybox, zbox
    inconf = 10      # number of frames to process (capped by file contents)
    nbin = 2000      # number of histogram bins
    xbox = 0.0
    ybox = 0.0
    zbox = 0.0

    ########use on jupyter notebook#######
    fileDir = os.path.dirname(os.path.realpath('__file__'))
    dataRoot = Path(fileDir).parents[1]
    file = os.path.join(dataRoot, 'source_code/input/alk.traj.dcd')

    ########use on local computer##########
    #file   = "input/alk.traj.dcd"
    #######################################
    infile = DCDFile(file)
    pairfile = open("cupy_RDF.dat", "w+")
    stwo = open("cupy_Pair_entropy.dat", "w+")

    numatm, nconf = dcdreadhead(infile)
    print("Dcd file has {} atoms and {} frames".format(numatm, nconf))
    if inconf > nconf:
        print("nconf is reset to {}".format(nconf))
    else:
        nconf = inconf
    print("Calculating RDF for {} frames".format(nconf))
    sizebin = nbin

    ########### reading coordinates ##############
    nvtx.RangePush("Read_File")
    xbox, ybox, zbox, d_x, d_y, d_z = dcdreadframe(infile, numatm, nconf)
    nvtx.RangePop()  # pop for reading file
    print("Reading of input file is completed")
    ############# Copy from Host to Device #########################
    d_x = cp.asarray(d_x)
    d_y = cp.asarray(d_y)
    d_z = cp.asarray(d_z)
    d_g2 = cp.zeros(sizebin, dtype=cp.int64)

    ############################## RAW KERNEL #################################################
    nthreads = 128
    # Round the pair-index space (N*(N-1)/2 unordered pairs) up to a whole
    # number of thread blocks.
    near2 = nthreads * (int(0.5 * numatm * (numatm - 1) / nthreads) + 1)
    # BUGFIX: use integer division — near2 / nthreads is a float in Python 3
    # and a float grid dimension is rejected at kernel launch.
    nblock = near2 // nthreads
    print(" Initial blocks are {} and now changing to".format(nblock))
    maxblock = 65535               # classic per-dimension grid limit
    blockloop = int(nblock / maxblock)
    if blockloop != 0:
        nblock = maxblock
    print("{} and will run over {} blockloops".format(nblock, blockloop + 1))

    nvtx.RangePush("CuPy_Pair_Circulation")
    #################################
    t1 = timer()
    for bl in range(blockloop + 1):
        # Scalar arguments are passed as NumPy scalars so their C types match
        # the kernel signature exactly (int / double / unsigned long long).
        raw_kernel((int(nblock),), (nthreads,),
                   (d_x, d_y, d_z, d_g2,
                    np.int32(numatm), np.int32(nconf),
                    np.float64(xbox), np.float64(ybox), np.float64(zbox),
                    np.int32(nbin), np.uint64(bl)))  # cupy raw kernel
    cp.cuda.Device(0).synchronize()
    print("Kernel compute time:", timer() - t1)
    d_g2 = cp.asnumpy(d_g2)
    nvtx.RangePop()  # pop for Pair Calculation
    ######################################################################
    # np.int / np.float / np.long were removed from NumPy; the plain Python
    # numbers below are exact equivalents of the original values.
    pi = math.acos(-1.0)
    rho = numatm / (xbox * ybox * zbox)   # number density
    norm = (4.0 * pi * rho) / 3.0         # ideal-gas shell normalisation
    g2 = np.zeros(nbin, dtype=np.float32)
    s2 = 0.0
    s2bond = 0.0
    lngrbond = 0.0
    box = min(xbox, ybox)
    box = min(box, zbox)
    _del = box / (2.0 * nbin)             # histogram bin width
    gr = 0.0
    # loop to calculate entropy
    nvtx.RangePush("Entropy_Calculation")
    for i in range(nbin):
        rl = i * _del
        ru = rl + _del
        nideal = norm * (ru * ru * ru - rl * rl * rl)
        g2[i] = d_g2[i] / (nconf * numatm * nideal)
        r = i * _del
        temp = (i + 0.5) * _del
        pairfile.write(str(temp) + " " + str(g2[i]) + "\n")

        if r < 2.0:
            gr = 0.0
        else:
            gr = g2[i]
        if gr < 1e-5:
            lngr = 0.0
        else:
            lngr = math.log(gr)
        if g2[i] < 1e-6:
            lngrbond = 0.0
        else:
            lngrbond = math.log(g2[i])
        s2 = s2 - (2.0 * pi * rho * ((gr * lngr) - gr + 1.0) * _del * r * r)
        s2bond = s2bond - 2.0 * pi * rho * ((g2[i] * lngrbond) - g2[i] + 1.0) * _del * r * r

    nvtx.RangePop()  # pop for entropy Calculation
    stwo.writelines("s2 value is {}\n".format(s2))
    stwo.writelines("s2bond value is {}".format(s2bond))
    # BUGFIX: close output files so the results are flushed to disk.
    pairfile.close()
    stwo.close()

    print("#Freeing Host memory")
    del (d_x)
    del (d_y)
    del (d_z)
    del (d_g2)
    print("#Number of atoms processed: {}  \n".format(numatm))
    print("#number of confs processed: {} \n".format(nconf))
    total_time = timer() - start
    print("total time spent:", total_time)
+
+##################################################################################
+
# CUDA C kernel compiled lazily by CuPy's RawKernel.  Each thread handles one
# unordered atom pair; `bl` selects which 65535*128-sized slice of the pair
# index space this launch covers.  The closed-form sqrt expression inverts the
# triangular pair index `thisi` back into atom indices (id1, id2), and each
# hit adds 2 to the histogram (counting both orderings of the pair).
# NOTE(review): `sqrtf` computes the distance in single precision while the
# serial reference uses double-precision sqrt — confirm this is intentional.
raw_kernel = cp.RawKernel(r'''
extern "C"
__global__ void cupy_pair_gpu(
		const double* d_x, const double* d_y, const double* d_z, 
		unsigned long long int *d_g2, int numatm, int nconf, 
		const double xbox,const double ybox,const double zbox,int d_bin,  unsigned long long int bl)
{
	double r,cut,dx,dy,dz;
	int ig2,id1,id2;
	double box;
	box=min(xbox,ybox);
	box=min(box,zbox);

	double del=box/(2.0*d_bin);
	cut=box*0.5;
	int thisi;
	double n;

	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int maxi = min(int(0.5*numatm*(numatm-1)-(bl*65535*128)),(65535*128));

	if ( i < maxi ) {
		thisi=bl*65535*128+i;

		n=(0.5)*(1+ ((double) sqrt (1.0+4.0*2.0*thisi)));
		id1=int(n);
		id2=thisi-(0.5*id1*(id1-1));

		for (int frame=0;frame<nconf;frame++){
			dx=d_x[frame*numatm+id1]-d_x[frame*numatm+id2];
			dy=d_y[frame*numatm+id1]-d_y[frame*numatm+id2];
			dz=d_z[frame*numatm+id1]-d_z[frame*numatm+id2];

			dx=dx-xbox*(round(dx/xbox));
			dy=dy-ybox*(round(dy/ybox));
			dz=dz-zbox*(round(dz/zbox));

			r=sqrtf(dx*dx+dy*dy+dz*dz);
			if (r<cut) {
				ig2=(int)(r/del);
				atomicAdd(&d_g2[ig2],2) ;
			}
		}
	}
}
''', 'cupy_pair_gpu')

if __name__ == "__main__":
    main()
+

+ 9 - 0
hpc/nways/nways_labs/nways_MD/English/Python/source_code/input/dataset.py

@@ -0,0 +1,9 @@
# Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 

import gdown
import os

## alk.traj.dcd input file 
url = 'https://drive.google.com/uc?id=1WZ0rtXZ-uMLfy7htT0gaU4EQ_Rq61QTF&export=download'
# NOTE(review): this path points at the C/ input folder although the script
# lives under Python/source_code/input — confirm the intended destination.
output = '/labs/nways_MD/English/C/source_code/input/alk.traj.dcd'
# Make sure the destination directory exists; gdown fails otherwise.
os.makedirs(os.path.dirname(output), exist_ok=True)
gdown.download(url, output, quiet=False, proxy=None)

+ 193 - 0
hpc/nways/nways_labs/nways_MD/English/Python/source_code/numba/numba_rdf.py

@@ -0,0 +1,193 @@
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
+
+import numpy as np
+import math
+import cupy.cuda.nvtx as nvtx
+from MDAnalysis.lib.formats.libdcd import DCDFile
+from timeit import default_timer as timer
+import numba.cuda as cuda
+import os
+from pathlib import Path
+
def dcdreadhead(infile):
    """Pull (atom count, frame count) out of an open DCD file handle."""
    # 'natoms' comes from the header dict; the frame count is an attribute.
    return infile.header['natoms'], infile.n_frames
+
def dcdreadframe(infile, numatm, nconf):
    """Unpack `nconf` frames into flat, frame-major x/y/z float64 arrays.

    Returns (xbox, ybox, zbox, d_x, d_y, d_z); the box edge lengths are
    taken from the last frame read, rounded to 8 decimal places.
    """
    total = numatm * nconf
    d_x = np.zeros(total, dtype=np.float64)
    d_y = np.zeros(total, dtype=np.float64)
    d_z = np.zeros(total, dtype=np.float64)

    for frame in range(nconf):
        data = infile.readframes(frame, frame + 1)
        cell = data[1]
        atomset = data[0][0]
        # Box edge lengths sit at positions 0, 2 and 5 of the cell record.
        xbox = round(cell[0][0], 8)
        ybox = round(cell[0][2], 8)
        zbox = round(cell[0][5], 8)

        base = frame * numatm
        for row in range(numatm):
            d_x[base + row] = round(atomset[row][0], 8)
            d_y[base + row] = round(atomset[row][1], 8)
            d_z[base + row] = round(atomset[row][2], 8)

    return xbox, ybox, zbox, d_x, d_y, d_z
+
def main():
    """Driver: read the DCD trajectory, histogram pair distances with the
    Numba CUDA kernel pair_gpu_kernel, then write the RDF (numba_RDF.dat)
    and pair entropy (numba_Pair_entropy.dat)."""
    start = timer()
    ########## Input Details ###########
    global xbox, ybox, zbox
    inconf = 10      # number of frames to process (capped by file contents)
    nbin = 2000      # number of histogram bins
    xbox = 0.0
    ybox = 0.0
    zbox = 0.0

    ########use on jupyter notebook#######
    fileDir = os.path.dirname(os.path.realpath('__file__'))
    dataRoot = Path(fileDir).parents[1]
    file = os.path.join(dataRoot, 'source_code/input/alk.traj.dcd')

    ########use on local computer##########
    # file   = "input/alk.traj.dcd"
    #######################################

    infile = DCDFile(file)

    pairfile = open("numba_RDF.dat", "w+")
    stwo = open("numba_Pair_entropy.dat", "w+")

    numatm, nconf = dcdreadhead(infile)

    print("Dcd file has {} atoms and {} frames".format(numatm, nconf))
    if inconf > nconf:
        print("nconf is reset to {}".format(nconf))
    else:
        nconf = inconf
    print("Calculating RDF for {} frames".format(nconf))
    sizebin = nbin

    ########### reading coordinates ##############
    nvtx.RangePush("Read_File")
    xbox, ybox, zbox, d_x, d_y, d_z = dcdreadframe(infile, numatm, nconf)
    nvtx.RangePop()  # pop for reading file
    print("Reading of input file is completed")

    ############################## Numba KERNEL #################################################

    nthreads = 128
    # Round the pair-index space (N*(N-1)/2 unordered pairs) up to a whole
    # number of thread blocks.
    near2 = nthreads * (int(0.5 * numatm * (numatm - 1) / nthreads) + 1)
    # BUGFIX: use integer division — near2 / nthreads is a float in Python 3
    # and the CUDA launch configuration must be an integer.
    nblock = near2 // nthreads
    print(" Initial blocks are {} and now changing to".format(nblock))
    maxblock = 65535               # classic per-dimension grid limit
    blockloop = int(nblock / maxblock)
    if blockloop != 0:
        nblock = maxblock
    print("{} and will run over {} blockloops".format(nblock, blockloop + 1))

    # Copy the coordinates to the device once, instead of letting Numba
    # re-transfer the host arrays on every launch of the blockloop.
    d_x = cuda.to_device(d_x)
    d_y = cuda.to_device(d_y)
    d_z = cuda.to_device(d_z)
    d_g2 = cuda.to_device(np.zeros(sizebin, dtype=np.int64))  # numba copy to device

    nvtx.RangePush("Pair_Circulation_Numba")
    t1 = timer()
    for bl in range(blockloop + 1):
        pair_gpu_kernel[nblock, nthreads](d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, nbin, bl)  ## numba jit kernel
    cuda.synchronize()
    print("Kernel compute time:", timer() - t1)
    d_g2 = d_g2.copy_to_host()  ## numba copy to host
    nvtx.RangePop()  # pop for Pair Calculation

    # np.int / np.float / np.long were removed from NumPy; the plain Python
    # numbers below are exact equivalents of the original values.
    pi = math.acos(-1.0)
    rho = numatm / (xbox * ybox * zbox)   # number density
    norm = (4.0 * pi * rho) / 3.0         # ideal-gas shell normalisation
    g2 = np.zeros(nbin, dtype=np.float32)
    s2 = 0.0
    s2bond = 0.0
    lngrbond = 0.0
    box = min(xbox, ybox)
    box = min(box, zbox)
    _del = box / (2.0 * nbin)             # histogram bin width
    gr = 0.0
    # loop to calculate entropy
    nvtx.RangePush("Entropy_Calculation")
    for i in range(nbin):
        rl = i * _del
        ru = rl + _del
        nideal = norm * (ru * ru * ru - rl * rl * rl)
        g2[i] = d_g2[i] / (nconf * numatm * nideal)
        r = i * _del
        temp = (i + 0.5) * _del
        pairfile.write(str(temp) + " " + str(g2[i]) + "\n")

        if r < 2.0:
            gr = 0.0
        else:
            gr = g2[i]
        if gr < 1e-5:
            lngr = 0.0
        else:
            lngr = math.log(gr)
        if g2[i] < 1e-6:
            lngrbond = 0.0
        else:
            lngrbond = math.log(g2[i])
        s2 = s2 - (2.0 * pi * rho * ((gr * lngr) - gr + 1.0) * _del * r * r)
        s2bond = s2bond - 2.0 * pi * rho * ((g2[i] * lngrbond) - g2[i] + 1.0) * _del * r * r

    nvtx.RangePop()  # pop for entropy Calculation
    stwo.writelines("s2 value is {}\n".format(s2))
    stwo.writelines("s2bond value is {}".format(s2bond))
    # BUGFIX: close output files so the results are flushed to disk.
    pairfile.close()
    stwo.close()

    print("#Freeing Host memory")
    del (d_x)
    del (d_y)
    del (d_z)
    del (d_g2)
    print("#Number of atoms processed: {}  \n".format(numatm))
    print("#number of confs processed: {} \n".format(nconf))
    total_time = timer() - start
    print("total time spent:", total_time)
+
+
##--------------------------NUMBA KERNEL---------------------------------------------------------##
# Each thread handles one unordered atom pair; `bl` selects which
# 65535*128-sized slice of the N*(N-1)/2 pair index space this launch covers.
# The sqrt expression inverts the triangular pair index `thisi` back into the
# atom indices (id1, id2).  Each hit adds 2 to the histogram, counting both
# orderings of the pair, mirroring the CuPy raw-kernel version.
@cuda.jit
def pair_gpu_kernel(d_x, d_y,d_z, d_g2, numatm, nconf, xbox, ybox,zbox,d_bin, bl):
    # Histogram bin width is half the smallest box edge divided into d_bin bins.
    box = min(xbox, ybox)
    box = min(box, zbox)
    _del= box / (2.0 * d_bin)
    cut = box * 0.5;
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    # Last slice may be partial: cap at the number of remaining pairs.
    maxi = min(int(0.5 * numatm * (numatm - 1) - (bl * 65535 * 128)), (65535 * 128))

    if i < maxi:
        # Global pair index for this thread across all blockloop launches.
        thisi=bl * 65535 * 128+i
        # Closed-form inversion of the triangular index into (id1, id2).
        n = (0.5) * (1+ ( math.sqrt (1.0+4.0 * 2.0 * thisi)))
        id1 = int(n)
        id2 = thisi-(0.5 * id1 * (id1-1))
        for frame in range(0, nconf):
            # Flat frame-major indices of the two atoms in this frame.
            t1 = int(frame * numatm+id1)
            t2 = int(frame * numatm+id2)
            dx = d_x[t1] - d_x[t2]
            dy = d_y[t1] - d_y[t2]
            dz = d_z[t1] - d_z[t2]
            # Minimum-image convention: wrap displacements into the box.
            dx = dx - xbox * (round(dx / xbox))
            dy = dy - ybox * (round(dy / ybox))
            dz = dz - zbox * (round(dz / zbox))

            r= math.sqrt(dx * dx+dy * dy+dz * dz)
            if r < cut:
                ig2=(int)(r / _del )
                # Atomic: many threads may hit the same bin concurrently.
                cuda.atomic.add(d_g2, ig2, 2)

##--------------------------END NUMBA KERNEL---------------------------------------------------------##


if __name__ == "__main__":
    main()
+

+ 157 - 0
hpc/nways/nways_labs/nways_MD/English/Python/source_code/serial/nways_serial.py

@@ -0,0 +1,157 @@
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
+
+import numpy as np
+import math
+import cupy.cuda.nvtx as nvtx
+from MDAnalysis.lib.formats.libdcd import DCDFile
+from timeit import default_timer as timer
+from numba import  njit
+import os
+from pathlib import Path
+
def dcdreadhead(infile):
    """Return (numatm, nconf): atom count from the DCD header, frame count from the handle."""
    header = infile.header
    atoms = header['natoms']
    frames = infile.n_frames
    return atoms, frames
+
def dcdreadframe(infile, numatm, nconf):
    """Read `nconf` frames into flat, frame-major float64 coordinate arrays.

    Returns (xbox, ybox, zbox, d_x, d_y, d_z); the box edge lengths come
    from the last frame read, rounded to 8 decimal places.
    """
    d_x = np.zeros(numatm * nconf, dtype=np.float64)
    d_y = np.zeros(numatm * nconf, dtype=np.float64)
    d_z = np.zeros(numatm * nconf, dtype=np.float64)

    for i in range(nconf):
        frame_data = infile.readframes(i, i + 1)
        atomset = frame_data[0][0]
        box = frame_data[1]
        # Box edge lengths sit at positions 0, 2 and 5 of the cell record.
        xbox = round(box[0][0], 8)
        ybox = round(box[0][2], 8)
        zbox = round(box[0][5], 8)

        offset = i * numatm
        for row in range(numatm):
            atom = atomset[row]
            d_x[offset + row] = round(atom[0], 8)
            d_y[offset + row] = round(atom[1], 8)
            d_z[offset + row] = round(atom[2], 8)

    return xbox, ybox, zbox, d_x, d_y, d_z
+
def main():
    """Serial RDF driver: read the DCD trajectory, histogram pair distances
    with the njit-compiled pair_gpu, then write RDF.dat and Pair_entropy.dat."""
    start = timer()
    ########## Input Details ###########
    inconf = 10     # number of frames to process (capped by file contents)
    nbin   = 2000   # number of histogram bins
    global xbox, ybox, zbox

    fileDir = os.path.dirname(os.path.realpath('__file__'))
    dataRoot = Path(fileDir).parents[1]
    file = os.path.join(dataRoot, 'source_code/input/alk.traj.dcd')

    infile = DCDFile(file)
    pairfile = open("RDF.dat", "w+")
    stwo     = open("Pair_entropy.dat", "w+")

    numatm, nconf = dcdreadhead(infile)
    print("Dcd file has {} atoms and {} frames".format(numatm, nconf))
    if inconf > nconf:
        print("nconf is reset to {}".format(nconf))
    else:
        nconf = inconf
    print("Calculating RDF for {} frames".format(nconf))
    sizebin = nbin
    ########### reading coordinates ##############
    nvtx.RangePush("Read_File")
    xbox, ybox, zbox, h_x, h_y, h_z = dcdreadframe(infile, numatm, nconf)
    nvtx.RangePop()  # pop for reading file

    h_g2 = np.zeros(sizebin, dtype=np.longlong)
    print("Reading of input file is completed")
    ############# This is where we will concentrate #########################
    nvtx.RangePush("Pair_Circulation")
    h_g2 = pair_gpu(h_x, h_y, h_z, h_g2, numatm, nconf, xbox, ybox, zbox, nbin)
    nvtx.RangePop()  # pop for Pair Calculation
    ######################################################################
    # np.long / np.float were removed from NumPy; the plain Python numbers
    # below are exact equivalents of the original values.
    pi = math.acos(-1.0)
    rho = numatm / (xbox * ybox * zbox)   # number density
    norm = (4.0 * pi * rho) / 3.0         # ideal-gas shell normalisation
    g2 = np.zeros(nbin, dtype=np.float32)
    s2 = 0.0
    s2bond = 0.0
    lngrbond = 0.0
    box = min(xbox, ybox)
    box = min(box, zbox)
    _del = box / (2.0 * nbin)             # histogram bin width
    gr = 0.0
    # loop to calculate entropy
    nvtx.RangePush("Entropy_Calculation")
    for i in range(nbin):
        rl = i * _del
        ru = rl + _del
        nideal = norm * (ru * ru * ru - rl * rl * rl)
        g2[i] = h_g2[i] / (nconf * numatm * nideal)
        r = i * _del
        temp = (i + 0.5) * _del
        pairfile.write(str(temp) + " " + str(g2[i]) + "\n")

        if r < 2.0:
            gr = 0.0
        else:
            gr = g2[i]
        if gr < 1e-5:
            lngr = 0.0
        else:
            lngr = math.log(gr)
        if g2[i] < 1e-6:
            lngrbond = 0.0
        else:
            lngrbond = math.log(g2[i])
        s2 = s2 - (2.0 * pi * rho * ((gr * lngr) - gr + 1.0) * _del * r * r)
        s2bond = s2bond - 2.0 * pi * rho * ((g2[i] * lngrbond) - g2[i] + 1.0) * _del * r * r

    nvtx.RangePop()  # pop for entropy Calculation
    stwo.writelines("s2 value is {}\n".format(s2))
    stwo.writelines("s2bond value is {}".format(s2bond))
    # BUGFIX: close output files so the results are flushed to disk.
    pairfile.close()
    stwo.close()

    print("\n s2 value is {}\n".format(s2))
    print("s2bond value is {}\n".format(s2bond))
    print("#Freeing Host memory")
    del(h_x)
    del(h_y)
    del(h_z)
    del(h_g2)
    print("#Number of atoms processed: {}  \n".format(numatm))
    print("#number of confs processed: {} \n".format(nconf))
    total_time = timer() - start
    print("total time spent:", total_time)
+
@njit()
def pair_gpu(d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, d_bin):
    # Histogram all pair distances (minimum-image convention) into d_g2.
    # Inputs are flat frame-major arrays (index = frame * numatm + atom);
    # bin width is half the smallest box edge divided into d_bin bins.
    # NOTE(review): id1 and id2 each range over ALL atoms, so every unordered
    # pair is counted twice (matching the GPU versions' +2) but self-pairs
    # (id1 == id2, r == 0) also land in bin 0, which the GPU kernels exclude
    # — confirm whether that difference is intended.
    box = min(xbox, ybox)
    box = min(box, zbox)
    _del = box / (2.0 * d_bin)
    cut = box * 0.5
    #print("\n {} {}".format(nconf, numatm))

    for frame in range(nconf):
        #print("\n {}".format(frame))
        for id1 in range(numatm):
            for id2 in range(numatm):
                dx = d_x[frame * numatm + id1] - d_x[frame * numatm + id2]
                dy = d_y[frame * numatm + id1] - d_y[frame * numatm + id2]
                dz = d_z[frame * numatm + id1] - d_z[frame * numatm + id2 ]
                # Minimum-image convention: wrap displacements into the box.
                dx = dx - xbox * (round(dx / xbox))
                dy = dy - ybox * (round(dy / ybox))
                dz = dz - zbox * (round(dz / zbox))

                r = math.sqrt(dx * dx + dy * dy + dz * dz)
                if r < cut :
                    ig2  = int((r/_del))
                    d_g2[ig2] = d_g2[ig2] + 1

    return d_g2


if __name__ == "__main__":
    main()

+ 17 - 12
hpc/nways/nways_labs/nways_MD/English/nways_MD_start.ipynb

@@ -11,7 +11,7 @@
     "* Standard: C++ stdpar, Fortran Do-Concurrent\n",
     "* Directives: OpenACC, OpenMP\n",
     "* Frameworks: Kokkos\n",
-    "* Programming Language Extension: CUDA C, CUDA Fortran\n",
+    "* Programming Language Extension: CUDA C, CUDA Fortran, Python Cupy, Python Numba\n",
     "\n",
     "Let's start with testing the CUDA Driver and GPU you are running the code on in this lab:"
    ]
@@ -29,7 +29,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "<!--**IMPORTANT**: Before we start please download the input file needed for this application from the [Google drive](https://drive.google.com/drive/folders/1aQ_MFyrjBIDMhCczse0S2GQ36MlR6Q_s?usp=sharing) and upload it to the input folder. From the top menu, click on *File*, and *Open* and navigate to `C/source_code/input` directory and copy paste the downloaded input file (`alk.traj.dcd`).-->\n",
+    "<!--**IMPORTANT**: Before we start please download the input file needed for this application from the [Google drive](https://drive.google.com/drive/folders/1aQ_MFyrjBIDMhCczse0S2GQ36MlR6Q_s?usp=sharing) and upload it to the input folder. From the top menu, click on *File*, and *Open* and navigate to `C/source_code/input` or `Python/source_code/input` directory and copy paste the downloaded input file (`alk.traj.dcd`).-->\n",
     "\n",
     "\n",
     "### Tutorial Outline\n",
@@ -72,7 +72,19 @@
     "3. [OpenMP](Fortran/jupyter_notebook/openmp/nways_openmp.ipynb) \n",
     "4. [CUDA Fortran](Fortran/jupyter_notebook/cudafortran/nways_cuda.ipynb) \n",
     "\n",
-    "To finish the lab let us go through some final [remarks](Fortran/jupyter_notebook/Final_Remarks.ipynb)\n"
+    "To finish the lab let us go through some final [remarks](Fortran/jupyter_notebook/Final_Remarks.ipynb)\n",
+    "\n",
+    "\n",
+    "#### Python Programming Language\n",
+    "\n",
+    "Please read the [RDF Overview](Python/jupyter_notebook/serial/rdf_overview.ipynb) to get familiar with how this application works.\n",
+    "\n",
+    "To get started, click on the following GPU programming approaches in python:\n",
+    "\n",
+    "1. [CuPy](./Python/jupyter_notebook/cupy/cupy_guide.ipynb)\n",
+    "2. [Numba](./Python/jupyter_notebook/numba/numba_guide.ipynb)\n",
+    "\n",
+    "To round up this tutorial, see some final [remarks on Python](Python/jupyter_notebook/Final_Remarks.ipynb)"
    ]
   },
   {
@@ -90,7 +102,7 @@
     "### Target Audience and Prerequisites\n",
     "The target audience for this lab is researchers/graduate students and developers who are interested in learning about programming various ways to programming GPUs to accelerate their scientific applications.\n",
     "\n",
-    "Basic experience with Fortran programming is needed. No GPU programming knowledge is required.\n",
+    "Basic experience with C/C++ or Python or Fortran programming is needed. No GPU programming knowledge is required.\n",
     "\n",
     "-----\n",
     "\n",
@@ -102,13 +114,6 @@
     "\n",
     "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -127,7 +132,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.2"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,

+ 3 - 3
hpc/nways/nways_labs/nways_start.ipynb

@@ -11,7 +11,7 @@
     "* Standard: C++ stdpar, Fortran Do-Concurrent\n",
     "* Directives: OpenACC, OpenMP\n",
     "* Frameworks: Kokkos\n",
-    "* Programming Language Extension: CUDA C, CUDA Fortran\n",
+    "* Programming Language Extension: CUDA C, CUDA Fortran, Python CuPy, Python Numba\n",
     "\n",
     "Let's start with testing the CUDA Driver and GPU you are running the code on in this lab:"
    ]
@@ -49,7 +49,7 @@
     "### Target Audience and Prerequisites\n",
     "The target audience for this lab is researchers/graduate students and developers who are interested in learning about programming various ways to programming GPUs to accelerate their scientific applications.\n",
     "\n",
-    "Basic experience with C/C++ or Fortran programming is needed. No GPU programming knowledge is required. \n",
+    "Basic experience with C/C++ or Python or Fortran programming is needed. No GPU programming knowledge is required. \n",
     "\n",
     "--- \n",
     "\n",
@@ -75,7 +75,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.2"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,