5 rokov pred · aed7c05845
--- a/hpc/miniprofiler/Dockerfile
+++ b/hpc/miniprofiler/Dockerfile
@@ -0,0 +1,146 @@
 
				+# To build: $ sudo docker build -t myimage:1.0 .
			
 
				+
			
 
				+# To run: $ sudo docker run --rm -it --gpus=all -p 8888:8888 myimage:1.0
			
 
				+
			
 
				+# To run Jupyter inside the container: $ jupyter notebook --ip 0.0.0.0 --port 8888 --no-browser --allow-root
			
 
				+
			
 
				+# Base image: NVIDIA's PGI Community Edition compiler container (pgcc / pgc++ / pgfortran are used below).
				+FROM nvcr.io/hpc/pgi-compilers:ce
				+
				+# OS-level tooling: editors, SSH, git, curl, Python 3 and helpers.
				+# The index refresh and the install share one layer so the install never runs against a
				+# stale cached package index, and the index is deleted in that same layer.
				+# (/var/lib/apt/lists is where apt keeps its index; /var/lib/apt/cache is not used by apt.)
				+# ca-certificates is listed explicitly because curl needs it for the HTTPS download below.
				+RUN apt-get update && \
				+    apt-get dist-upgrade -y && \
				+    apt-get install --no-install-recommends -y \
				+        ca-certificates \
				+        curl \
				+        emacs-nox \
				+        git-core \
				+        inotify-tools \
				+        m4 \
				+        nano \
				+        openssh-client \
				+        openssh-server \
				+        python3 \
				+        python3-pip \
				+        python3-setuptools \
				+        vim-nox \
				+        zip && \
				+    rm -rf /var/lib/apt/lists/*
				+#useradd -k /etc/skel -m -s /usr/local/bin/entrypoint.sh -p openacc openacc && \
				+#echo 'openacc:openacc' | chpasswd && \
				+#mkdir /var/run/sshd
				+
				+# Python packages. `python3 -m pip` avoids depending on which pip3 wrapper script is first
				+# on PATH once pip has upgraded itself; --no-cache-dir keeps pip's download cache out of the layer.
				+RUN python3 -m pip install --no-cache-dir --upgrade pip
				+RUN python3 -m pip install --no-cache-dir jupyter netcdf4
				+
				+# git-lfs: register the packagecloud apt repository, then install and initialise git-lfs.
				+# The repository script is saved to a file first so that a failed download (`curl -f`)
				+# aborts the build instead of silently piping an empty script into bash.
				+RUN apt-get update && \
				+    curl -fsSL -o /tmp/git-lfs-repo.sh https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh && \
				+    bash /tmp/git-lfs-repo.sh && \
				+    rm -f /tmp/git-lfs-repo.sh && \
				+    apt-get update && \
				+    apt-get install --no-install-recommends -y git-lfs && \
				+    git lfs install && \
				+    rm -rf /var/lib/apt/lists/*
			
 
				+
			
 
				+# NVIDIA Nsight Systems 2020.3.1, plus the GNU build tools (build-essential).
				+# Repository setup, index refresh and both installs run in a single layer: an `apt-get update`
				+# in a layer of its own can be served from Docker's build cache with an outdated index.
				+# The package index is removed at the end of the same layer.
				+# NOTE(review): no signing key is imported for the NVIDIA devtools repository; if apt rejects
				+# the repository as unsigned, its public key has to be added before the second update - confirm.
				+RUN apt-get update -y && \
				+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
				+        apt-transport-https \
				+        build-essential \
				+        ca-certificates \
				+        gnupg \
				+        wget && \
				+    echo "deb https://developer.download.nvidia.com/devtools/repo-deb/x86_64/ /" >> /etc/apt/sources.list.d/nsight.list && \
				+    apt-get update -y && \
				+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-2020.3.1 && \
				+    rm -rf /var/lib/apt/lists/*
				+
				+# Disabled alternative: CUDA / machine-learning apt repositories for Ubuntu 18.04.
				+#RUN apt-get update && apt-get install -y --no-install-recommends && \
				+#    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
				+#    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
				+
				+# Disabled alternative: NVIDIA Nsight Systems 2019.3.7
				+#RUN apt-get update -y && \
				+#    apt-get install -y cuda-nsight-systems-10-1 nsight-systems-2019.3.7
			
 
				+
			
 
				+##### START : netcdf installation #####
				+
				+# One working directory per component under /usr/netcdf/WORK_DIR.
				+RUN mkdir -p \
				+        /usr/netcdf/WORK_DIR/zlib \
				+        /usr/netcdf/WORK_DIR/hdf5 \
				+        /usr/netcdf/WORK_DIR/netCDF \
				+        /usr/netcdf/WORK_DIR/netCDF-C++ \
				+        /usr/netcdf/WORK_DIR/netCDF-Fortran
				+
				+# Download and unpack each source archive. The archive is deleted in the layer that
				+# fetched it, so only the extracted tree is kept in the image.
				+
				+# zlib 1.2.8
				+RUN cd /usr/netcdf/WORK_DIR/zlib && \
				+    wget ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4/zlib-1.2.8.tar.gz && \
				+    tar -xvzf zlib-1.2.8.tar.gz && \
				+    rm -f zlib-1.2.8.tar.gz
				+
				+# HDF5 1.8.12
				+RUN cd /usr/netcdf/WORK_DIR/hdf5 && \
				+    wget ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4/hdf5-1.8.12.tar.gz && \
				+    tar -xvzf hdf5-1.8.12.tar.gz && \
				+    rm -f hdf5-1.8.12.tar.gz
				+
				+# netCDF-C 4.3.0
				+RUN cd /usr/netcdf/WORK_DIR/netCDF && \
				+    wget ftp://ftp.unidata.ucar.edu/pub/netcdf/old/netcdf-4.3.0.tar.gz && \
				+    tar -xvzf netcdf-4.3.0.tar.gz && \
				+    rm -f netcdf-4.3.0.tar.gz
				+
				+# netCDF-C++ 4.2.1 (the GitHub archive unpacks to netcdf-cxx4-4.2.1)
				+RUN cd /usr/netcdf/WORK_DIR/netCDF-C++ && \
				+    wget https://github.com/Unidata/netcdf-cxx4/archive/v4.2.1.tar.gz && \
				+    tar -xvzf v4.2.1.tar.gz && \
				+    rm -f v4.2.1.tar.gz
				+
				+# netCDF-Fortran 4.2
				+RUN cd /usr/netcdf/WORK_DIR/netCDF-Fortran && \
				+    wget ftp://ftp.unidata.ucar.edu/pub/netcdf/old/netcdf-fortran-4.2.tar.gz && \
				+    tar -xvzf netcdf-fortran-4.2.tar.gz && \
				+    rm -f netcdf-fortran-4.2.tar.gz
			
 
				+
			
 
				+# Compilers and flags exported to every later build step (PGI toolchain).
				+ENV CC=pgcc CFLAGS="-O tp=p7-64" CXX=pgc++ CXXFLAGS="-O tp=p7-64" FC=pgfortran FCFLAGS="-O tp=p7-64" F77=pgfortran FFLAGS="-O tp=p7-64" CPPFLAGS="-DpgiFortran"
				+
				+# Install prefix layout; -p makes this a no-op for directories that already exist.
				+RUN mkdir -p /usr/local/bin /usr/local/include /usr/local/lib
				+
				+# Search paths for the /usr/local prefix and for the Miniconda install under /opt/anaconda3.
				+# ${VAR:+${VAR}:} expands to "<old value>:" only when VAR is already set, so an unset variable
				+# does not leave a leading ':' (an empty path entry, conventionally read as the current directory).
				+ENV PATH="$PATH:/usr/local/bin:/opt/anaconda3/bin:/usr/bin" \
				+    LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib" \
				+    LIBRARY_PATH="${LIBRARY_PATH:+${LIBRARY_PATH}:}/usr/local/lib" \
				+    CPATH="${CPATH:+${CPATH}:}/usr/local/include"
			
 
				+
			
 
				+# zlib 1.2.8: configure, compile and install into /usr/local.
				+# `set -e` stops the step at the first failing command.
				+RUN set -e; \
				+    cd /usr/netcdf/WORK_DIR/zlib/zlib-1.2.8; \
				+    ./configure --prefix=/usr/local; \
				+    make; \
				+    make install
			
 
				+
			
 
				+# HDF5 1.8.12 with the Fortran and C++ interfaces enabled, installed into /usr/local.
				+# CPP is cleared before configuring - presumably so configure selects the C preprocessor itself.
				+RUN set -e; \
				+    unset CPP; \
				+    cd /usr/netcdf/WORK_DIR/hdf5/hdf5-1.8.12; \
				+    ./configure --prefix=/usr/local --enable-fortran --enable-c++; \
				+    make; \
				+    make install
			
 
				+
			
 
				+# netCDF-C 4.3.0, built against the zlib and HDF5 installed under /usr/local.
				+# CFLAGS/LDFLAGS point the compiler and linker at that prefix (each flag listed once).
				+# LD_LIBRARY_PATH is left alone: the ENV instruction earlier in this file already
				+# includes /usr/local/lib.
				+RUN export CFLAGS="-O tp=p7-64 -I /usr/local/include" && \
				+    export LDFLAGS="-L/usr/local/lib" && \
				+    cd /usr/netcdf/WORK_DIR/netCDF/netcdf-4.3.0 && \
				+    ./configure --prefix=/usr/local && \
				+    make && \
				+    make install
			
 
				+
			
 
				+# netCDF-C++ 4.2.1, built against the netCDF-C / HDF5 installed under /usr/local.
				+# Include and library flags are listed once each. No /usr/local/hdf5/lib entry is added to
				+# LD_LIBRARY_PATH because HDF5 is configured with --prefix=/usr/local, and /usr/local/lib
				+# is already on LD_LIBRARY_PATH via the ENV instruction earlier in this file.
				+RUN export CFLAGS="-O tp=p7-64 -I /usr/local/include" && \
				+    export CPPFLAGS="-DpgiFortran -I /usr/local/include" && \
				+    export LDFLAGS="-L/usr/local/lib" && \
				+    cd /usr/netcdf/WORK_DIR/netCDF-C++/netcdf-cxx4-4.2.1 && \
				+    ./configure --prefix=/usr/local && \
				+    make && \
				+    make install
			
 
				+
			
 
				+# netCDF-Fortran 4.2, built against the netCDF-C installed under /usr/local.
				+# The C, Fortran and preprocessor flags all add /usr/local/include (listed once each).
				+# LDFLAGS is cleared for this build; LIBRARY_PATH from the earlier ENV instruction still
				+# contains /usr/local/lib.
				+RUN export CFLAGS="-O tp=p7-64 -I /usr/local/include" && \
				+    export FCFLAGS="-O tp=p7-64 -I /usr/local/include" && \
				+    export FFLAGS="-O tp=p7-64 -I /usr/local/include" && \
				+    export CPPFLAGS="-DpgiFortran -I /usr/local/include" && \
				+    unset LDFLAGS && \
				+    cd /usr/netcdf/WORK_DIR/netCDF-Fortran/netcdf-fortran-4.2 && \
				+    ./configure --prefix=/usr/local && \
				+    make && \
				+    make install
				+##### END : netcdf installation #####
			
 
				+
			
 
				+# Miniconda, installed non-interactively (-b) into /opt/anaconda3, plus the conda netcdf4 package.
				+# The installer is removed and `conda clean -afy` drops downloaded package archives and
				+# caches in the same layer, so neither ends up in the image.
				+# NOTE(review): the "latest" installer is unpinned and not checksum-verified - consider
				+# pinning a specific release and checking its SHA-256.
				+RUN wget -q -O /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
				+    bash /tmp/miniconda.sh -b -p /opt/anaconda3 && \
				+    rm -f /tmp/miniconda.sh && \
				+    /opt/anaconda3/bin/conda install -y -q netcdf4 && \
				+    /opt/anaconda3/bin/conda clean -afy
			
 
				+
			
 
				+# Add the PGI 2019 toolchain's cuda/10.1/lib64 directory to the loader search path.
				+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/pgi/linux86-64-llvm/2019/cuda/10.1/lib64/"
				+
				+# Lab material, placed at /labs inside the image.
				+# COPY is used because only a plain copy from the build context is needed.
				+# NOTE(review): assumes miniapps-profiler is a directory; if it is a local tar archive that
				+# relied on ADD's auto-extraction, switch back to ADD.
				+COPY miniapps-profiler /labs
			
--- a/hpc/miniprofiler/English/.ipynb_checkpoints/profiling_start-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/.ipynb_checkpoints/profiling_start-checkpoint.ipynb
@@ -0,0 +1,101 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## Profiling Tutorial\n",
			
 
				+    "\n",
			
 
				+    "### Learning objectives\n",
			
 
				+    "Learn how to profile your application with NVIDIA Nsight Systems and NVTX API calls to find performance limiters and bottlenecks, and apply incremental parallelization strategies using the OpenACC programming model. In this lab, you will:\n",
			
 
				+    "\n",
			
 
				+    "- Understand what a profiler is and which NVIDIA Nsight tool to choose in order to profile your application\n",
			
 
				+    "- Profile a sequential weather modeling application (integrated with NVIDIA Tools Extension (NVTX) APIs) with NVIDIA Nsight Systems to capture and trace CPU events and time ranges\n",
			
 
				+    "- Understand how to use NVIDIA Nsight Systems profiler’s report to detect hotspots and apply OpenACC compute constructs to the serial application to parallelise it on the GPU\n",
			
 
				+    "- Learn how to use Nsight Systems to identify issues such as an underutilized GPU device and unnecessary data movements in the application, and to apply optimization strategies step by step to expose more parallelism and utilize the computer’s CPU and GPU\n",
			
 
				+    "- Learn how to use occupancy to address performance limitations\n",
			
 
				+    "- Learn to follow a cyclical process (analyze, parallelize, optimize) to help you identify the portions of the code that would benefit from GPU acceleration and apply parallelisation strategies and optimization techniques to see additional speedups and improve performance\n",
			
 
				+    "\n",
			
 
				+    "In this lab, we will be optimizing the serial Weather Simulation application, written in both the C and Fortran programming languages. You are welcome to have a look at the mini weather lab and follow the steps to familiarize yourself with the application. \n",
			
 
				+    "\n",
			
 
				+    "An optional exercise on how to use Nsight Compute profiler is available for advanced users. This exercise covers basics on how and when to use the Nsight Compute profiler to get you started. Steps to unravel performance limiters will be presented through a simple exercise.\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "### Tutorial Outline\n",
			
 
				+    "- Introduction ([C](C/jupyter_notebook/profiling-c.ipynb) , [Fortran](Fortran/jupyter_notebook/profiling-fortran.ipynb))\n",
			
 
				+    "    - Overview of Nsight profiler tools\n",
			
 
				+    "    - How to use NVTX APIs\n",
			
 
				+    "    - Overview of [Mini Weather application](C/jupyter_notebook/miniweather.ipynb)\n",
			
 
				+    "    - Optimization Steps to parallel programming with OpenACC\n",
			
 
				+    "- Lab 1 ([C](C/jupyter_notebook/profiling-c-lab1.ipynb) , [Fortran](Fortran/jupyter_notebook/profiling-fortran-lab1.ipynb))\n",
			
 
				+    "    - How to compile a serial application with PGI compiler\n",
			
 
				+    "    - How to profile a serial application with Nsight Systems and NVTX APIs\n",
			
 
				+    "    - How to use profiler's report to find hotspots\n",
			
 
				+    "    - Scaling and Amdahl's law and why it matters\n",
			
 
				+    "- Lab 2 ([C](C/jupyter_notebook/profiling-c-lab2.ipynb) , [Fortran](Fortran/jupyter_notebook/profiling-fortran-lab2.ipynb))\n",
			
 
				+    "    - Parallelise the serial application using OpenACC compute directives\n",
			
 
				+    "    - How to compile a parallel application with PGI compiler\n",
			
 
				+    "    - What does the compiler feedback tell us\n",
			
 
				+    "    - Profile with Nsight Systems\n",
			
 
				+    "    - Finding bottlenecks from Nsight Systems report\n",
			
 
				+    "- Lab 3 ([C](C/jupyter_notebook/profiling-c-lab3.ipynb) , [Fortran](Fortran/jupyter_notebook/profiling-fortran-lab3.ipynb))\n",
			
 
				+    "    - How to combine the knowledge from compiler feedback and profiler to optimize the application\n",
			
 
				+    "    - What is occupancy\n",
			
 
				+    "    - Demystifying Gangs, Workers, and Vectors\n",
			
 
				+    "    - Apply collapse clause to optimize the application further\n",
			
 
				+    "- Lab 4 ([C](C/jupyter_notebook/profiling-c-lab4.ipynb) , [Fortran](Fortran/jupyter_notebook/profiling-fortran-lab4.ipynb))\n",
			
 
				+    "    - Inspect data movement from the profiler's report\n",
			
 
				+    "    - Data management with OpenACC\n",
			
 
				+    "    - Apply incremental parallelization strategies and use profiler's report for the next step\n",
			
 
				+    "- Lab 5 ([C](C/jupyter_notebook/profiling-c-lab5.ipynb) , [Fortran](Fortran/jupyter_notebook/profiling-fortran-lab5.ipynb))\n",
			
 
				+    "    - Overview of Nsight Compute\n",
			
 
				+    "    - When and How to use Nsight Compute\n",
			
 
				+    "    - What does the profiler tell us, where is the bottleneck\n",
			
 
				+    "    - How to use baselines with Nsight Compute\n",
			
 
				+    "    \n",
			
 
				+    "\n",
			
 
				+    "### Tutorial Duration\n",
			
 
				+    "The lab material will be presented in a 2hr session. Link to material is available for download at the end of the lab.\n",
			
 
				+    "\n",
			
 
				+    "### Content Level\n",
			
 
				+    "Beginner, Intermediate\n",
			
 
				+    "\n",
			
 
				+    "### Target Audience and Prerequisites\n",
			
 
				+    "The target audience for this lab is researchers/graduate students and developers who are interested in getting hands-on experience with NVIDIA Nsight Systems through profiling a real-life parallel application using the OpenACC programming model and NVTX.\n",
			
 
				+    "\n",
			
 
				+    "While this tutorial does not assume any CUDA expertise, basic knowledge of OpenACC programming (e.g. compute constructs), GPU architecture, and programming experience with C/C++ or Fortran is desirable.\n",
			
 
				+    "\n",
			
 
				+    "### Start Here\n",
			
 
				+    "You can choose between a [C-based code](C/jupyter_notebook/profiling-c.ipynb) and a [Fortran-based code](Fortran/jupyter_notebook/profiling-fortran.ipynb).\n",
			
 
				+    "\n",
			
 
				+    "--- \n",
			
 
				+    "\n",
			
 
				+    "## Licensing \n",
			
 
				+    "\n",
			
 
				+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). \n"
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "anaconda-cloud": {},
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.7.4"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 1
			
 
				+}
			
--- a/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/miniweather-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/miniweather-checkpoint.ipynb
@@ -0,0 +1,114 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# A MINI-WEATHER APPLICATION\n",
			
 
				+    "\n",
			
 
				+    "In this lab we will accelerate a Fluid Simulation in the context of atmosphere and weather simulation.\n",
			
 
				+    "The mini weather code mimics the basic dynamics seen in atmospheric weather and climate.\n",
			
 
				+    "\n",
			
 
				+    "The figure below demonstrates how a narrow jet of fast and slightly cold wind is injected into a balanced, neutral atmosphere at rest from the left domain near the model.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Time.jpg\" width=\"80%\" height=\"80%\">\n",
			
 
				+    "\n",
			
 
				+    "Simulation is a repetitive process from 0 to the desired simulated time, increasing by Δt on every iteration.\n",
			
 
				+    "Each Δt step is practically the same operation. Each simulation is solving a differential equation that represents how the flow of the atmosphere (fluid) changes according to small perturbations. To simplify this solution the code uses dimensional splitting: Each dimension X and Z are treated independently.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/X_Y.jpg\" width=\"80%\" height=\"80%\">\n",
			
 
				+    "\n",
			
 
				+    "The differential equation has a time derivative that needs integrating, and a simple low-storage Runge-Kutta ODE solver is used to integrate the time derivative. Each time step, the order in which the dimensions are solved is reversed, giving second-order accuracy. \n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Range-Kutta.jpg\" width=\"70%\" height=\"70%\">\n",
			
 
				+    "\n",
			
 
				+    "### The objective of this exercise is not to delve into the maths but to make use of OpenACC to parallelize the code and improve its performance.\n",
			
 
				+    "\n",
			
 
				+    "The general flow of the code is as shown in diagram below. For each time step the differential equations are solved.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Outer_Loop.jpg\" width=\"70%\" height=\"70%\">\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "```cpp\n",
			
 
				+    "while (etime < sim_time) {\n",
			
 
				+    "    //If the time step leads to exceeding the simulation time, shorten it for the last step\n",
			
 
				+    "    if (etime + dt > sim_time) { dt = sim_time - etime; }\n",
			
 
				+    "    //Perform a single time step\n",
			
 
				+    "    perform_timestep(state,state_tmp,flux,tend,dt);\n",
			
 
				+    "    //Inform the user\n",
			
 
				+    "    if (masterproc) { printf( \"Elapsed Time: %lf / %lf\\n\", etime , sim_time ); }\n",
			
 
				+    "    //Update the elapsed time and output counter\n",
			
 
				+    "    etime = etime + dt;\n",
			
 
				+    "    output_counter = output_counter + dt;\n",
			
 
				+    "    //If it's time for output, reset the counter, and do output\n",
			
 
				+    "    if (output_counter >= output_freq) {\n",
			
 
				+    "      output_counter = output_counter - output_freq;\n",
			
 
				+    "      output(state,etime);\n",
			
 
				+    "    }\n",
			
 
				+    "  }\n",
			
 
				+    "  \n",
			
 
				+    "```\n",
			
 
				+    "\n",
			
 
				+    "At every time step the order in which the directions are solved is reversed to achieve second-order accuracy.\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Time_Step.jpg\" width=\"70%\" height=\"70%\">\n",
			
 
				+    "\n",
			
 
				+    "```cpp\n",
			
 
				+    "void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) {\n",
			
 
				+    "  if (direction_switch) {\n",
			
 
				+    "    //x-direction first\n",
			
 
				+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , flux , tend );\n",
			
 
				+    "    //z-direction second\n",
			
 
				+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , flux , tend );\n",
			
 
				+    "  } else {\n",
			
 
				+    "    //z-direction first\n",
			
 
				+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , flux , tend );\n",
			
 
				+    "    //x-direction second\n",
			
 
				+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , flux , tend );\n",
			
 
				+    "  }\n",
			
 
				+    "  if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; }\n",
			
 
				+    "}\n",
			
 
				+    "```\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Semi_Discrete.jpg\" width=\"70%\" height=\"70%\">\n",
			
 
				+    "\n",
			
 
				+    "--- \n",
			
 
				+    "\n",
			
 
				+    "## Licensing \n",
			
 
				+    "\n",
			
 
				+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "anaconda-cloud": {},
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.7.4"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 1
			
 
				+}
			
--- a/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-checkpoint.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab1-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab1-checkpoint.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab2-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab2-checkpoint.ipynb
@@ -0,0 +1,184 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "In this lab, we will optimize the weather simulation application written in C++ (if you prefer to use Fortran, click [this link](../../Fortran/jupyter_notebook/profiling-fortran.ipynb)). \n",
			
 
				+    "\n",
			
 
				+    "Let's execute the cell below to display information about the GPUs running on the server by running the pgaccelinfo command, which ships with the PGI compiler that we will be using. To do this, execute the cell block below by giving it focus (clicking on it with your mouse), and hitting Ctrl-Enter, or pressing the play button in the toolbar above. If all goes well, you should see some output returned below the grey cell."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!pgaccelinfo"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## Exercise 2 \n",
			
 
				+    "\n",
			
 
				+    "### Learning objectives\n",
			
 
				+    "Learn how to identify and parallelise the computationally expensive routines in your application using OpenACC compute constructs (A compute construct is a parallel, kernels, or serial construct.). In this exercise you will:\n",
			
 
				+    "\n",
			
 
				+    "- Implement OpenACC parallelism using parallel directives to parallelise the serial application\n",
			
 
				+    "- Learn how to compile your parallel application with PGI compiler\n",
			
 
				+    "- Benchmark and compare the parallel version of the application with the serial version\n",
			
 
				+    "- Learn how to interpret PGI compiler feedback to ensure the applied optimizations were successful"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "From the top menu, click on *File*, then *Open* `miniWeather_openacc.cpp` and `Makefile` from the `English/C/source_code/lab2` directory and inspect the code before running the cells below. We have already added OpenACC compute directives (`#pragma acc parallel`) around the expensive routines (loops) in the code.\n",
			
 
				+    "\n",
			
 
				+    "Once done, compile the code with `make`. View the PGI compiler feedback (enabled by adding `-Minfo=accel` flag) and investigate the compiler feedback for the OpenACC code. The compiler feedback provides useful information about applied optimizations."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!cd ../source_code/lab2 && make"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Let's inspect part of the compiler feedback and see what it's telling us.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/cfeedback1.png\">\n",
			
 
				+    "\n",
			
 
				+    "- Using `-ta=tesla:managed`, instruct the compiler to build for an NVIDIA Tesla GPU using \"CUDA Managed Memory\"\n",
			
 
				+    "- Using `-Minfo` command-line option, we will see all output from the compiler. In this example, we use `-Minfo=accel` to only see the output corresponding to the accelerator (in this case an NVIDIA GPU).\n",
			
 
				+    "- The first line of the output, `compute_tendencies_x`, tells us which function the following information is in reference to.\n",
			
 
				+    "- The line starting with 227, shows we created a parallel OpenACC loop. This loop is made up of gangs (a grid of blocks in CUDA language) and vector parallelism (threads in CUDA language) with the vector size being 128 per gang. `277, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */`\n",
			
 
				+    "- The rest of the information concerns data movement. Compiler detected possible need to move data and handled it for us. We will get into this later in this lab.\n",
			
 
				+    "\n",
			
 
				+    "It is very important to inspect the feedback to make sure the compiler is doing what you have asked of it.\n",
			
 
				+    "\n",
			
 
				+    "Now, **Run** the application for small values of `nx_glob`,`nz_glob`, and `sim_time`: **400, 200, 10**"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!cd ../source_code/lab2 && nsys profile -t nvtx --stats=true --force-overwrite true -o miniWeather_3 ./miniWeather 400 200 10"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "You can see that the changes made actually slowed down the code: it runs slower than the non-accelerated, CPU-only version. Let's check out the profiler's report. [Download the profiler output](../source_code/lab2/miniWeather_3.qdrep) and open it via the GUI. \n",
			
 
				+    "\n",
			
 
				+    "From the \"timeline view\" on the top pane, double click on the \"CUDA\" from the function table on the left and expand it. Zoom in on the timeline and you can see a pattern similar to the screenshot below. The blue boxes are the compute kernels and each of these groupings of kernels is surrounded by purple and teal boxes (annotated with red color) representing data movements. \n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/nsys_slow.png\" width=\"80%\" height=\"80%\">\n",
			
 
				+    "\n",
			
 
				+    "Hover your mouse over the kernels (blue boxes) one by one in each row and check out the information provided.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/occu-1.png\" width=\"60%\" height=\"60%\">\n",
			
 
				+    "\n",
			
 
				+    "**Note**: In the next two exercises, we start optimizing the application by improving the occupancy and reducing data movements."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## Post-Lab Summary\n",
			
 
				+    "\n",
			
 
				+    "If you would like to download this lab for later viewing, it is recommended that you go to your browser's File menu (not the Jupyter notebook File menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip-file of the files you've been working on, and download it with the link below."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "%%bash\n",
			
 
				+    "cd ..\n",
			
 
				+    "rm -f openacc_profiler_files.zip\n",
			
 
				+    "zip -r openacc_profiler_files.zip *"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "**After** executing the above zip command, you should be able to download the zip file [here](../openacc_profiler_files.zip)."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "-----\n",
			
 
				+    "\n",
			
 
				+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../profiling_start.ipynb>HOME</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style=\"float:center\"> <a href=profiling-c-lab3.ipynb>NEXT</a></span> </p>\n",
			
 
				+    "\n",
			
 
				+    "-----"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# Links and Resources\n",
			
 
				+    "\n",
			
 
				+    "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
			
 
				+    "\n",
			
 
				+    "[NVIDIA Nsight Systems](https://docs.nvidia.com/nsight-systems/)\n",
			
 
				+    "\n",
			
 
				+    "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
			
 
				+    "\n",
			
 
				+    "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
			
 
				+    "\n",
			
 
				+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
			
 
				+    "\n",
			
 
				+    "--- \n",
			
 
				+    "\n",
			
 
				+    "## Licensing \n",
			
 
				+    "\n",
			
 
				+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "anaconda-cloud": {},
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.7.4"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 1
			
 
				+}
			
--- a/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab3-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab3-checkpoint.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab4-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab4-checkpoint.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab5-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/.ipynb_checkpoints/profiling-c-lab5-checkpoint.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/3launch5skip.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/3launch5skip.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/MPI_Division.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/MPI_Division.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Nsight
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Nsight
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Optimization_Cycle.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Optimization_Cycle.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Outer_Loop.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Outer_Loop.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Range-Kutta.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Range-Kutta.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Screenshot
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Screenshot
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Semi_Discrete.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Semi_Discrete.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Semi_Discrete_Step.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Semi_Discrete_Step.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Time.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Time.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/Time_Step.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/Time_Step.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/X_Y.jpg
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/X_Y.jpg
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/allsection-compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/allsection-compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/baseline-compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/baseline-compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/c2compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/c2compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/ccompute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/ccompute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cexer5.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cexer5.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback1-2.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback1-2.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback2.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback2.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback3-1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback3-1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback3.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback3.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback4.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback4.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/charts-compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/charts-compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/checkerpy.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/checkerpy.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cli-out.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cli-out.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/compute-open.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/compute-open.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cpu.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cpu.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/cuda.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/cuda.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/diagram.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/diagram.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx_gui.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx_gui.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx_terminal.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx_terminal.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/f2compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/f2compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/fcompute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/fcompute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1-0.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1-0.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1-1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1-1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback2.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback2.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback3.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback3.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback4.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback4.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/fortran_nvtx.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/fortran_nvtx.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/fortranexer5.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/fortranexer5.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/fulllaunch.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/fulllaunch.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/laplas3.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/laplas3.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/launch-compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/launch-compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/list-set.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/list-set.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/nsight_open
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/nsight_open
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/nsight_open.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/nsight_open.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_data_mv.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_data_mv.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_fast_mv.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_fast_mv.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_slow.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_slow.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_slow_mv.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_slow_mv.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/nvtx.PNG
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/nvtx.PNG
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/occu-1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/occu-1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/occu-2.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/occu-2.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/occu-3.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/occu-3.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/page-compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/page-compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q1-1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q1-1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q1-2.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q1-2.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q2-1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q2-1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q2-1_zoom.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q2-1_zoom.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q2-2_zoom.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q2-2_zoom.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q3-1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q3-1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q3-2.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q3-2.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1_zoom.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1_zoom.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1_zoom2.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1_zoom2.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2.PNG
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2.PNG
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2_zoom.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2_zoom.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2_zoom2.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2_zoom2.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/rule-compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/rule-compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/sections-compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/sections-compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/summary-compute.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/summary-compute.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/thread.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/thread.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/images/triangle.png
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/images/triangle.png
--- a/hpc/miniprofiler/English/C/jupyter_notebook/miniweather.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/miniweather.ipynb
@@ -0,0 +1,114 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# A MINI-WEATHER APPLICATION\n",
			
 
				+    "\n",
			
 
				+    "In this lab we will accelerate a Fluid Simulation in the context of atmosphere and weather simulation.\n",
			
 
				+    "The mini weather code mimics the basic dynamics seen in atmospheric weather and climate.\n",
			
 
				+    "\n",
			
 
				+    "The figure below demonstrates how a narrow jet of fast and slightly cold wind is injected into a balanced, neutral atmosphere at rest from the left domain near the model top.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Time.jpg\" width=\"80%\" height=\"80%\">\n",
			
 
				+    "\n",
			
 
				+    "Simulation is a repetitive process from 0 to the desired simulated time, increasing by Δt on every iteration.\n",
			
 
				+    "Each Δt step is practically the same operation. Each simulation is solving a differential equation that represents how the flow of the atmosphere (fluid) changes according to small perturbations. To simplify this solution, the code uses dimensional splitting: the X and Z dimensions are treated independently.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/X_Y.jpg\" width=\"80%\" height=\"80%\">\n",
			
 
				+    "\n",
			
 
				+    "The differential equation has a time derivative that needs integrating, and a simple low-storage Runge-Kutta ODE solver is used to integrate the time derivative. Each time step, the order in which the dimensions are solved is reversed, giving second-order accuracy. \n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Range-Kutta.jpg\" width=\"70%\" height=\"70%\">\n",
			
 
				+    "\n",
			
 
				+    "### The objective of this exercise is not to delve into the mathematics but to make use of OpenACC to parallelize and improve the performance.\n",
			
 
				+    "\n",
			
 
				+    "The general flow of the code is as shown in the diagram below. For each time step the differential equations are solved.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Outer_Loop.jpg\" width=\"70%\" height=\"70%\">\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "```cpp\n",
			
 
				+    "while (etime < sim_time) {\n",
			
 
				+    "    //If the time step leads to exceeding the simulation time, shorten it for the last step\n",
			
 
				+    "    if (etime + dt > sim_time) { dt = sim_time - etime; }\n",
			
 
				+    "    //Perform a single time step\n",
			
 
				+    "    perform_timestep(state,state_tmp,flux,tend,dt);\n",
			
 
				+    "    //Inform the user\n",
			
 
				+    "    if (masterproc) { printf( \"Elapsed Time: %lf / %lf\\n\", etime , sim_time ); }\n",
			
 
				+    "    //Update the elapsed time and output counter\n",
			
 
				+    "    etime = etime + dt;\n",
			
 
				+    "    output_counter = output_counter + dt;\n",
			
 
				+    "    //If it's time for output, reset the counter, and do output\n",
			
 
				+    "    if (output_counter >= output_freq) {\n",
			
 
				+    "      output_counter = output_counter - output_freq;\n",
			
 
				+    "      output(state,etime);\n",
			
 
				+    "    }\n",
			
 
				+    "  }\n",
			
 
				+    "  \n",
			
 
				+    "```\n",
			
 
				+    "\n",
			
 
				+    "At every time step the order of the directions is reversed to obtain second-order accuracy.\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Time_Step.jpg\" width=\"70%\" height=\"70%\">\n",
			
 
				+    "\n",
			
 
				+    "```cpp\n",
			
 
				+    "void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) {\n",
			
 
				+    "  if (direction_switch) {\n",
			
 
				+    "    //x-direction first\n",
			
 
				+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , flux , tend );\n",
			
 
				+    "    //z-direction second\n",
			
 
				+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , flux , tend );\n",
			
 
				+    "  } else {\n",
			
 
				+    "    //z-direction first\n",
			
 
				+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , flux , tend );\n",
			
 
				+    "    //x-direction second\n",
			
 
				+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );\n",
			
 
				+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , flux , tend );\n",
			
 
				+    "  }\n",
			
 
				+    "  if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; }\n",
			
 
				+    "}\n",
			
 
				+    "```\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/Semi_Discrete.jpg\" width=\"70%\" height=\"70%\">\n",
			
 
				+    "\n",
			
 
				+    "--- \n",
			
 
				+    "\n",
			
 
				+    "## Licensing \n",
			
 
				+    "\n",
			
 
				+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "anaconda-cloud": {},
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.7.4"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 1
			
 
				+}
			
--- a/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab1.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab1.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab2.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab2.ipynb
@@ -0,0 +1,184 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "In this lab, we will optimize the weather simulation application written in C++ (if you prefer to use Fortran, click [this link](../../Fortran/jupyter_notebook/profiling-fortran.ipynb)). \n",
			
 
				+    "\n",
			
 
				+    "Let's execute the cell below to display information about the GPUs running on the server by running the pgaccelinfo command, which ships with the PGI compiler that we will be using. To do this, execute the cell block below by giving it focus (clicking on it with your mouse), and hitting Ctrl-Enter, or pressing the play button in the toolbar above. If all goes well, you should see some output returned below the grey cell."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!pgaccelinfo"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## Exercise 2 \n",
			
 
				+    "\n",
			
 
				+    "### Learning objectives\n",
			
 
				+    "Learn how to identify and parallelise the computationally expensive routines in your application using OpenACC compute constructs (A compute construct is a parallel, kernels, or serial construct.). In this exercise you will:\n",
			
 
				+    "\n",
			
 
				+    "- Implement OpenACC parallelism using parallel directives to parallelise the serial application\n",
			
 
				+    "- Learn how to compile your parallel application with the PGI compiler\n",
			
 
				+    "- Benchmark and compare the parallel version of the application with the serial version\n",
			
 
				+    "- Learn how to interpret PGI compiler feedback to ensure the applied optimizations were successful"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "From the top menu, click on *File*, and *Open* `miniWeather_openacc.cpp` and `Makefile` from the `English/C/source_code/lab2` directory and inspect the code before running the cells below. We have already added OpenACC compute directives (`#pragma acc parallel`) around the expensive routines (loops) in the code.\n",
			
 
				+    "\n",
			
 
				+    "Once done, compile the code with `make`. View the PGI compiler feedback (enabled by adding `-Minfo=accel` flag) and investigate the compiler feedback for the OpenACC code. The compiler feedback provides useful information about applied optimizations."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!cd ../source_code/lab2 && make"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Let's inspect part of the compiler feedback and see what it's telling us.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/cfeedback1.png\">\n",
			
 
				+    "\n",
			
 
				+    "- Using `-ta=tesla:managed`, instruct the compiler to build for an NVIDIA Tesla GPU using \"CUDA Managed Memory\"\n",
			
 
				+    "- Using `-Minfo` command-line option, we will see all output from the compiler. In this example, we use `-Minfo=accel` to only see the output corresponding to the accelerator (in this case an NVIDIA GPU).\n",
			
 
				+    "- The first line of the output, `compute_tendencies_x`, tells us which function the following information is in reference to.\n",
			
 
				+    "- The line starting with 227, shows we created a parallel OpenACC loop. This loop is made up of gangs (a grid of blocks in CUDA language) and vector parallelism (threads in CUDA language) with the vector size being 128 per gang. `277, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */`\n",
			
 
				+    "- The rest of the information concerns data movement. The compiler detected a possible need to move data and handled it for us. We will get into this later in this lab.\n",
			
 
				+    "\n",
			
 
				+    "It is very important to inspect the feedback to make sure the compiler is doing what you have asked of it.\n",
			
 
				+    "\n",
			
 
				+    "Now, **Run** the application for small values of `nx_glob`,`nz_glob`, and `sim_time`: **400, 200, 10**"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!cd ../source_code/lab2 && nsys profile -t nvtx --stats=true --force-overwrite true -o miniWeather_3 ./miniWeather 400 200 10"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "You can see that the changes made actually slowed down the code and it runs slower compared to the non-accelerated CPU-only version. Let's check out the profiler's report. [Download the profiler output](../source_code/lab2/miniWeather_3.qdrep) and open it via the GUI. \n",
			
 
				+    "\n",
			
 
				+    "From the \"timeline view\" on the top pane, double click on the \"CUDA\" from the function table on the left and expand it. Zoom in on the timeline and you can see a pattern similar to the screenshot below. The blue boxes are the compute kernels and each of these groupings of kernels is surrounded by purple and teal boxes (annotated with red color) representing data movements. \n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/nsys_slow.png\" width=\"80%\" height=\"80%\">\n",
			
 
				+    "\n",
			
 
				+    "Hover your mouse over the kernels (blue boxes) one by one from each row and check out the provided information.\n",
			
 
				+    "\n",
			
 
				+    "<img src=\"images/occu-1.png\" width=\"60%\" height=\"60%\">\n",
			
 
				+    "\n",
			
 
				+    "**Note**: In the next two exercises, we start optimizing the application by improving the occupancy and reducing data movements."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## Post-Lab Summary\n",
			
 
				+    "\n",
			
 
				+    "If you would like to download this lab for later viewing, it is recommended that you go to your browser's File menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip-file of the files you've been working on, and download it with the link below."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "%%bash\n",
			
 
				+    "cd ..\n",
			
 
				+    "rm -f openacc_profiler_files.zip\n",
			
 
				+    "zip -r openacc_profiler_files.zip *"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "**After** executing the above zip command, you should be able to download the zip file [here](../openacc_profiler_files.zip)."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "-----\n",
			
 
				+    "\n",
			
 
				+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../profiling_start.ipynb>HOME</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style=\"float:center\"> <a href=profiling-c-lab3.ipynb>NEXT</a></span> </p>\n",
			
 
				+    "\n",
			
 
				+    "-----"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# Links and Resources\n",
			
 
				+    "\n",
			
 
				+    "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
			
 
				+    "\n",
			
 
				+    "[NVIDIA Nsight Systems](https://docs.nvidia.com/nsight-systems/)\n",
			
 
				+    "\n",
			
 
				+    "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
			
 
				+    "\n",
			
 
				+    "**NOTE**: To be able to see the Nsight Systems profiler output, please download the latest version of Nsight Systems from [here](https://developer.nvidia.com/nsight-systems).\n",
			
 
				+    "\n",
			
 
				+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
			
 
				+    "\n",
			
 
				+    "--- \n",
			
 
				+    "\n",
			
 
				+    "## Licensing \n",
			
 
				+    "\n",
			
 
				+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "anaconda-cloud": {},
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.7.4"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 1
			
 
				+}
			
--- a/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab3.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab3.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab4.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab4.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab5.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab5.ipynb
--- a/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c.ipynb
+++ b/hpc/miniprofiler/English/C/jupyter_notebook/profiling-c.ipynb
--- a/hpc/miniprofiler/English/C/source_code/lab1/.ipynb_checkpoints/profiling-c-lab1-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/source_code/lab1/.ipynb_checkpoints/profiling-c-lab1-checkpoint.ipynb
--- a/hpc/miniprofiler/English/C/source_code/lab1/Makefile
+++ b/hpc/miniprofiler/English/C/source_code/lab1/Makefile
@@ -0,0 +1,11 @@
 
				+# Builds the serial miniWeather executable for lab 1 from miniWeather_serial.cpp.
			
 
				+# CC holds the C++ compiler driver; the name is kept so existing overrides keep working.
			
 
				+CC := pgc++
			
 
				+# -O3 turns on optimization, -w silences compiler warnings.
			
 
				+CFLAGS := -O3 -w
			
 
				+# -Minfo=accel prints the compiler feedback for accelerator code.
			
 
				+ACCFLAGS := -Minfo=accel
			
 
				+# Libraries to link: NetCDF (the source includes netcdf.h) and libdl.
			
 
				+LDFLAGS := -lnetcdf -ldl
			
 
				+# Despite its name this is an include path: it locates nvtx3/nvToolsExt.h in the PGI 2019 / CUDA 10.1 tree.
			
 
				+NVTXLIB := -I/opt/pgi/linux86-64-llvm/2019/cuda/10.1/include/
			
 
				+
			
 
				+# Default goal: compile and link the serial version in a single step.
			
 
				+miniWeather: miniWeather_serial.cpp
			
 
				+	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_serial.cpp ${NVTXLIB} ${LDFLAGS}
			
 
				+
			
 
				+# clean produces no file, so declare it phony; otherwise a file named "clean" would disable it.
			
 
				+.PHONY: clean
			
 
				+clean:
			
 
				+	rm -f *.o miniWeather
			
--- a/hpc/miniprofiler/English/C/source_code/lab1/miniWeather_serial.cpp
+++ b/hpc/miniprofiler/English/C/source_code/lab1/miniWeather_serial.cpp
@@ -0,0 +1,762 @@
 
				+//////////////////////////////////////////////////////////////////////////////////////////
			
 
				+// miniWeather
			
 
				+// Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
			
 
				+// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
			
 
				+// For documentation, please see the attached documentation in the "documentation" folder
			
 
				+//////////////////////////////////////////////////////////////////////////////////////////
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <math.h>
			
 
				+#include <stdio.h>
			
 
				+#include <netcdf.h>
			
 
				+#include <nvtx3/nvToolsExt.h>
			
 
				+
			
 
				//---------------------------------------------------------------------------
				// Physical constants
				//---------------------------------------------------------------------------
				//Pi
				const double pi = 3.14159265358979323846264338327;
				//Gravitational acceleration (m / s^2)
				const double grav = 9.8;
				//Specific heat of dry air at constant pressure
				const double cp = 1004.;
				//Dry air constant for the equation of state (P = rho * rd * T)
				const double rd = 287.;
				//Standard pressure at the surface (Pascals)
				const double p0 = 1.e5;
				//Constant that converts rho*theta into pressure: P = C0 * (rho*theta)^gamm
				const double C0 = 27.5629410929725921310572974482;
				//Exponent used with C0. Numerically this equals cp / (cp - rd) = 1004 / 717.
				//Spelled "gamm" because, per the original author, the name "gamma" is already taken.
				const double gamm = 1.40027894002789400278940027894;

				//---------------------------------------------------------------------------
				// Domain and stability-related constants
				//---------------------------------------------------------------------------
				//Length of the domain in the x-direction (meters)
				const double xlen = 2.e4;
				//Length of the domain in the z-direction (meters)
				const double zlen = 1.e4;
				//How strongly to diffuse the solution: hv_beta in [0:1]
				const double hv_beta = 0.25;
				//"Courant, Friedrichs, Lewy" number (for numerical stability)
				const double cfl = 1.50;
				//Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
				const double max_speed = 450;
				//"Halo" size: number of extra cells on each side needed for a full reconstruction stencil
				const int hs = 2;
				//Number of cells in the interpolation stencil
				const int sten_size = 4;

				//---------------------------------------------------------------------------
				// Indexing parameters and direction flags
				//---------------------------------------------------------------------------
				//Number of fluid state variables
				const int NUM_VARS = 4;
				//Slots of the state array: density ("rho"), x-momentum ("rho * u"),
				//z-momentum ("rho * w") and density * potential temperature ("rho * theta")
				const int ID_DENS = 0;
				const int ID_UMOM = 1;
				const int ID_WMOM = 2;
				const int ID_RHOT = 3;
				//Flags telling an operation which direction it acts in
				const int DIR_X = 1;
				const int DIR_Z = 2;

				//---------------------------------------------------------------------------
				// Three-point quadrature rule on [0,1] (used as Gauss-Legendre quadrature in init)
				//---------------------------------------------------------------------------
				const int nqpoints = 3;
				double qpoints[] = {0.112701665379258311482073460022E0,
				                    0.500000000000000000000000000000E0,
				                    0.887298334620741688517926539980E0};
				double qweights[] = {0.277777777777777777777777777779E0,
				                     0.444444444444444444444444444444E0,
				                     0.277777777777777777777777777779E0};
			
 
				+
			
 
				//---------------------------------------------------------------------------
				// Configuration and grid description.
				// These are set during start-up (main / init). Note that main also shortens
				// dt for the final step so the run ends exactly at sim_time.
				//---------------------------------------------------------------------------
				//Total simulation time (seconds)
				double sim_time;
				//How often output is written (seconds)
				double output_freq;
				//Model time step (seconds)
				double dt;
				//Number of local grid cells in the x- and z-dimensions
				int nx;
				int nz;
				//Grid spacing in the x- and z-dimensions (meters)
				double dx;
				double dz;
				//Number of total (global) grid cells in the x- and z-dimensions
				int nx_glob;
				int nz_glob;
				//Beginning index of the local domain in the x- and z-directions
				int i_beg;
				int k_beg;
				//Number of ranks and this rank's id
				int nranks;
				int myrank;
				//Rank ids of the neighbours to the left and right in the global domain
				int left_rank;
				int right_rank;
				//Hydrostatic background profiles (allocated in init):
				//  *_cell arrays hold vertical cell averages, nz + 2*hs entries each
				//  *_int  arrays hold vertical cell-interface values, nz + 1 entries each
				double *hy_dens_cell;       //hydrostatic density (cell averages)
				double *hy_dens_theta_cell; //hydrostatic rho*theta (cell averages)
				double *hy_dens_int;        //hydrostatic density (cell interfaces)
				double *hy_dens_theta_int;  //hydrostatic rho*theta (cell interfaces)
				double *hy_pressure_int;    //hydrostatic pressure (cell interfaces)

				//---------------------------------------------------------------------------
				// Variables that are dynamic over the course of the simulation
				//---------------------------------------------------------------------------
				//Elapsed model time
				double etime;
				//Time accumulated since the last output; helps decide when to write again
				double output_counter;
				//Runtime arrays (allocated in init)
				double *state;     //Fluid state, (nx + 2*hs) * (nz + 2*hs) * NUM_VARS values
				double *state_tmp; //Scratch copy of the fluid state, same size as state
				double *flux;      //Cell interface fluxes, (nx + 1) * (nz + 1) * NUM_VARS values
				double *tend;      //Fluid state tendencies, nx * nz * NUM_VARS values
				//Number of outputs performed so far
				int num_out = 0;
				//Non-zero: the next time step sweeps x before z; zero: z before x
				int direction_switch = 1;
			
 
				+
			
 
				//Return the smaller of two doubles.
				//When a is not strictly less than b (ties, or a comparison involving NaN), b is returned.
				double dmin(double a, double b)
				{
				  return (a < b) ? a : b;
				}
			
 
				+
			
 
				+//Declaring the functions defined after "main"
			
 
				+void init();
			
 
				+void finalize();
			
 
				+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht);
			
 
				+void hydro_const_theta(double z, double &r, double &t);
			
 
				+void output(double *state, double etime);
			
 
				+void ncwrap(int ierr, int line);
			
 
				+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt);
			
 
				+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend);
			
 
				+void compute_tendencies_x(double *state, double *flux, double *tend);
			
 
				+void compute_tendencies_z(double *state, double *flux, double *tend);
			
 
				+void set_halo_values_x(double *state);
			
 
				+void set_halo_values_z(double *state);
			
 
				+
			
 
				+///////////////////////////////////////////////////////////////////////////////////////
			
 
				+// THE MAIN PROGRAM STARTS HERE
			
 
				+///////////////////////////////////////////////////////////////////////////////////////
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+  ///////////////////////////////////////////////////////////////////////////////////////
			
 
				+  // BEGIN USER-CONFIGURABLE PARAMETERS
			
 
				+  ///////////////////////////////////////////////////////////////////////////////////////
			
 
				+  //The x-direction length is twice as long as the z-direction length
			
 
				+  //So, you'll want to have nx_glob be twice as large as nz_glob
			
 
				+  nx_glob = 400;     //Number of total cells in the x-dirction
			
 
				+  nz_glob = 200;     //Number of total cells in the z-dirction
			
 
				+  sim_time = 1500;   //How many seconds to run the simulation
			
 
				+  output_freq = 100; //How frequently to output data to file (in seconds)
			
 
				+  ///////////////////////////////////////////////////////////////////////////////////////
			
 
				+  // END USER-CONFIGURABLE PARAMETERS
			
 
				+  ///////////////////////////////////////////////////////////////////////////////////////
			
 
				+
			
 
				+  if (argc == 4)
			
 
				+  {
			
 
				+    printf("The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]);
			
 
				+    nx_glob = atoi(argv[1]);
			
 
				+    nz_glob = atoi(argv[2]);
			
 
				+    sim_time = atoi(argv[3]);
			
 
				+  }
			
 
				+  else
			
 
				+  {
			
 
				+    printf("Using default values ...\n");
			
 
				+  }
			
 
				+  nvtxRangePushA("Total");
			
 
				+  init();
			
 
				+
			
 
				+  //Output the initial state
			
 
				+  output(state, etime);
			
 
				+
			
 
				+  ////////////////////////////////////////////////////
			
 
				+  // MAIN TIME STEP LOOP
			
 
				+  ////////////////////////////////////////////////////
			
 
				+
			
 
				+  nvtxRangePushA("while");
			
 
				+  while (etime < sim_time)
			
 
				+  {
			
 
				+    //If the time step leads to exceeding the simulation time, shorten it for the last step
			
 
				+    if (etime + dt > sim_time)
			
 
				+    {
			
 
				+      dt = sim_time - etime;
			
 
				+    }
			
 
				+
			
 
				+    //Perform a single time step
			
 
				+    nvtxRangePushA("perform_timestep");
			
 
				+    perform_timestep(state, state_tmp, flux, tend, dt);
			
 
				+    nvtxRangePop();
			
 
				+
			
 
				+    //Inform the user
			
 
				+
			
 
				+    printf("Elapsed Time: %lf / %lf\n", etime, sim_time);
			
 
				+
			
 
				+    //Update the elapsed time and output counter
			
 
				+    etime = etime + dt;
			
 
				+    output_counter = output_counter + dt;
			
 
				+    //If it's time for output, reset the counter, and do output
			
 
				+
			
 
				+    if (output_counter >= output_freq)
			
 
				+    {
			
 
				+      output_counter = output_counter - output_freq;
			
 
				+      output(state, etime);
			
 
				+    }
			
 
				+  }
			
 
				+  nvtxRangePop();
			
 
				+  finalize();
			
 
				+  nvtxRangePop();
			
 
				+}
			
 
				+
			
 
				+//Performs a single dimensionally split time step using a simple low-storate three-stage Runge-Kutta time integrator
			
 
				+//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the
			
 
				+//order of directions is alternated each time step.
			
 
				+//The Runge-Kutta method used here is defined as follows:
			
 
				+// q*     = q[n] + dt/3 * rhs(q[n])
			
 
				+// q**    = q[n] + dt/2 * rhs(q*  )
			
 
				+// q[n+1] = q[n] + dt/1 * rhs(q** )
			
 
				+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt)
			
 
				+{
			
 
				+  if (direction_switch)
			
 
				+  {
			
 
				+    //x-direction first
			
 
				+    semi_discrete_step(state, state, state_tmp, dt / 3, DIR_X, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state, dt / 1, DIR_X, flux, tend);
			
 
				+    //z-direction second
			
 
				+    semi_discrete_step(state, state, state_tmp, dt / 3, DIR_Z, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state, dt / 1, DIR_Z, flux, tend);
			
 
				+  }
			
 
				+  else
			
 
				+  {
			
 
				+    //z-direction second
			
 
				+    semi_discrete_step(state, state, state_tmp, dt / 3, DIR_Z, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state, dt / 1, DIR_Z, flux, tend);
			
 
				+    //x-direction first
			
 
				+    semi_discrete_step(state, state, state_tmp, dt / 3, DIR_X, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state, dt / 1, DIR_X, flux, tend);
			
 
				+  }
			
 
				+  if (direction_switch)
			
 
				+  {
			
 
				+    direction_switch = 0;
			
 
				+  }
			
 
				+  else
			
 
				+  {
			
 
				+    direction_switch = 1;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//Perform a single semi-discretized step in time with the form:
			
 
				+//state_out = state_init + dt * rhs(state_forcing)
			
 
				+//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out
			
 
				+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend)
			
 
				+{
			
 
				+  int i, k, ll, inds, indt;
			
 
				+  if (dir == DIR_X)
			
 
				+  {
			
 
				+    //Set the halo values  in the x-direction
			
 
				+    set_halo_values_x(state_forcing);
			
 
				+    //Compute the time tendencies for the fluid state in the x-direction
			
 
				+    compute_tendencies_x(state_forcing, flux, tend);
			
 
				+  }
			
 
				+  else if (dir == DIR_Z)
			
 
				+  {
			
 
				+    //Set the halo values  in the z-direction
			
 
				+    set_halo_values_z(state_forcing);
			
 
				+    //Compute the time tendencies for the fluid state in the z-direction
			
 
				+    compute_tendencies_z(state_forcing, flux, tend);
			
 
				+  }
			
 
				+
			
 
				+  /////////////////////////////////////////////////
			
 
				+  // TODO: THREAD ME
			
 
				+  /////////////////////////////////////////////////
			
 
				+  //Apply the tendencies to the fluid state
			
 
				+
			
 
				+  for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+  {
			
 
				+    for (k = 0; k < nz; k++)
			
 
				+    {
			
 
				+      for (i = 0; i < nx; i++)
			
 
				+      {
			
 
				+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
			
 
				+        indt = ll * nz * nx + k * nx + i;
			
 
				+        state_out[inds] = state_init[inds] + dt * tend[indt];
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//Compute the time tendencies of the fluid state using forcing in the x-direction
			
 
				+
			
 
				+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
			
 
				+//Then, compute the tendencies using those fluxes
			
 
				+void compute_tendencies_x(double *state, double *flux, double *tend)
			
 
				+{
			
 
				+  int i, k, ll, s, inds, indf1, indf2, indt;
			
 
				+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
			
 
				+  //Compute the hyperviscosity coeficient
			
 
				+  hv_coef = -hv_beta * dx / (16 * dt);
			
 
				+  /////////////////////////////////////////////////
			
 
				+  // TODO: THREAD ME
			
 
				+  /////////////////////////////////////////////////
			
 
				+  //Compute fluxes in the x-direction for each cell
			
 
				+
			
 
				+  for (k = 0; k < nz; k++)
			
 
				+  {
			
 
				+    for (i = 0; i < nx + 1; i++)
			
 
				+    {
			
 
				+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
			
 
				+      for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+      {
			
 
				+        for (s = 0; s < sten_size; s++)
			
 
				+        {
			
 
				+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + s;
			
 
				+          stencil[s] = state[inds];
			
 
				+        }
			
 
				+        //Fourth-order-accurate interpolation of the state
			
 
				+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
			
 
				+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
			
 
				+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
			
 
				+      }
			
 
				+
			
 
				+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
			
 
				+      r = vals[ID_DENS] + hy_dens_cell[k + hs];
			
 
				+      u = vals[ID_UMOM] / r;
			
 
				+      w = vals[ID_WMOM] / r;
			
 
				+      t = (vals[ID_RHOT] + hy_dens_theta_cell[k + hs]) / r;
			
 
				+      p = C0 * pow((r * t), gamm);
			
 
				+
			
 
				+      //Compute the flux vector
			
 
				+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u - hv_coef * d3_vals[ID_DENS];
			
 
				+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * u + p - hv_coef * d3_vals[ID_UMOM];
			
 
				+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * w - hv_coef * d3_vals[ID_WMOM];
			
 
				+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * t - hv_coef * d3_vals[ID_RHOT];
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /////////////////////////////////////////////////
			
 
				+  // TODO: THREAD ME
			
 
				+  /////////////////////////////////////////////////
			
 
				+  //Use the fluxes to compute tendencies for each cell
			
 
				+
			
 
				+  for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+  {
			
 
				+    for (k = 0; k < nz; k++)
			
 
				+    {
			
 
				+      for (i = 0; i < nx; i++)
			
 
				+      {
			
 
				+        indt = ll * nz * nx + k * nx + i;
			
 
				+        indf1 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i;
			
 
				+        indf2 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i + 1;
			
 
				+        tend[indt] = -(flux[indf2] - flux[indf1]) / dx;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//Compute the time tendencies of the fluid state using forcing in the z-direction
			
 
				+
			
 
				+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
			
 
				+//Then, compute the tendencies using those fluxes
			
 
				+void compute_tendencies_z(double *state, double *flux, double *tend)
			
 
				+{
			
 
				+  int i, k, ll, s, inds, indf1, indf2, indt;
			
 
				+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
			
 
				+  //Compute the hyperviscosity coeficient
			
 
				+  hv_coef = -hv_beta * dx / (16 * dt);
			
 
				+  /////////////////////////////////////////////////
			
 
				+  // TODO: THREAD ME
			
 
				+  /////////////////////////////////////////////////
			
 
				+  //Compute fluxes in the x-direction for each cell
			
 
				+
			
 
				+  for (k = 0; k < nz + 1; k++)
			
 
				+  {
			
 
				+    for (i = 0; i < nx; i++)
			
 
				+    {
			
 
				+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
			
 
				+      for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+      {
			
 
				+        for (s = 0; s < sten_size; s++)
			
 
				+        {
			
 
				+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + s) * (nx + 2 * hs) + i + hs;
			
 
				+          stencil[s] = state[inds];
			
 
				+        }
			
 
				+        //Fourth-order-accurate interpolation of the state
			
 
				+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
			
 
				+        //First-order-accurate interpolation of the third spatial derivative of the state
			
 
				+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
			
 
				+      }
			
 
				+
			
 
				+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
			
 
				+      r = vals[ID_DENS] + hy_dens_int[k];
			
 
				+      u = vals[ID_UMOM] / r;
			
 
				+      w = vals[ID_WMOM] / r;
			
 
				+      t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
			
 
				+      p = C0 * pow((r * t), gamm) - hy_pressure_int[k];
			
 
				+
			
 
				+      //Compute the flux vector with hyperviscosity
			
 
				+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w - hv_coef * d3_vals[ID_DENS];
			
 
				+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * u - hv_coef * d3_vals[ID_UMOM];
			
 
				+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * w + p - hv_coef * d3_vals[ID_WMOM];
			
 
				+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * t - hv_coef * d3_vals[ID_RHOT];
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /////////////////////////////////////////////////
			
 
				+  // TODO: THREAD ME
			
 
				+  /////////////////////////////////////////////////
			
 
				+  //Use the fluxes to compute tendencies for each cell
			
 
				+
			
 
				+  for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+  {
			
 
				+    for (k = 0; k < nz; k++)
			
 
				+    {
			
 
				+      for (i = 0; i < nx; i++)
			
 
				+      {
			
 
				+        indt = ll * nz * nx + k * nx + i;
			
 
				+        indf1 = ll * (nz + 1) * (nx + 1) + (k) * (nx + 1) + i;
			
 
				+        indf2 = ll * (nz + 1) * (nx + 1) + (k + 1) * (nx + 1) + i;
			
 
				+        tend[indt] = -(flux[indf2] - flux[indf1]) / dz;
			
 
				+        if (ll == ID_WMOM)
			
 
				+        {
			
 
				+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
			
 
				+          tend[indt] = tend[indt] - state[inds] * grav;
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void set_halo_values_x(double *state)
			
 
				+{
			
 
				+  int k, ll, ind_r, ind_u, ind_t, i;
			
 
				+  double z;
			
 
				+
			
 
				+  for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+  {
			
 
				+    for (k = 0; k < nz; k++)
			
 
				+    {
			
 
				+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + 0] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs - 2];
			
 
				+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + 1] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs - 1];
			
 
				+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + hs];
			
 
				+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs + 1] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + hs + 1];
			
 
				+    }
			
 
				+  }
			
 
				+  ////////////////////////////////////////////////////
			
 
				+
			
 
				+  if (myrank == 0)
			
 
				+  {
			
 
				+    for (k = 0; k < nz; k++)
			
 
				+    {
			
 
				+      for (i = 0; i < hs; i++)
			
 
				+      {
			
 
				+        z = (k_beg + k + 0.5) * dz;
			
 
				+        if (abs(z - 3 * zlen / 4) <= zlen / 16)
			
 
				+        {
			
 
				+          ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
			
 
				+          ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
			
 
				+          ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
			
 
				+          state[ind_u] = (state[ind_r] + hy_dens_cell[k + hs]) * 50.;
			
 
				+          state[ind_t] = (state[ind_r] + hy_dens_cell[k + hs]) * 298. - hy_dens_theta_cell[k + hs];
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//Set this task's halo values in the z-direction.
			
 
				+//decomposition in the vertical direction.
			
 
				+void set_halo_values_z(double *state)
			
 
				+{
			
 
				+  int i, ll;
			
 
				+  const double mnt_width = xlen / 8;
			
 
				+  double x, xloc, mnt_deriv;
			
 
				+  /////////////////////////////////////////////////
			
 
				+  // TODO: THREAD ME
			
 
				+  /////////////////////////////////////////////////
			
 
				+
			
 
				+  for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+  {
			
 
				+    for (i = 0; i < nx + 2 * hs; i++)
			
 
				+    {
			
 
				+      if (ll == ID_WMOM)
			
 
				+      {
			
 
				+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (0) * (nx + 2 * hs) + i] = 0.;
			
 
				+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (1) * (nx + 2 * hs) + i] = 0.;
			
 
				+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs) * (nx + 2 * hs) + i] = 0.;
			
 
				+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs + 1) * (nx + 2 * hs) + i] = 0.;
			
 
				+      }
			
 
				+      else
			
 
				+      {
			
 
				+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (0) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (hs) * (nx + 2 * hs) + i];
			
 
				+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (1) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (hs) * (nx + 2 * hs) + i];
			
 
				+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs - 1) * (nx + 2 * hs) + i];
			
 
				+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs + 1) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs - 1) * (nx + 2 * hs) + i];
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void init()
			
 
				+{
			
 
				+  int i, k, ii, kk, ll, inds, i_end;
			
 
				+  double x, z, r, u, w, t, hr, ht, nper;
			
 
				+
			
 
				+  //Set the cell grid size
			
 
				+  dx = xlen / nx_glob;
			
 
				+  dz = zlen / nz_glob;
			
 
				+
			
 
				+  nranks = 1;
			
 
				+  myrank = 0;
			
 
				+
			
 
				+  // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0;
			
 
				+
			
 
				+  nper = ((double)nx_glob) / nranks;
			
 
				+  i_beg = round(nper * (myrank));
			
 
				+  i_end = round(nper * ((myrank) + 1)) - 1;
			
 
				+  nx = i_end - i_beg + 1;
			
 
				+  left_rank = myrank - 1;
			
 
				+  if (left_rank == -1)
			
 
				+    left_rank = nranks - 1;
			
 
				+  right_rank = myrank + 1;
			
 
				+  if (right_rank == nranks)
			
 
				+    right_rank = 0;
			
 
				+
			
 
				+  ////////////////////////////////////////////////////////////////////////////////
			
 
				+  ////////////////////////////////////////////////////////////////////////////////
			
 
				+  // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE
			
 
				+  ////////////////////////////////////////////////////////////////////////////////
			
 
				+  ////////////////////////////////////////////////////////////////////////////////
			
 
				+
			
 
				+  k_beg = 0;
			
 
				+  nz = nz_glob;
			
 
				+
			
 
				+  //Allocate the model data
			
 
				+  state = (double *)malloc((nx + 2 * hs) * (nz + 2 * hs) * NUM_VARS * sizeof(double));
			
 
				+  state_tmp = (double *)malloc((nx + 2 * hs) * (nz + 2 * hs) * NUM_VARS * sizeof(double));
			
 
				+  flux = (double *)malloc((nx + 1) * (nz + 1) * NUM_VARS * sizeof(double));
			
 
				+  tend = (double *)malloc(nx * nz * NUM_VARS * sizeof(double));
			
 
				+  hy_dens_cell = (double *)malloc((nz + 2 * hs) * sizeof(double));
			
 
				+  hy_dens_theta_cell = (double *)malloc((nz + 2 * hs) * sizeof(double));
			
 
				+  hy_dens_int = (double *)malloc((nz + 1) * sizeof(double));
			
 
				+  hy_dens_theta_int = (double *)malloc((nz + 1) * sizeof(double));
			
 
				+  hy_pressure_int = (double *)malloc((nz + 1) * sizeof(double));
			
 
				+
			
 
				+  //Define the maximum stable time step based on an assumed maximum wind speed
			
 
				+  dt = dmin(dx, dz) / max_speed * cfl;
			
 
				+  //Set initial elapsed model time and output_counter to zero
			
 
				+  etime = 0.;
			
 
				+  output_counter = 0.;
			
 
				+
			
 
				+  // Display grid information
			
 
				+
			
 
				+  printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
			
 
				+  printf("dx,dz: %lf %lf\n", dx, dz);
			
 
				+  printf("dt: %lf\n", dt);
			
 
				+
			
 
				+  //////////////////////////////////////////////////////////////////////////
			
 
				+  // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
			
 
				+  //////////////////////////////////////////////////////////////////////////
			
 
				+  for (k = 0; k < nz + 2 * hs; k++)
			
 
				+  {
			
 
				+    for (i = 0; i < nx + 2 * hs; i++)
			
 
				+    {
			
 
				+      //Initialize the state to zero
			
 
				+      for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+      {
			
 
				+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
			
 
				+        state[inds] = 0.;
			
 
				+      }
			
 
				+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
			
 
				+      for (kk = 0; kk < nqpoints; kk++)
			
 
				+      {
			
 
				+        for (ii = 0; ii < nqpoints; ii++)
			
 
				+        {
			
 
				+          //Compute the x,z location within the global domain based on cell and quadrature index
			
 
				+          x = (i_beg + i - hs + 0.5) * dx + (qpoints[ii] - 0.5) * dx;
			
 
				+          z = (k_beg + k - hs + 0.5) * dz + (qpoints[kk] - 0.5) * dz;
			
 
				+
			
 
				+          //Set the fluid state based on the user's specification (default is injection in this example)
			
 
				+          injection(x, z, r, u, w, t, hr, ht);
			
 
				+
			
 
				+          //Store into the fluid state array
			
 
				+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
			
 
				+          state[inds] = state[inds] + r * qweights[ii] * qweights[kk];
			
 
				+          inds = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
			
 
				+          state[inds] = state[inds] + (r + hr) * u * qweights[ii] * qweights[kk];
			
 
				+          inds = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
			
 
				+          state[inds] = state[inds] + (r + hr) * w * qweights[ii] * qweights[kk];
			
 
				+          inds = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
			
 
				+          state[inds] = state[inds] + ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk];
			
 
				+        }
			
 
				+      }
			
 
				+      for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+      {
			
 
				+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
			
 
				+        state_tmp[inds] = state[inds];
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+  //Compute the hydrostatic background state over vertical cell averages
			
 
				+  for (k = 0; k < nz + 2 * hs; k++)
			
 
				+  {
			
 
				+    hy_dens_cell[k] = 0.;
			
 
				+    hy_dens_theta_cell[k] = 0.;
			
 
				+    for (kk = 0; kk < nqpoints; kk++)
			
 
				+    {
			
 
				+      z = (k_beg + k - hs + 0.5) * dz;
			
 
				+
			
 
				+      //Set the fluid state based on the user's specification (default is injection in this example)
			
 
				+      injection(0., z, r, u, w, t, hr, ht);
			
 
				+
			
 
				+      hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
			
 
				+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr * ht * qweights[kk];
			
 
				+    }
			
 
				+  }
			
 
				+  //Compute the hydrostatic background state at vertical cell interfaces
			
 
				+  for (k = 0; k < nz + 1; k++)
			
 
				+  {
			
 
				+    z = (k_beg + k) * dz;
			
 
				+
			
 
				+    //Set the fluid state based on the user's specification (default is injection in this example)
			
 
				+    injection(0., z, r, u, w, t, hr, ht);
			
 
				+
			
 
				+    hy_dens_int[k] = hr;
			
 
				+    hy_dens_theta_int[k] = hr * ht;
			
 
				+    hy_pressure_int[k] = C0 * pow((hr * ht), gamm);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//This test case is initially balanced but injects fast, cold air from the left boundary near the model top
			
 
				+//x and z are input coordinates at which to sample
			
 
				+//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
			
 
				+//hr and ht are output background hydrostatic density and potential temperature at that location
			
 
				+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht)
			
 
				+{
			
 
				+  hydro_const_theta(z, hr, ht);
			
 
				+  r = 0.;
			
 
				+  t = 0.;
			
 
				+  u = 0.;
			
 
				+  w = 0.;
			
 
				+}
			
 
				+
			
 
				+//Establish hydrstatic balance using constant potential temperature (thermally neutral atmosphere)
			
 
				+//z is the input coordinate
			
 
				+//r and t are the output background hydrostatic density and potential temperature
			
 
				+void hydro_const_theta(double z, double &r, double &t)
			
 
				+{
			
 
				+  const double theta0 = 300.; //Background potential temperature
			
 
				+  const double exner0 = 1.;   //Surface-level Exner pressure
			
 
				+  double p, exner, rt;
			
 
				+  //Establish hydrostatic balance first using Exner pressure
			
 
				+  t = theta0;                                //Potential Temperature at z
			
 
				+  exner = exner0 - grav * z / (cp * theta0); //Exner pressure at z
			
 
				+  p = p0 * pow(exner, (cp / rd));            //Pressure at z
			
 
				+  rt = pow((p / C0), (1. / gamm));           //rho*theta at z
			
 
				+  r = rt / t;                                //Density at z
			
 
				+}
			
 
				+
			
 
				+//Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime)
			
 
				+//The file I/O uses netcdf, the only external library required for this mini-app.
			
 
				+//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics
			
 
				+void output(double *state, double etime)
			
 
				+{
			
 
				+  int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3];
			
 
				+  int i, k, ind_r, ind_u, ind_w, ind_t;
			
 
				+
			
 
				+  size_t st1[1], ct1[1], st3[3], ct3[3];
			
 
				+
			
 
				+  //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta)
			
 
				+  double *dens, *uwnd, *wwnd, *theta;
			
 
				+  double *etimearr;
			
 
				+  //Inform the user
			
 
				+
			
 
				+  printf("*** OUTPUT ***\n");
			
 
				+
			
 
				+  //Allocate some (big) temp arrays
			
 
				+  dens = (double *)malloc(nx * nz * sizeof(double));
			
 
				+  uwnd = (double *)malloc(nx * nz * sizeof(double));
			
 
				+  wwnd = (double *)malloc(nx * nz * sizeof(double));
			
 
				+  theta = (double *)malloc(nx * nz * sizeof(double));
			
 
				+  etimearr = (double *)malloc(1 * sizeof(double));
			
 
				+
			
 
				+  //If the elapsed time is zero, create the file. Otherwise, open the file
			
 
				+  if (etime == 0)
			
 
				+  {
			
 
				+    //Create the file
			
 
				+    ncwrap(nc_create("reference.nc", NC_CLOBBER, &ncid), __LINE__);
			
 
				+
			
 
				+    //Create the dimensions
			
 
				+    ncwrap(nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid), __LINE__);
			
 
				+    ncwrap(nc_def_dim(ncid, "x", nx_glob, &x_dimid), __LINE__);
			
 
				+    ncwrap(nc_def_dim(ncid, "z", nz_glob, &z_dimid), __LINE__);
			
 
				+
			
 
				+    //Create the variables
			
 
				+    dimids[0] = t_dimid;
			
 
				+    ncwrap(nc_def_var(ncid, "t", NC_DOUBLE, 1, dimids, &t_varid), __LINE__);
			
 
				+
			
 
				+    dimids[0] = t_dimid;
			
 
				+    dimids[1] = z_dimid;
			
 
				+    dimids[2] = x_dimid;
			
 
				+
			
 
				+    ncwrap(nc_def_var(ncid, "dens", NC_DOUBLE, 3, dimids, &dens_varid), __LINE__);
			
 
				+    ncwrap(nc_def_var(ncid, "uwnd", NC_DOUBLE, 3, dimids, &uwnd_varid), __LINE__);
			
 
				+    ncwrap(nc_def_var(ncid, "wwnd", NC_DOUBLE, 3, dimids, &wwnd_varid), __LINE__);
			
 
				+    ncwrap(nc_def_var(ncid, "theta", NC_DOUBLE, 3, dimids, &theta_varid), __LINE__);
			
 
				+
			
 
				+    //End "define" mode
			
 
				+    ncwrap(nc_enddef(ncid), __LINE__);
			
 
				+  }
			
 
				+  else
			
 
				+  {
			
 
				+    //Open the file
			
 
				+    ncwrap(nc_open("reference.nc", NC_WRITE, &ncid), __LINE__);
			
 
				+
			
 
				+    //Get the variable IDs
			
 
				+    ncwrap(nc_inq_varid(ncid, "dens", &dens_varid), __LINE__);
			
 
				+    ncwrap(nc_inq_varid(ncid, "uwnd", &uwnd_varid), __LINE__);
			
 
				+    ncwrap(nc_inq_varid(ncid, "wwnd", &wwnd_varid), __LINE__);
			
 
				+    ncwrap(nc_inq_varid(ncid, "theta", &theta_varid), __LINE__);
			
 
				+    ncwrap(nc_inq_varid(ncid, "t", &t_varid), __LINE__);
			
 
				+  }
			
 
				+
			
 
				+  //Store perturbed values in the temp arrays for output
			
 
				+  for (k = 0; k < nz; k++)
			
 
				+  {
			
 
				+    for (i = 0; i < nx; i++)
			
 
				+    {
			
 
				+      ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
			
 
				+      ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
			
 
				+      ind_w = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
			
 
				+      ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
			
 
				+      dens[k * nx + i] = state[ind_r];
			
 
				+      uwnd[k * nx + i] = state[ind_u] / (hy_dens_cell[k + hs] + state[ind_r]);
			
 
				+      wwnd[k * nx + i] = state[ind_w] / (hy_dens_cell[k + hs] + state[ind_r]);
			
 
				+      theta[k * nx + i] = (state[ind_t] + hy_dens_theta_cell[k + hs]) / (hy_dens_cell[k + hs] + state[ind_r]) - hy_dens_theta_cell[k + hs] / hy_dens_cell[k + hs];
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  //Write the grid data to file with all the processes writing collectively
			
 
				+  st3[0] = num_out;
			
 
				+  st3[1] = k_beg;
			
 
				+  st3[2] = i_beg;
			
 
				+  ct3[0] = 1;
			
 
				+  ct3[1] = nz;
			
 
				+  ct3[2] = nx;
			
 
				+
			
 
				+  ncwrap(nc_put_vara_double(ncid, dens_varid, st3, ct3, dens), __LINE__);
			
 
				+  ncwrap(nc_put_vara_double(ncid, uwnd_varid, st3, ct3, uwnd), __LINE__);
			
 
				+  ncwrap(nc_put_vara_double(ncid, wwnd_varid, st3, ct3, wwnd), __LINE__);
			
 
				+  ncwrap(nc_put_vara_double(ncid, theta_varid, st3, ct3, theta), __LINE__);
			
 
				+
			
 
				+  //Only the master process needs to write the elapsed time
			
 
				+  //write elapsed time to file
			
 
				+
			
 
				+  st1[0] = num_out;
			
 
				+  ct1[0] = 1;
			
 
				+  etimearr[0] = etime;
			
 
				+  ncwrap(nc_put_vara_double(ncid, t_varid, st1, ct1, etimearr), __LINE__);
			
 
				+
			
 
				+  //Close the file
			
 
				+  ncwrap(nc_close(ncid), __LINE__);
			
 
				+
			
 
				+  //Increment the number of outputs
			
 
				+  num_out = num_out + 1;
			
 
				+
			
 
				+  //Deallocate the temp arrays
			
 
				+  free(dens);
			
 
				+  free(uwnd);
			
 
				+  free(wwnd);
			
 
				+  free(theta);
			
 
				+  free(etimearr);
			
 
				+}
			
 
				+
			
 
				+//Error reporting routine for the NetCDF I/O
			
 
				+void ncwrap(int ierr, int line)
			
 
				+{
			
 
				+  if (ierr != NC_NOERR)
			
 
				+  {
			
 
				+    printf("NetCDF Error at line: %d\n", line);
			
 
				+    printf("%s\n", nc_strerror(ierr));
			
 
				+    exit(-1);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void finalize()
			
 
				+{
			
 
				+  free(state);
			
 
				+  free(state_tmp);
			
 
				+  free(flux);
			
 
				+  free(tend);
			
 
				+  free(hy_dens_cell);
			
 
				+  free(hy_dens_theta_cell);
			
 
				+  free(hy_dens_int);
			
 
				+  free(hy_dens_theta_int);
			
 
				+  free(hy_pressure_int);
			
 
				+}
			
--- a/hpc/miniprofiler/English/C/source_code/lab2/.ipynb_checkpoints/profiling-c-lab2-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/source_code/lab2/.ipynb_checkpoints/profiling-c-lab2-checkpoint.ipynb
--- a/hpc/miniprofiler/English/C/source_code/lab2/Makefile
+++ b/hpc/miniprofiler/English/C/source_code/lab2/Makefile
@@ -0,0 +1,10 @@
 
				# Build rules for the OpenACC miniWeather lab.
CC := pgc++
CFLAGS := -O3 -w
ACCFLAGS := -ta=tesla:managed -Minfo=accel
LDFLAGS := -lnetcdf

# Default goal: build the executable from the single source file.
miniWeather: miniWeather_openacc.cpp
	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp ${LDFLAGS}

# "clean" names no file, so declare it phony; otherwise a file called "clean"
# in this directory would make the rule appear up to date and never run.
.PHONY: clean
clean:
	rm -f *.o miniWeather
			
--- a/hpc/miniprofiler/English/C/source_code/lab2/miniWeather_openacc.cpp
+++ b/hpc/miniprofiler/English/C/source_code/lab2/miniWeather_openacc.cpp
@@ -0,0 +1,766 @@
 
				+//////////////////////////////////////////////////////////////////////////////////////////
			
 
				+// miniWeather
			
 
				+// Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
			
 
				+// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
			
 
				+// For documentation, please see the attached documentation in the "documentation" folder
			
 
				+//////////////////////////////////////////////////////////////////////////////////////////
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <math.h>
			
 
				+#include <stdio.h>
			
 
				+#include <netcdf.h>
			
 
				+#include <nvtx3/nvToolsExt.h>
			
 
				+
			
 
				///////////////////////////////////////////////////////////////////////////////////////
				// Physical constants
				///////////////////////////////////////////////////////////////////////////////////////
				const double pi = 3.14159265358979323846264338327;   //Pi
				const double grav = 9.8;                             //Gravitational acceleration (m / s^2)
				const double cp = 1004.;                             //Specific heat of dry air at constant pressure
				const double rd = 287.;                              //Dry air constant for equation of state (P=rho*rd*T)
				const double p0 = 1.e5;                              //Standard pressure at the surface in Pascals
				const double C0 = 27.5629410929725921310572974482;   //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma)
				const double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd (named "gamm" because "gamma" is already taken)

				///////////////////////////////////////////////////////////////////////////////////////
				// Domain and stability-related constants
				///////////////////////////////////////////////////////////////////////////////////////
				const double xlen = 2.e4;     //Length of the domain in the x-direction (meters)
				const double zlen = 1.e4;     //Length of the domain in the z-direction (meters)
				const double hv_beta = 0.25;  //How strong to diffuse the solution: hv_beta \in [0:1]
				const double cfl = 1.50;      //"Courant, Friedrichs, Lewy" number (for numerical stability)
				const double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
				const int hs = 2;             //"Halo" size: number of cells needed for a full "stencil" of information for reconstruction
				const int sten_size = 4;      //Size of the stencil used for interpolation

				///////////////////////////////////////////////////////////////////////////////////////
				// Parameters for indexing and flags
				///////////////////////////////////////////////////////////////////////////////////////
				const int NUM_VARS = 4; //Number of fluid state variables
				const int ID_DENS = 0;  //index for density ("rho")
				const int ID_UMOM = 1;  //index for momentum in the x-direction ("rho * u")
				const int ID_WMOM = 2;  //index for momentum in the z-direction ("rho * w")
				const int ID_RHOT = 3;  //index for density * potential temperature ("rho * theta")
				const int DIR_X = 1;    //Integer constant to express that this operation is in the x-direction
				const int DIR_Z = 2;    //Integer constant to express that this operation is in the z-direction

				///////////////////////////////////////////////////////////////////////////////////////
				// Quadrature points and weights (nqpoints entries each)
				///////////////////////////////////////////////////////////////////////////////////////
				const int nqpoints = 3;
				double qpoints[] = {0.112701665379258311482073460022E0, 0.500000000000000000000000000000E0, 0.887298334620741688517926539980E0};
				double qweights[] = {0.277777777777777777777777777779E0, 0.444444444444444444444444444444E0, 0.277777777777777777777777777779E0};

				///////////////////////////////////////////////////////////////////////////////////////
				// Variables that are initialized but remain static over the course of the simulation
				///////////////////////////////////////////////////////////////////////////////////////
				double sim_time;            //total simulation time in seconds
				double output_freq;         //frequency to perform output in seconds
				double dt;                  //Model time step (seconds)
				int nx;                     //Number of local grid cells in the x-dimension
				int nz;                     //Number of local grid cells in the z-dimension
				double dx;                  //Grid space length in the x-dimension (meters)
				double dz;                  //Grid space length in the z-dimension (meters)
				int nx_glob;                //Number of total grid cells in the x-dimension
				int nz_glob;                //Number of total grid cells in the z-dimension
				int i_beg;                  //beginning index in the x-direction
				int k_beg;                  //beginning index in the z-direction
				int nranks;                 //number of ranks
				int myrank;                 //my rank id
				int left_rank;              //Rank ID that exists to my left in the global domain
				int right_rank;             //Rank ID that exists to my right in the global domain
				double *hy_dens_cell;       //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
				double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
				double *hy_dens_int;        //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
				double *hy_dens_theta_int;  //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
				double *hy_pressure_int;    //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)

				///////////////////////////////////////////////////////////////////////////////////////
				// Variables that are dynamic over the course of the simulation
				///////////////////////////////////////////////////////////////////////////////////////
				double etime;          //Elapsed model time
				double output_counter; //Helps determine when it's time to do output

				//Runtime variable arrays
				double *state;     //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
				double *state_tmp; //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
				double *flux;      //Cell interface fluxes.   Dimensions: (nx+1,nz+1,NUM_VARS)
				double *tend;      //Fluid state tendencies.  Dimensions: (nx,nz,NUM_VARS)

				int num_out = 0;          //The number of outputs performed so far
				int direction_switch = 1; //Nonzero: the next time step sweeps x before z; zero: z before x
			
 
				+
			
 
				//Minimum of two doubles: returns a when a < b, and b in every other case
				double dmin(double a, double b)
				{
				  return (a < b) ? a : b;
				}
			
 
				+
			
 
				//Forward declarations of the functions defined after "main"

				//Setup and teardown
				void init();
				void finalize();

				//Initial-condition samplers
				void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht);
				void hydro_const_theta(double z, double &r, double &t);

				//NetCDF output and its error reporting
				void output(double *state, double etime);
				void ncwrap(int ierr, int line);

				//Time integration
				void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt);
				void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend);

				//Tendencies and halo updates, one routine per direction
				void compute_tendencies_x(double *state, double *flux, double *tend);
				void compute_tendencies_z(double *state, double *flux, double *tend);
				void set_halo_values_x(double *state);
				void set_halo_values_z(double *state);
			
 
				+
			
 
				+///////////////////////////////////////////////////////////////////////////////////////
			
 
				+// THE MAIN PROGRAM STARTS HERE
			
 
				+///////////////////////////////////////////////////////////////////////////////////////
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+  ///////////////////////////////////////////////////////////////////////////////////////
			
 
				+  // BEGIN USER-CONFIGURABLE PARAMETERS
			
 
				+  ///////////////////////////////////////////////////////////////////////////////////////
			
 
				+  //The x-direction length is twice as long as the z-direction length
			
 
				+  //So, you'll want to have nx_glob be twice as large as nz_glob
			
 
				+  nx_glob = 400;     //Number of total cells in the x-dirction
			
 
				+  nz_glob = 200;     //Number of total cells in the z-dirction
			
 
				+  sim_time = 1500;   //How many seconds to run the simulation
			
 
				+  output_freq = 100; //How frequently to output data to file (in seconds)
			
 
				+  ///////////////////////////////////////////////////////////////////////////////////////
			
 
				+  // END USER-CONFIGURABLE PARAMETERS
			
 
				+  ///////////////////////////////////////////////////////////////////////////////////////
			
 
				+
			
 
				+  if (argc == 4)
			
 
				+  {
			
 
				+    printf("The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]);
			
 
				+    nx_glob = atoi(argv[1]);
			
 
				+    nz_glob = atoi(argv[2]);
			
 
				+    sim_time = atoi(argv[3]);
			
 
				+  }
			
 
				+  else
			
 
				+  {
			
 
				+    printf("Using default values ...\n");
			
 
				+  }
			
 
				+
			
 
				+  nvtxRangePushA("Total");
			
 
				+  init();
			
 
				+
			
 
				+  //Output the initial state
			
 
				+  output(state, etime);
			
 
				+
			
 
				+  ////////////////////////////////////////////////////
			
 
				+  // MAIN TIME STEP LOOP
			
 
				+  ////////////////////////////////////////////////////
			
 
				+
			
 
				+  nvtxRangePushA("while");
			
 
				+  while (etime < sim_time)
			
 
				+  {
			
 
				+    //If the time step leads to exceeding the simulation time, shorten it for the last step
			
 
				+    if (etime + dt > sim_time)
			
 
				+    {
			
 
				+      dt = sim_time - etime;
			
 
				+    }
			
 
				+
			
 
				+    //Perform a single time step
			
 
				+    nvtxRangePushA("perform_timestep");
			
 
				+    perform_timestep(state, state_tmp, flux, tend, dt);
			
 
				+    nvtxRangePop();
			
 
				+
			
 
				+    //Inform the user
			
 
				+
			
 
				+    printf("Elapsed Time: %lf / %lf\n", etime, sim_time);
			
 
				+
			
 
				+    //Update the elapsed time and output counter
			
 
				+    etime = etime + dt;
			
 
				+    output_counter = output_counter + dt;
			
 
				+    //If it's time for output, reset the counter, and do output
			
 
				+
			
 
				+    if (output_counter >= output_freq)
			
 
				+    {
			
 
				+      output_counter = output_counter - output_freq;
			
 
				+
			
 
				+      output(state, etime);
			
 
				+    }
			
 
				+  }
			
 
				+  nvtxRangePop();
			
 
				+
			
 
				+  finalize();
			
 
				+  nvtxRangePop();
			
 
				+}
			
 
				+
			
 
				+//Performs a single dimensionally split time step using a simple low-storate three-stage Runge-Kutta time integrator
			
 
				+//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the
			
 
				+//order of directions is alternated each time step.
			
 
				+//The Runge-Kutta method used here is defined as follows:
			
 
				+// q*     = q[n] + dt/3 * rhs(q[n])
			
 
				+// q**    = q[n] + dt/2 * rhs(q*  )
			
 
				+// q[n+1] = q[n] + dt/1 * rhs(q** )
			
 
				+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt)
			
 
				+{
			
 
				+  if (direction_switch)
			
 
				+  {
			
 
				+    //x-direction first
			
 
				+    semi_discrete_step(state, state, state_tmp, dt / 3, DIR_X, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state, dt / 1, DIR_X, flux, tend);
			
 
				+    //z-direction second
			
 
				+    semi_discrete_step(state, state, state_tmp, dt / 3, DIR_Z, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state, dt / 1, DIR_Z, flux, tend);
			
 
				+  }
			
 
				+  else
			
 
				+  {
			
 
				+    //z-direction second
			
 
				+    semi_discrete_step(state, state, state_tmp, dt / 3, DIR_Z, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state, dt / 1, DIR_Z, flux, tend);
			
 
				+    //x-direction first
			
 
				+    semi_discrete_step(state, state, state_tmp, dt / 3, DIR_X, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend);
			
 
				+    semi_discrete_step(state, state_tmp, state, dt / 1, DIR_X, flux, tend);
			
 
				+  }
			
 
				+  if (direction_switch)
			
 
				+  {
			
 
				+    direction_switch = 0;
			
 
				+  }
			
 
				+  else
			
 
				+  {
			
 
				+    direction_switch = 1;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//Perform a single semi-discretized step in time with the form:
			
 
				+//state_out = state_init + dt * rhs(state_forcing)
			
 
				+//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out
			
 
				+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend)
			
 
				+{
			
 
				+  int i, k, ll, inds, indt;
			
 
				+  if (dir == DIR_X)
			
 
				+  {
			
 
				+    //Set the halo values  in the x-direction
			
 
				+    set_halo_values_x(state_forcing);
			
 
				+    //Compute the time tendencies for the fluid state in the x-direction
			
 
				+    compute_tendencies_x(state_forcing, flux, tend);
			
 
				+  }
			
 
				+  else if (dir == DIR_Z)
			
 
				+  {
			
 
				+    //Set the halo values  in the z-direction
			
 
				+    set_halo_values_z(state_forcing);
			
 
				+    //Compute the time tendencies for the fluid state in the z-direction
			
 
				+    compute_tendencies_z(state_forcing, flux, tend);
			
 
				+  }
			
 
				+
			
 
				+/////////////////////////////////////////////////
			
 
				+// TODO: THREAD ME
			
 
				+/////////////////////////////////////////////////
			
 
				+//Apply the tendencies to the fluid state
			
 
				+#pragma acc parallel loop private(inds, indt) default(present)
			
 
				+  for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+  {
			
 
				+    for (k = 0; k < nz; k++)
			
 
				+    {
			
 
				+      for (i = 0; i < nx; i++)
			
 
				+      {
			
 
				+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
			
 
				+        indt = ll * nz * nx + k * nx + i;
			
 
				+        state_out[inds] = state_init[inds] + dt * tend[indt];
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//Compute the time tendencies of the fluid state using forcing in the x-direction
			
 
				+
			
 
				+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
			
 
				+//Then, compute the tendencies using those fluxes
			
 
				+void compute_tendencies_x(double *state, double *flux, double *tend)
			
 
				+{
			
 
				+  int i, k, ll, s, inds, indf1, indf2, indt;
			
 
				+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
			
 
				+  //Compute the hyperviscosity coeficient
			
 
				+  hv_coef = -hv_beta * dx / (16 * dt);
			
 
				+  /////////////////////////////////////////////////
			
 
				+  // TODO: THREAD ME
			
 
				+  /////////////////////////////////////////////////
			
 
				+  //Compute fluxes in the x-direction for each cell
			
 
				+#pragma acc parallel loop private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p)
			
 
				+  for (k = 0; k < nz; k++)
			
 
				+  {
			
 
				+    for (i = 0; i < nx + 1; i++)
			
 
				+    {
			
 
				+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
			
 
				+      for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+      {
			
 
				+        for (s = 0; s < sten_size; s++)
			
 
				+        {
			
 
				+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + s;
			
 
				+          stencil[s] = state[inds];
			
 
				+        }
			
 
				+        //Fourth-order-accurate interpolation of the state
			
 
				+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
			
 
				+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
			
 
				+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
			
 
				+      }
			
 
				+
			
 
				+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
			
 
				+      r = vals[ID_DENS] + hy_dens_cell[k + hs];
			
 
				+      u = vals[ID_UMOM] / r;
			
 
				+      w = vals[ID_WMOM] / r;
			
 
				+      t = (vals[ID_RHOT] + hy_dens_theta_cell[k + hs]) / r;
			
 
				+      p = C0 * pow((r * t), gamm);
			
 
				+
			
 
				+      //Compute the flux vector
			
 
				+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u - hv_coef * d3_vals[ID_DENS];
			
 
				+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * u + p - hv_coef * d3_vals[ID_UMOM];
			
 
				+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * w - hv_coef * d3_vals[ID_WMOM];
			
 
				+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * t - hv_coef * d3_vals[ID_RHOT];
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+/////////////////////////////////////////////////
			
 
				+// TODO: THREAD ME
			
 
				+/////////////////////////////////////////////////
			
 
				+//Use the fluxes to compute tendencies for each cell
			
 
				+#pragma acc parallel loop private(indt, indf1, indf2)
			
 
				+  for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+  {
			
 
				+    for (k = 0; k < nz; k++)
			
 
				+    {
			
 
				+      for (i = 0; i < nx; i++)
			
 
				+      {
			
 
				+        indt = ll * nz * nx + k * nx + i;
			
 
				+        indf1 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i;
			
 
				+        indf2 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i + 1;
			
 
				+        tend[indt] = -(flux[indf2] - flux[indf1]) / dx;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//Compute the time tendencies of the fluid state using forcing in the z-direction
			
 
				+
			
 
				+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
			
 
				+//Then, compute the tendencies using those fluxes
			
 
				+void compute_tendencies_z(double *state, double *flux, double *tend)
			
 
				+{
			
 
				+  int i, k, ll, s, inds, indf1, indf2, indt;
			
 
				+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
			
 
				+  //Compute the hyperviscosity coeficient
			
 
				+  hv_coef = -hv_beta * dx / (16 * dt);
			
 
				+/////////////////////////////////////////////////
			
 
				+// TODO: THREAD ME
			
 
				+/////////////////////////////////////////////////
			
 
				+//Compute fluxes in the x-direction for each cell
			
 
				+#pragma acc parallel loop private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p)
			
 
				+  for (k = 0; k < nz + 1; k++)
			
 
				+  {
			
 
				+    for (i = 0; i < nx; i++)
			
 
				+    {
			
 
				+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
			
 
				+      for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+      {
			
 
				+        for (s = 0; s < sten_size; s++)
			
 
				+        {
			
 
				+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + s) * (nx + 2 * hs) + i + hs;
			
 
				+          stencil[s] = state[inds];
			
 
				+        }
			
 
				+        //Fourth-order-accurate interpolation of the state
			
 
				+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
			
 
				+        //First-order-accurate interpolation of the third spatial derivative of the state
			
 
				+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
			
 
				+      }
			
 
				+
			
 
				+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
			
 
				+      r = vals[ID_DENS] + hy_dens_int[k];
			
 
				+      u = vals[ID_UMOM] / r;
			
 
				+      w = vals[ID_WMOM] / r;
			
 
				+      t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
			
 
				+      p = C0 * pow((r * t), gamm) - hy_pressure_int[k];
			
 
				+
			
 
				+      //Compute the flux vector with hyperviscosity
			
 
				+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w - hv_coef * d3_vals[ID_DENS];
			
 
				+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * u - hv_coef * d3_vals[ID_UMOM];
			
 
				+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * w + p - hv_coef * d3_vals[ID_WMOM];
			
 
				+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * t - hv_coef * d3_vals[ID_RHOT];
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+/////////////////////////////////////////////////
			
 
				+// TODO: THREAD ME
			
 
				+/////////////////////////////////////////////////
			
 
				+//Use the fluxes to compute tendencies for each cell
			
 
				+#pragma acc parallel loop private(indt, indf1, indf2)
			
 
				+  for (ll = 0; ll < NUM_VARS; ll++)
			
 
				+  {
			
 
				+    for (k = 0; k < nz; k++)
			
 
				+    {
			
 
				+      for (i = 0; i < nx; i++)
			
 
				+      {
			
 
				+        indt = ll * nz * nx + k * nx + i;
			
 
				+        indf1 = ll * (nz + 1) * (nx + 1) + (k) * (nx + 1) + i;
			
 
				+        indf2 = ll * (nz + 1) * (nx + 1) + (k + 1) * (nx + 1) + i;
			
 
				+        tend[indt] = -(flux[indf2] - flux[indf1]) / dz;
			
 
				+        if (ll == ID_WMOM)
			
 
				+        {
			
 
				+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
			
 
				+          tend[indt] = tend[indt] - state[inds] * grav;
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+//Fill the x-direction halo columns of `state`.
				+//First the halo is filled periodically from the opposite side of the interior,
				+//then, on rank 0 only, the left halo is overwritten inside a band of heights
				+//centred on 3*zlen/4 (half-width zlen/16) with fixed momentum and rho*theta values.
				+//state is a flattened array indexed as [variable][z row incl. halo][x column incl. halo].
				+void set_halo_values_x(double *state)
				+{
				+  int k, ll, ind_r, ind_u, ind_t, i;
				+  double z;
				+
				+  //Periodic wrap. Columns 0 and 1 receive the last two interior columns and
				+  //columns nx+hs and nx+hs+1 receive the first two interior columns, so the
				+  //whole halo is covered only when hs == 2.
				+#pragma acc parallel loop
				+  for (ll = 0; ll < NUM_VARS; ll++)
				+  {
				+    for (k = 0; k < nz; k++)
				+    {
				+      //Offset of column 0 in interior row k of variable ll
				+      const int row = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs);
				+      state[row + 0] = state[row + nx + hs - 2];
				+      state[row + 1] = state[row + nx + hs - 1];
				+      state[row + nx + hs] = state[row + hs];
				+      state[row + nx + hs + 1] = state[row + hs + 1];
				+    }
				+  }
				+  ////////////////////////////////////////////////////
				+
				+  if (myrank == 0)
				+  {
				+    for (k = 0; k < nz; k++)
				+    {
				+      //The height depends only on k, so it is evaluated once per row
				+      z = (k_beg + k + 0.5) * dz;
				+      //fabs rather than abs: depending on which headers are in scope, abs may
				+      //resolve to the integer overload and truncate the distance before the test
				+      if (fabs(z - 3 * zlen / 4) <= zlen / 16)
				+      {
				+        for (i = 0; i < hs; i++)
				+        {
				+          ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
				+          ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
				+          ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
				+          //Full density (stored value + background) times 50. becomes the u-momentum
				+          state[ind_u] = (state[ind_r] + hy_dens_cell[k + hs]) * 50.;
				+          //Full density times 298., minus the background rho*theta, becomes the stored rho*theta
				+          state[ind_t] = (state[ind_r] + hy_dens_cell[k + hs]) * 298. - hy_dens_theta_cell[k + hs];
				+        }
				+      }
				+    }
				+  }
				+}
			
 
				+
			
 
				+//Set this task's halo values in the z-direction.
				+//The halo rows are filled locally, with no data taken from another task:
				+//for ID_WMOM the halo rows are set to zero, and for every other variable they
				+//copy the nearest interior row (row hs at the bottom, row nz+hs-1 at the top).
				+//Rows 0, 1, nz+hs and nz+hs+1 are written, so the whole halo is covered only when hs == 2.
				+//state is a flattened array indexed as [variable][z row incl. halo][x column incl. halo].
				+void set_halo_values_z(double *state)
				+{
				+  int i, ll;
				+
				+#pragma acc parallel loop
				+  for (ll = 0; ll < NUM_VARS; ll++)
				+  {
				+    //Length of one z row and offset of the first row of variable ll
				+    const int stride = nx + 2 * hs;
				+    const int var = ll * (nz + 2 * hs) * stride;
				+    for (i = 0; i < nx + 2 * hs; i++)
				+    {
				+      if (ll == ID_WMOM)
				+      {
				+        state[var + (0) * stride + i] = 0.;
				+        state[var + (1) * stride + i] = 0.;
				+        state[var + (nz + hs) * stride + i] = 0.;
				+        state[var + (nz + hs + 1) * stride + i] = 0.;
				+      }
				+      else
				+      {
				+        state[var + (0) * stride + i] = state[var + (hs) * stride + i];
				+        state[var + (1) * stride + i] = state[var + (hs) * stride + i];
				+        state[var + (nz + hs) * stride + i] = state[var + (nz + hs - 1) * stride + i];
				+        state[var + (nz + hs + 1) * stride + i] = state[var + (nz + hs - 1) * stride + i];
				+      }
				+    }
				+  }
				+}
			
 
				+
			
 
				+//Allocate `count` doubles, or print a message naming `what` and exit(-1) if
				+//the allocation fails. Used only by init().
				+static double *init_alloc_doubles(size_t count, const char *what)
				+{
				+  double *ptr = (double *)malloc(count * sizeof(double));
				+  //A null result for a zero-sized request is not an error
				+  if (ptr == NULL && count != 0)
				+  {
				+    printf("Failed to allocate %lu doubles for %s\n", (unsigned long)count, what);
				+    exit(-1);
				+  }
				+  return ptr;
				+}
				+
				+//Set up the run: grid spacing, rank layout, local extents, all model arrays,
				+//the time step, and the initial fluid state plus hydrostatic background arrays.
				+//Takes no arguments and returns nothing; everything is written to file-level variables.
				+//Exits the program if any model array cannot be allocated.
				+void init()
				+{
				+  int i, k, ii, kk, ll, inds, i_end;
				+  double x, z, r, u, w, t, hr, ht, nper;
				+
				+  //Set the cell grid size
				+  dx = xlen / nx_glob;
				+  dz = zlen / nz_glob;
				+
				+  //A single rank, numbered 0
				+  nranks = 1;
				+  myrank = 0;
				+
				+  // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0;
				+
				+  //Split the global x range evenly over the ranks and take this rank's slice
				+  nper = ((double)nx_glob) / nranks;
				+  i_beg = round(nper * (myrank));
				+  i_end = round(nper * ((myrank) + 1)) - 1;
				+  nx = i_end - i_beg + 1;
				+  //Left and right neighbours, wrapping around at both ends
				+  left_rank = myrank - 1;
				+  if (left_rank == -1)
				+    left_rank = nranks - 1;
				+  right_rank = myrank + 1;
				+  if (right_rank == nranks)
				+    right_rank = 0;
				+
				+  ////////////////////////////////////////////////////////////////////////////////
				+  ////////////////////////////////////////////////////////////////////////////////
				+  // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE
				+  ////////////////////////////////////////////////////////////////////////////////
				+  ////////////////////////////////////////////////////////////////////////////////
				+
				+  k_beg = 0;
				+  nz = nz_glob;
				+
				+  //Allocate the model data. Element counts are computed in size_t so that a
				+  //large grid cannot overflow int arithmetic before reaching malloc.
				+  const size_t cells_with_halo = (size_t)(nx + 2 * hs) * (size_t)(nz + 2 * hs);
				+  state = init_alloc_doubles(cells_with_halo * NUM_VARS, "state");
				+  state_tmp = init_alloc_doubles(cells_with_halo * NUM_VARS, "state_tmp");
				+  flux = init_alloc_doubles((size_t)(nx + 1) * (size_t)(nz + 1) * NUM_VARS, "flux");
				+  tend = init_alloc_doubles((size_t)nx * (size_t)nz * NUM_VARS, "tend");
				+  hy_dens_cell = init_alloc_doubles((size_t)(nz + 2 * hs), "hy_dens_cell");
				+  hy_dens_theta_cell = init_alloc_doubles((size_t)(nz + 2 * hs), "hy_dens_theta_cell");
				+  hy_dens_int = init_alloc_doubles((size_t)(nz + 1), "hy_dens_int");
				+  hy_dens_theta_int = init_alloc_doubles((size_t)(nz + 1), "hy_dens_theta_int");
				+  hy_pressure_int = init_alloc_doubles((size_t)(nz + 1), "hy_pressure_int");
				+
				+  //Define the maximum stable time step based on an assumed maximum wind speed
				+  dt = dmin(dx, dz) / max_speed * cfl;
				+  //Set initial elapsed model time and output_counter to zero
				+  etime = 0.;
				+  output_counter = 0.;
				+
				+  // Display grid information
				+
				+  printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
				+  printf("dx,dz: %lf %lf\n", dx, dz);
				+  printf("dt: %lf\n", dt);
				+
				+  //////////////////////////////////////////////////////////////////////////
				+  // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
				+  //////////////////////////////////////////////////////////////////////////
				+  for (k = 0; k < nz + 2 * hs; k++)
				+  {
				+    for (i = 0; i < nx + 2 * hs; i++)
				+    {
				+      //Initialize the state to zero
				+      for (ll = 0; ll < NUM_VARS; ll++)
				+      {
				+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
				+        state[inds] = 0.;
				+      }
				+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
				+      for (kk = 0; kk < nqpoints; kk++)
				+      {
				+        for (ii = 0; ii < nqpoints; ii++)
				+        {
				+          //Compute the x,z location within the global domain based on cell and quadrature index
				+          x = (i_beg + i - hs + 0.5) * dx + (qpoints[ii] - 0.5) * dx;
				+          z = (k_beg + k - hs + 0.5) * dz + (qpoints[kk] - 0.5) * dz;
				+
				+          //Set the fluid state based on the user's specification (default is injection in this example)
				+          injection(x, z, r, u, w, t, hr, ht);
				+
				+          //Store into the fluid state array
				+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
				+          state[inds] = state[inds] + r * qweights[ii] * qweights[kk];
				+          inds = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
				+          state[inds] = state[inds] + (r + hr) * u * qweights[ii] * qweights[kk];
				+          inds = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
				+          state[inds] = state[inds] + (r + hr) * w * qweights[ii] * qweights[kk];
				+          inds = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
				+          state[inds] = state[inds] + ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk];
				+        }
				+      }
				+      //Start state_tmp as a copy of the freshly initialized state
				+      for (ll = 0; ll < NUM_VARS; ll++)
				+      {
				+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
				+        state_tmp[inds] = state[inds];
				+      }
				+    }
				+  }
				+  //Compute the hydrostatic background state over vertical cell averages
				+  for (k = 0; k < nz + 2 * hs; k++)
				+  {
				+    hy_dens_cell[k] = 0.;
				+    hy_dens_theta_cell[k] = 0.;
				+
				+    //The sample height does not depend on the quadrature index, so the
				+    //background state is evaluated once per cell rather than once per point.
				+    //NOTE(review): the height is the cell centre for every quadrature point,
				+    //unlike the 2-D loop above which offsets by qpoints - confirm this is intended.
				+    z = (k_beg + k - hs + 0.5) * dz;
				+
				+    //Set the fluid state based on the user's specification (default is injection in this example)
				+    injection(0., z, r, u, w, t, hr, ht);
				+
				+    for (kk = 0; kk < nqpoints; kk++)
				+    {
				+      hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
				+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr * ht * qweights[kk];
				+    }
				+  }
				+  //Compute the hydrostatic background state at vertical cell interfaces
				+  for (k = 0; k < nz + 1; k++)
				+  {
				+    z = (k_beg + k) * dz;
				+
				+    //Set the fluid state based on the user's specification (default is injection in this example)
				+    injection(0., z, r, u, w, t, hr, ht);
				+
				+    hy_dens_int[k] = hr;
				+    hy_dens_theta_int[k] = hr * ht;
				+    hy_pressure_int[k] = C0 * pow((hr * ht), gamm);
				+  }
				+}
			
 
				+
			
 
				+//Sample the initial condition of the injection test case at one point.
				+//x and z are the input coordinates of the sample (x is not read here)
				+//r,u,w,t are outputs for density, u-wind, w-wind, and potential temperature; all four are set to zero
				+//hr and ht are outputs for the background hydrostatic density and potential temperature,
				+//filled in by hydro_const_theta for height z
				+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht)
				+{
				+  //Background state first, then zero out the remaining four outputs
				+  hydro_const_theta(z, hr, ht);
				+  r = u = w = t = 0.;
				+}
			
 
				+
			
 
				+//Establish hydrostatic balance using constant potential temperature (thermally neutral atmosphere)
				+//z is the input height coordinate
				+//r and t are outputs: the background hydrostatic density and potential temperature at z
				+void hydro_const_theta(double z, double &r, double &t)
				+{
				+  const double theta0 = 300.; //Background potential temperature
				+  const double exner0 = 1.;   //Surface-level Exner pressure
				+
				+  //Exner pressure at z
				+  const double exner_z = exner0 - grav * z / (cp * theta0);
				+  //Pressure at z
				+  const double pressure_z = p0 * pow(exner_z, (cp / rd));
				+  //rho*theta at z
				+  const double rho_theta_z = pow((pressure_z / C0), (1. / gamm));
				+
				+  //Potential temperature is the constant background value; density follows from rho*theta
				+  t = theta0;
				+  r = rho_theta_z / t;
				+}
			
 
				+
			
 
				+//Output the fluid state (state) to the NetCDF file "new.nc" at a given elapsed model time (etime)
				+//When etime is zero the file is created and its dimensions and variables are defined;
				+//otherwise the existing file is opened for writing. One record is written at time
				+//index num_out, which is then incremented.
				+//state is the flattened [variable][z incl. halo][x incl. halo] array; only interior cells are written.
				+//Exits the program if the temporary buffers cannot be allocated or any NetCDF call fails (via ncwrap).
				+void output(double *state, double etime)
				+{
				+  int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3];
				+  int i, k, ind_r, ind_u, ind_w, ind_t;
				+
				+  size_t st1[1], ct1[1], st3[3], ct3[3];
				+
				+  //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta)
				+  double *dens, *uwnd, *wwnd, *theta;
				+  //A single value, so it lives on the stack instead of the heap
				+  double etimearr[1];
				+  //Interior cell count, computed in size_t so a large grid cannot overflow int
				+  const size_t ncells = (size_t)nx * (size_t)nz;
				+
				+  //Inform the user
				+
				+  printf("*** OUTPUT ***\n");
				+
				+  //Allocate some (big) temp arrays
				+  dens = (double *)malloc(ncells * sizeof(double));
				+  uwnd = (double *)malloc(ncells * sizeof(double));
				+  wwnd = (double *)malloc(ncells * sizeof(double));
				+  theta = (double *)malloc(ncells * sizeof(double));
				+  if (ncells != 0 && (dens == NULL || uwnd == NULL || wwnd == NULL || theta == NULL))
				+  {
				+    printf("Failed to allocate output buffers for %lu cells\n", (unsigned long)ncells);
				+    free(dens);
				+    free(uwnd);
				+    free(wwnd);
				+    free(theta);
				+    exit(-1);
				+  }
				+
				+  //If the elapsed time is zero, create the file. Otherwise, open the file
				+  if (etime == 0)
				+  {
				+    //Create the file
				+    ncwrap(nc_create("new.nc", NC_CLOBBER, &ncid), __LINE__);
				+
				+    //Create the dimensions
				+    ncwrap(nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid), __LINE__);
				+    ncwrap(nc_def_dim(ncid, "x", nx_glob, &x_dimid), __LINE__);
				+    ncwrap(nc_def_dim(ncid, "z", nz_glob, &z_dimid), __LINE__);
				+
				+    //Create the variables
				+    dimids[0] = t_dimid;
				+    ncwrap(nc_def_var(ncid, "t", NC_DOUBLE, 1, dimids, &t_varid), __LINE__);
				+
				+    dimids[0] = t_dimid;
				+    dimids[1] = z_dimid;
				+    dimids[2] = x_dimid;
				+
				+    ncwrap(nc_def_var(ncid, "dens", NC_DOUBLE, 3, dimids, &dens_varid), __LINE__);
				+    ncwrap(nc_def_var(ncid, "uwnd", NC_DOUBLE, 3, dimids, &uwnd_varid), __LINE__);
				+    ncwrap(nc_def_var(ncid, "wwnd", NC_DOUBLE, 3, dimids, &wwnd_varid), __LINE__);
				+    ncwrap(nc_def_var(ncid, "theta", NC_DOUBLE, 3, dimids, &theta_varid), __LINE__);
				+
				+    //End "define" mode
				+    ncwrap(nc_enddef(ncid), __LINE__);
				+  }
				+  else
				+  {
				+    //Open the file
				+    ncwrap(nc_open("new.nc", NC_WRITE, &ncid), __LINE__);
				+
				+    //Get the variable IDs
				+    ncwrap(nc_inq_varid(ncid, "dens", &dens_varid), __LINE__);
				+    ncwrap(nc_inq_varid(ncid, "uwnd", &uwnd_varid), __LINE__);
				+    ncwrap(nc_inq_varid(ncid, "wwnd", &wwnd_varid), __LINE__);
				+    ncwrap(nc_inq_varid(ncid, "theta", &theta_varid), __LINE__);
				+    ncwrap(nc_inq_varid(ncid, "t", &t_varid), __LINE__);
				+  }
				+
				+  //Store perturbed values in the temp arrays for output
				+  for (k = 0; k < nz; k++)
				+  {
				+    for (i = 0; i < nx; i++)
				+    {
				+      ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
				+      ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
				+      ind_w = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
				+      ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
				+      dens[k * nx + i] = state[ind_r];
				+      //Winds are the stored momenta divided by the full density (background + stored value)
				+      uwnd[k * nx + i] = state[ind_u] / (hy_dens_cell[k + hs] + state[ind_r]);
				+      wwnd[k * nx + i] = state[ind_w] / (hy_dens_cell[k + hs] + state[ind_r]);
				+      //Full theta minus the background theta of this row
				+      theta[k * nx + i] = (state[ind_t] + hy_dens_theta_cell[k + hs]) / (hy_dens_cell[k + hs] + state[ind_r]) - hy_dens_theta_cell[k + hs] / hy_dens_cell[k + hs];
				+    }
				+  }
				+
				+  //Write the nz-by-nx slab starting at (k_beg, i_beg) into time record num_out
				+  st3[0] = num_out;
				+  st3[1] = k_beg;
				+  st3[2] = i_beg;
				+  ct3[0] = 1;
				+  ct3[1] = nz;
				+  ct3[2] = nx;
				+
				+  ncwrap(nc_put_vara_double(ncid, dens_varid, st3, ct3, dens), __LINE__);
				+  ncwrap(nc_put_vara_double(ncid, uwnd_varid, st3, ct3, uwnd), __LINE__);
				+  ncwrap(nc_put_vara_double(ncid, wwnd_varid, st3, ct3, wwnd), __LINE__);
				+  ncwrap(nc_put_vara_double(ncid, theta_varid, st3, ct3, theta), __LINE__);
				+
				+  //Write the elapsed time for this record
				+
				+  st1[0] = num_out;
				+  ct1[0] = 1;
				+  etimearr[0] = etime;
				+  ncwrap(nc_put_vara_double(ncid, t_varid, st1, ct1, etimearr), __LINE__);
				+
				+  //Close the file
				+  ncwrap(nc_close(ncid), __LINE__);
				+
				+  //Increment the number of outputs
				+  num_out = num_out + 1;
				+
				+  //Deallocate the temp arrays
				+  free(dens);
				+  free(uwnd);
				+  free(wwnd);
				+  free(theta);
				+}
			
 
				+
			
 
				+//Error reporting routine for the NetCDF I/O
				+//ierr is the status returned by a NetCDF call; line is the source line to report.
				+//Returns silently when ierr is NC_NOERR; otherwise prints the line number and
				+//the NetCDF error text, then terminates the program with exit(-1).
				+void ncwrap(int ierr, int line)
				+{
				+  //Nothing to report on success
				+  if (ierr == NC_NOERR)
				+    return;
				+
				+  printf("NetCDF Error at line: %d\n", line);
				+  printf("%s\n", nc_strerror(ierr));
				+  exit(-1);
				+}
			
 
				+
			
 
				+//Release every model array that init() allocated with malloc.
				+//The pointers themselves are left unchanged after being freed.
				+void finalize()
				+{
				+  //Same arrays, in the same order, as the individual free() calls they replace
				+  double *const model_arrays[] = {
				+      state,
				+      state_tmp,
				+      flux,
				+      tend,
				+      hy_dens_cell,
				+      hy_dens_theta_cell,
				+      hy_dens_int,
				+      hy_dens_theta_int,
				+      hy_pressure_int};
				+  const int array_count = sizeof(model_arrays) / sizeof(model_arrays[0]);
				+
				+  for (int n = 0; n < array_count; n++)
				+  {
				+    free(model_arrays[n]);
				+  }
				+}
			
--- a/hpc/miniprofiler/English/C/source_code/lab3/.ipynb_checkpoints/profiling-c-lab3-checkpoint.ipynb
+++ b/hpc/miniprofiler/English/C/source_code/lab3/.ipynb_checkpoints/profiling-c-lab3-checkpoint.ipynb