瀏覽代碼

Merge pull request #1 from mozhgan-kch/hpc-bootcamp

jupyter template,mini profiler,openacc added
Mozhgan Kabiri Chimeh 4 年之前
父節點
當前提交
129968dd95
共有 100 個文件被更改,包括 5554 次插入0 次删除
  1. 2 0
      .gitignore
  2. 60 0
      hpc/miniprofiler/Dockerfile
  3. 23 0
      hpc/miniprofiler/English/C/LICENSE
  4. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/3launch5skip.png
  5. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/MPI_Division.jpg
  6. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Nsight Diagram.png
  7. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Optimization_Cycle.jpg
  8. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Outer_Loop.jpg
  9. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Range-Kutta.jpg
  10. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Screenshot from 2020-04-15 10-25-49.png
  11. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Semi_Discrete.jpg
  12. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Semi_Discrete_Step.jpg
  13. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Time.jpg
  14. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/Time_Step.jpg
  15. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/X_Y.jpg
  16. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/allsection-compute.png
  17. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/baseline-compute.png
  18. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/c2compute.png
  19. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/ccompute.png
  20. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cexer5.png
  21. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback1-2.png
  22. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback1.png
  23. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback2.png
  24. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback3-1.png
  25. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback3.png
  26. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback4.png
  27. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/charts-compute.png
  28. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/checkerpy.png
  29. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cli-out.png
  30. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/compute-open.png
  31. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/compute.png
  32. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cpu.png
  33. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/cuda.png
  34. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/diagram.png
  35. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx.png
  36. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx_gui.png
  37. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx_terminal.png
  38. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/f2compute.png
  39. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/fcompute.png
  40. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1-0.png
  41. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1-1.png
  42. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1.png
  43. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback2.png
  44. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback3.png
  45. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback4.png
  46. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/fortran_nvtx.png
  47. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/fortranexer5.png
  48. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/fulllaunch.png
  49. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/laplas3.png
  50. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/launch-compute.png
  51. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/list-set.png
  52. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/nsight_open - Copy.png
  53. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/nsight_open.png
  54. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_data_mv.png
  55. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_fast_mv.png
  56. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_slow.png
  57. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_slow_mv.png
  58. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/nvtx.PNG
  59. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/occu-1.png
  60. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/occu-2.png
  61. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/occu-3.png
  62. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/page-compute.png
  63. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q1-1.png
  64. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q1-2.png
  65. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q2-1.png
  66. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q2-1_zoom.png
  67. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q2-2_zoom.png
  68. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q3-1.png
  69. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q3-2.png
  70. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1.png
  71. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1_zoom.png
  72. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1_zoom2.png
  73. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2.PNG
  74. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2_zoom.png
  75. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2_zoom2.png
  76. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/rule-compute.png
  77. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/sections-compute.png
  78. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/summary-compute.png
  79. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/thread.png
  80. 二進制
      hpc/miniprofiler/English/C/jupyter_notebook/images/triangle.png
  81. 114 0
      hpc/miniprofiler/English/C/jupyter_notebook/miniweather.ipynb
  82. 199 0
      hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab1.ipynb
  83. 184 0
      hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab2.ipynb
  84. 249 0
      hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab3.ipynb
  85. 187 0
      hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab4.ipynb
  86. 368 0
      hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab5.ipynb
  87. 227 0
      hpc/miniprofiler/English/C/jupyter_notebook/profiling-c.ipynb
  88. 12 0
      hpc/miniprofiler/English/C/source_code/lab1/Makefile
  89. 641 0
      hpc/miniprofiler/English/C/source_code/lab1/miniWeather_serial.cpp
  90. 11 0
      hpc/miniprofiler/English/C/source_code/lab2/Makefile
  91. 645 0
      hpc/miniprofiler/English/C/source_code/lab2/miniWeather_openacc.cpp
  92. 11 0
      hpc/miniprofiler/English/C/source_code/lab3/Makefile
  93. 645 0
      hpc/miniprofiler/English/C/source_code/lab3/miniWeather_openacc.cpp
  94. 11 0
      hpc/miniprofiler/English/C/source_code/lab4/Makefile
  95. 645 0
      hpc/miniprofiler/English/C/source_code/lab4/miniWeather_openacc.cpp
  96. 11 0
      hpc/miniprofiler/English/C/source_code/lab5/Makefile
  97. 649 0
      hpc/miniprofiler/English/C/source_code/lab5/miniWeather_openacc.cpp
  98. 11 0
      hpc/miniprofiler/English/C/source_code/solutions/Makefile
  99. 649 0
      hpc/miniprofiler/English/C/source_code/solutions/miniWeather_openacc.cpp
  100. 0 0
      hpc/miniprofiler/English/C/source_code/solutions/miniWeather_openacc_exr2.cpp

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*

+ 60 - 0
hpc/miniprofiler/Dockerfile

@@ -0,0 +1,60 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
+
+# To build: $ sudo docker build -t nvidia_nsight_profiling_openacc:latest .
+# To run: $ sudo docker run --rm -it --gpus=all -p 8888:8888 nvidia_nsight_profiling_openacc:latest
+# Finally, open http://127.0.0.1:8888/
+
+FROM nvcr.io/hpc/pgi-compilers:ce
+
+RUN apt-get update -y && \
+    apt-get dist-upgrade -y && \
+    apt-get install --no-install-recommends -y \
+    openssh-server m4 vim-nox emacs-nox nano zip && \
+    rm -rf /var/lib/apt/cache/* 
+
+RUN apt-get update 
+RUN apt-get install --no-install-recommends -y python3 python3-pip nginx 
+RUN pip3 install --upgrade pip
+RUN apt-get install --no-install-recommends -y python3-setuptools
+RUN apt-get install --no-install-recommends -y git-core
+RUN apt-get install --no-install-recommends -y openssh-client 
+RUN apt-get install --no-install-recommends -y inotify-tools 
+RUN pip3 install jupyter
+RUN pip3 install netcdf4
+RUN apt-get install --no-install-recommends -y curl
+RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
+RUN apt-get install --no-install-recommends -y git-lfs
+RUN git lfs install
+
+# NVIDIA Nsight Systems 
+RUN apt-get update -y && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    apt-transport-https \
+    ca-certificates \
+    gnupg \
+    wget && \
+    echo "deb https://developer.download.nvidia.com/devtools/repo-deb/x86_64/ /" >> /etc/apt/sources.list.d/nsight.list && \
+    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" >> /etc/apt/sources.list.d/nsight.list  &&\
+    apt-get update -y
+
+RUN apt-get update -y
+
+# NVIDIA nsight-systems-2020.2.1 nsight-compute-2020.1.1
+RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nsight-systems-2020.2.1 nsight-compute-2020.1.1
+
+RUN apt-get install --no-install-recommends -y build-essential
+
+
+ENV PATH="$PATH:/usr/local/bin:/opt/anaconda3/bin:/usr/bin" LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib" LIBRARY_PATH="$LIBRARY_PATH:/usr/local/lib" CPATH="$CPATH:/usr/local/include"
+
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda3  && \
+    rm Miniconda3-latest-Linux-x86_64.sh && \
+    /opt/anaconda3/bin/conda install -y -q netcdf4
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/pgi/linux86-64-llvm/2019/cuda/10.1/lib64/" 
+ENV PATH="$PATH:/opt/nvidia/nsight-systems/2020.2.1/bin:/opt/nvidia/nsight-compute/2020.1.1/"
+
+ADD English/ /labs
+WORKDIR /labs
+CMD service nginx start && jupyter notebook --no-browser --allow-root --ip=0.0.0.0 --port=8888 --NotebookApp.token="" --notebook-dir=/labs

+ 23 - 0
hpc/miniprofiler/English/C/LICENSE

@@ -0,0 +1,23 @@
+Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/3launch5skip.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/MPI_Division.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Nsight Diagram.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Optimization_Cycle.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Outer_Loop.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Range-Kutta.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Screenshot from 2020-04-15 10-25-49.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Semi_Discrete.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Semi_Discrete_Step.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Time.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/Time_Step.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/X_Y.jpg


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/allsection-compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/baseline-compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/c2compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/ccompute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cexer5.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback1-2.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback2.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback3-1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback3.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cfeedback4.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/charts-compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/checkerpy.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cli-out.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/compute-open.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cpu.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/cuda.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/diagram.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx_gui.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/e1-nvtx_terminal.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/f2compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/fcompute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1-0.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1-1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback2.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback3.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/ffeedback4.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/fortran_nvtx.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/fortranexer5.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/fulllaunch.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/laplas3.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/launch-compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/list-set.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/nsight_open - Copy.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/nsight_open.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_data_mv.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_fast_mv.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_slow.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/nsys_slow_mv.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/nvtx.PNG


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/occu-1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/occu-2.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/occu-3.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/page-compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q1-1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q1-2.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q2-1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q2-1_zoom.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q2-2_zoom.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q3-1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q3-2.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1_zoom.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q4-1_zoom2.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2.PNG


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2_zoom.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/q4-2_zoom2.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/rule-compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/sections-compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/summary-compute.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/thread.png


二進制
hpc/miniprofiler/English/C/jupyter_notebook/images/triangle.png


+ 114 - 0
hpc/miniprofiler/English/C/jupyter_notebook/miniweather.ipynb

@@ -0,0 +1,114 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# A MINI-WEATHER APPLICATION\n",
+    "\n",
+    "In this lab we will accelerate a Fluid Simulation in the context of atmosphere and weather simulation.\n",
+    "The mini weather code mimics the basic dynamics seen in the atmospheric weather and climate.\n",
+    "\n",
+    "The figure below demonstrates how a narrow jet of fast and slightly cold wind is injected into a balanced, neutral atmosphere at rest from the left domain near the model.\n",
+    "\n",
+    "<img src=\"images/Time.jpg\" width=\"80%\" height=\"80%\">\n",
+    "\n",
+    "Simulation is a repetitive process from 0 to the desired simulated time, increasing by Δt on every iteration.\n",
+    "Each Δt step is practically the same operation. Each simulation is solving a differential equation that represents how the flow of the atmosphere (fluid) changes according to small perturbations. To simplify this solution the code uses dimensional splitting: Each dimension X and Z are treated independently.\n",
+    "\n",
+    "<img src=\"images/X_Y.jpg\" width=\"80%\" height=\"80%\">\n",
+    "\n",
+    "The differential equation has a time derivative that needs integrating, and a simple low-storage Runge-Kutta ODE solver is used to integrate the time derivative. Each time step, the order in which the dimensions are solved is reversed, giving second-order accuracy. \n",
+    "\n",
+    "<img src=\"images/Range-Kutta.jpg\" width=\"70%\" height=\"70%\">\n",
+    "\n",
+    "### The objective of this exercise is not to delve into the mathematics but to make use of OpenACC to parallelize the code and improve its performance.\n",
+    "\n",
+    "The general flow of the code is as shown in diagram below. For each time step the differential equations are solved.\n",
+    "\n",
+    "<img src=\"images/Outer_Loop.jpg\" width=\"70%\" height=\"70%\">\n",
+    "\n",
+    "\n",
+    "```cpp\n",
+    "while (etime < sim_time) {\n",
+    "    //If the time step leads to exceeding the simulation time, shorten it for the last step\n",
+    "    if (etime + dt > sim_time) { dt = sim_time - etime; }\n",
+    "    //Perform a single time step\n",
+    "    perform_timestep(state,state_tmp,flux,tend,dt);\n",
+    "    //Inform the user\n",
+    "    if (masterproc) { printf( \"Elapsed Time: %lf / %lf\\n\", etime , sim_time ); }\n",
+    "    //Update the elapsed time and output counter\n",
+    "    etime = etime + dt;\n",
+    "    output_counter = output_counter + dt;\n",
+    "    //If it's time for output, reset the counter, and do output\n",
+    "    if (output_counter >= output_freq) {\n",
+    "      output_counter = output_counter - output_freq;\n",
+    "      output(state,etime);\n",
+    "    }\n",
+    "  }\n",
+    "  \n",
+    "```\n",
+    "\n",
+    "At every time step the order in which the directions are solved is reversed to obtain second-order accuracy.\n",
+    "\n",
+    "\n",
+    "<img src=\"images/Time_Step.jpg\" width=\"70%\" height=\"70%\">\n",
+    "\n",
+    "```cpp\n",
+    "void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) {\n",
+    "  if (direction_switch) {\n",
+    "    //x-direction first\n",
+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );\n",
+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );\n",
+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , flux , tend );\n",
+    "    //z-direction second\n",
+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , flux , tend );\n",
+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );\n",
+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , flux , tend );\n",
+    "  } else {\n",
+    "    //z-direction first\n",
+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , flux , tend );\n",
+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );\n",
+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , flux , tend );\n",
+    "    //x-direction second\n",
+    "    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );\n",
+    "    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );\n",
+    "    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , flux , tend );\n",
+    "  }\n",
+    "  if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; }\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "<img src=\"images/Semi_Discrete.jpg\" width=\"70%\" height=\"70%\">\n",
+    "\n",
+    "--- \n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
+   ]
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

File diff suppressed because it is too large
+ 199 - 0
hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab1.ipynb


+ 184 - 0
hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab2.ipynb

@@ -0,0 +1,184 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this lab, we will optimize the weather simulation application written in C++ (if you prefer to use Fortran, click [this link](../../Fortran/jupyter_notebook/profiling-fortran.ipynb)). \n",
+    "\n",
+    "Let's execute the cell below to display information about the GPUs running on the server by running the pgaccelinfo command, which ships with the PGI compiler that we will be using. To do this, execute the cell block below by giving it focus (clicking on it with your mouse), and hitting Ctrl-Enter, or pressing the play button in the toolbar above. If all goes well, you should see some output returned below the grey cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pgaccelinfo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise 2 \n",
+    "\n",
+    "### Learning objectives\n",
+    "Learn how to identify and parallelise the computationally expensive routines in your application using OpenACC compute constructs (A compute construct is a parallel, kernels, or serial construct.). In this exercise you will:\n",
+    "\n",
+    "- Implement OpenACC parallelism using parallel directives to parallelise the serial application\n",
+    "- Learn how to compile your parallel application with PGI compiler\n",
+    "- Benchmark and compare the parallel version of the application with the serial version\n",
+    "- Learn how to interpret PGI compiler feedback to ensure the applied optimizations were successful"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "From the top menu, click on *File*, and *Open* `miniWeather_openacc.cpp` and `Makefile` from the `C/source_code/lab2` directory and inspect the code before running the cells below. We have already added OpenACC compute directives (`#pragma acc parallel`) around the expensive routines (loops) in the code.\n",
+    "\n",
+    "Once done, compile the code with `make`. View the PGI compiler feedback (enabled by adding `-Minfo=accel` flag) and investigate the compiler feedback for the OpenACC code. The compiler feedback provides useful information about applied optimizations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../source_code/lab2 && make clean && make"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's inspect part of the compiler feedback and see what it's telling us.\n",
+    "\n",
+    "<img src=\"images/cfeedback1.png\">\n",
+    "\n",
+    "- Using `-ta=tesla:managed`, instruct the compiler to build for an NVIDIA Tesla GPU using \"CUDA Managed Memory\"\n",
+    "- Using `-Minfo` command-line option, we will see all output from the compiler. In this example, we use `-Minfo=accel` to only see the output corresponding to the accelerator (in this case an NVIDIA GPU).\n",
+    "- The first line of the output, `compute_tendencies_x`, tells us which function the following information is in reference to.\n",
+    "- The line starting with 227, shows we created a parallel OpenACC loop. This loop is made up of gangs (a grid of blocks in CUDA language) and vector parallelism (threads in CUDA language) with the vector size being 128 per gang. `277, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */`\n",
+    "- The rest of the information concerns data movement. Compiler detected possible need to move data and handled it for us. We will get into this later in this lab.\n",
+    "\n",
+    "It is very important to inspect the feedback to make sure the compiler is doing what you have asked of it.\n",
+    "\n",
+    "Now, let's **Run** the application for small values of `nx_glob`,`nz_glob`, and `sim_time`: **40, 20, 1000**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!cd ../source_code/lab2 && nsys profile -t nvtx,openacc --stats=true --force-overwrite true -o miniWeather_3 ./miniWeather 40 20 1000"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can see that the changes made actually slowed down the code and it runs slower compared to the non-accelerated CPU only version. Let's check out the profiler's report. [Download the profiler output](../source_code/lab2/miniWeather_3.qdrep) and open it via the GUI. \n",
+    "\n",
+    "From the \"timeline view\" on the top pane, double click on the \"CUDA\" from the function table on the left and expand it. Zoom in on the timeline and you can see a pattern similar to the screenshot below. The blue boxes are the compute kernels and each of these groupings of kernels is surrounded by purple and teal boxes (annotated with red color) representing data movements. **Screenshots represent the profiler report for the values of 400,200,1500.**\n",
+    "\n",
+    "<img src=\"images/nsys_slow.png\" width=\"80%\" height=\"80%\">\n",
+    "\n",
+    "Hover your mouse over the kernels (blue boxes) one by one from each row and check out the provided information.\n",
+    "\n",
+    "<img src=\"images/occu-1.png\" width=\"60%\" height=\"60%\">\n",
+    "\n",
+    "**Note**: In the next two exercises, we start optimizing the application by improving the occupancy and reducing data movements."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Post-Lab Summary\n",
+    "\n",
+    "If you would like to download this lab for later viewing, it is recommended you go to your browser's File menu (not the Jupyter notebook file menu) and save the complete web page.  This will ensure the images are copied down as well. You can also execute the following cell block to create a zip-file of the files you've been working on, and download it with the link below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "cd ..\n",
+    "rm -f openacc_profiler_files.zip\n",
+    "zip -r openacc_profiler_files.zip *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**After** executing the above zip command, you should be able to download the zip file [here](../openacc_profiler_files.zip)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "-----\n",
+    "\n",
+    "# <p style=\"text-align:center;border:3px; border-style:solid; border-color:#FF0000  ; padding: 1em\"> <a href=../../profiling_start.ipynb>HOME</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style=\"float:center\"> <a href=profiling-c-lab3.ipynb>NEXT</a></span> </p>\n",
+    "\n",
+    "-----"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Links and Resources\n",
+    "\n",
+    "[OpenACC API Guide](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
+    "\n",
+    "[NVIDIA Nsight System](https://docs.nvidia.com/nsight-systems/)\n",
+    "\n",
+    "[CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)\n",
+    "\n",
+    "**NOTE**: To be able to see the Nsight System profiler output, please download Nsight System latest version from [here](https://developer.nvidia.com/nsight-systems).\n",
+    "\n",
+    "Don't forget to check out additional [OpenACC Resources](https://www.openacc.org/resources) and join our [OpenACC Slack Channel](https://www.openacc.org/community#slack) to share your experience and get more help from the community.\n",
+    "\n",
+    "--- \n",
+    "\n",
+    "## Licensing \n",
+    "\n",
+    "This material is released by NVIDIA Corporation under the Creative Commons Attribution 4.0 International (CC BY 4.0). "
+   ]
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

File diff suppressed because it is too large
+ 249 - 0
hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab3.ipynb


File diff suppressed because it is too large
+ 187 - 0
hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab4.ipynb


File diff suppressed because it is too large
+ 368 - 0
hpc/miniprofiler/English/C/jupyter_notebook/profiling-c-lab5.ipynb


File diff suppressed because it is too large
+ 227 - 0
hpc/miniprofiler/English/C/jupyter_notebook/profiling-c.ipynb


+ 12 - 0
hpc/miniprofiler/English/C/source_code/lab1/Makefile

@@ -0,0 +1,12 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
+
+# Compiler and flags used to build the serial miniWeather lab
+CC := pgc++
+CFLAGS := -O3 -w -ldl
+ACCFLAGS := -Minfo=accel
+# Despite its name this holds an include path (-I): it is where the compiler
+# looks for the nvtx3/nvToolsExt.h header included by the source file
+NVTXLIB := -I/opt/pgi/linux86-64-llvm/2019/cuda/10.1/include/
+
+# "clean" does not produce a file, so mark it phony: it then runs even if a
+# file named "clean" exists in this directory
+.PHONY: clean
+
+miniWeather: miniWeather_serial.cpp
+	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_serial.cpp ${NVTXLIB} 
+
+clean:
+	rm -f *.o miniWeather

+ 641 - 0
hpc/miniprofiler/English/C/source_code/lab1/miniWeather_serial.cpp

@@ -0,0 +1,641 @@
+//////////////////////////////////////////////////////////////////////////////////////////
+// miniWeather
+// Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
+// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
+// For documentation, please see the attached documentation in the "documentation" folder
+//////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+** Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory.  All rights reserved.
+**
+** Portions Copyright (c) 2020, NVIDIA Corporation.  All rights reserved.
+*/
+
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <nvtx3/nvToolsExt.h>
+
+const double pi = 3.14159265358979323846264338327;   //Pi
+const double grav = 9.8;                             //Gravitational acceleration (m / s^2)
+const double cp = 1004.;                             //Specific heat of dry air at constant pressure
+const double rd = 287.;                              //Dry air constant for equation of state (P=rho*rd*T)
+const double p0 = 1.e5;                              //Standard pressure at the surface in Pascals
+const double C0 = 27.5629410929725921310572974482;   //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma)
+const double gamm = 1.40027894002789400278940027894; //gamma=cp/(cp-rd) (1004/717); named "gamm" because the identifier "gamma" is already taken
+//Define domain and stability-related constants
+const double xlen = 2.e4;     //Length of the domain in the x-direction (meters)
+const double zlen = 1.e4;     //Length of the domain in the z-direction (meters)
+const double hv_beta = 0.25;  //How strong to diffuse the solution: hv_beta \in [0:1]
+const double cfl = 1.50;      //"Courant, Friedrichs, Lewy" number (for numerical stability)
+const double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
+const int hs = 2;             //"Halo" size: number of cells needed for a full "stencil" of information for reconstruction
+const int sten_size = 4;      //Size of the stencil used for interpolation
+
+//Parameters for indexing and flags
+const int NUM_VARS = 4; //Number of fluid state variables
+const int ID_DENS = 0;  //index for density ("rho")
+const int ID_UMOM = 1;  //index for momentum in the x-direction ("rho * u")
+const int ID_WMOM = 2;  //index for momentum in the z-direction ("rho * w")
+const int ID_RHOT = 3;  //index for density * potential temperature ("rho * theta")
+const int DIR_X = 1;    //Integer constant to express that this operation is in the x-direction
+const int DIR_Z = 2;    //Integer constant to express that this operation is in the z-direction
+
+//Quadrature tables used by init() to build cell averages (init() calls this Gauss-Legendre quadrature)
+const int nqpoints = 3; //Number of quadrature points per direction
+double qpoints[] = {0.112701665379258311482073460022E0, 0.500000000000000000000000000000E0, 0.887298334620741688517926539980E0};  //Point locations as fractions of a cell width
+double qweights[] = {0.277777777777777777777777777779E0, 0.444444444444444444444444444444E0, 0.277777777777777777777777777779E0}; //Weight applied to each point
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are initialized but remain static over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double sim_time;            //total simulation time in seconds
+double output_freq;         //frequency to perform output in seconds
+double dt;                  //Model time step (seconds); main() shortens it for the final step
+int nx, nz;                 //Number of local grid cells in the x- and z- dimensions
+double dx, dz;              //Grid space length in x- and z-dimension (meters)
+int nx_glob, nz_glob;       //Number of total grid cells in the x- and z- dimensions
+int i_beg, k_beg;           //beginning index in the x- and z-directions
+int nranks, myrank;         //Number of ranks and my rank id (init() sets them to 1 and 0)
+int left_rank, right_rank;  //Rank IDs that exist to my left and right in the global domain
+double *hy_dens_cell;       //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+double *hy_dens_int;        //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+double *hy_dens_theta_int;  //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+double *hy_pressure_int;    //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are dynamic over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double etime;          //Elapsed model time
+double output_counter; //Helps determine when it's time to do output
+//Runtime variable arrays
+double *state;     //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *state_tmp; //Scratch fluid state used for the intermediate time-integration stages. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *flux;      //Cell interface fluxes.   Dimensions: (nx+1,nz+1,NUM_VARS)
+double *tend;      //Fluid state tendencies.  Dimensions: (nx,nz,NUM_VARS)
+int num_out = 0;   //The number of outputs performed so far
+int direction_switch = 1; //Nonzero: perform_timestep() sweeps x before z; it is toggled after every time step
+
+//Return the smaller of two doubles; b is returned whenever "a < b" is false
+double dmin(double a, double b)
+{
+  return (a < b) ? a : b;
+}
+
+//Forward declarations for the routines defined after "main"
+
+//Setup and teardown
+void init();
+void finalize();
+
+//Initial and background (hydrostatic) state
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht);
+void hydro_const_theta(double z, double &r, double &t);
+
+//Time integration
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt);
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend);
+void compute_tendencies_x(double *state, double *flux, double *tend);
+void compute_tendencies_z(double *state, double *flux, double *tend);
+
+//Boundary (halo) handling
+void set_halo_values_x(double *state);
+void set_halo_values_z(double *state);
+
+//Output helpers: only declared here, this file contains no definition for them
+//(the calls to output() in main are commented out)
+void output(double *state, double etime);
+void ncwrap(int ierr, int line);
+
+///////////////////////////////////////////////////////////////////////////////////////
+// THE MAIN PROGRAM STARTS HERE
+///////////////////////////////////////////////////////////////////////////////////////
+//Runs the simulation from etime = 0 until sim_time, one call to perform_timestep per iteration.
+//Command line: when exactly three arguments are given they are parsed with atoi as
+//nx_glob, nz_glob and sim_time (so sim_time is limited to whole seconds); otherwise the
+//defaults set below are used.
+//Returns 0 on success and 1 when a supplied grid size is not positive.
+//NVTX ranges "Total", "while" and "perform_timestep" are pushed around the corresponding phases.
+int main(int argc, char **argv)
+{
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // BEGIN USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //The x-direction length is twice as long as the z-direction length
+  //So, you'll want to have nx_glob be twice as large as nz_glob
+  nx_glob = 40;      //Number of total cells in the x-direction
+  nz_glob = 20;      //Number of total cells in the z-direction
+  sim_time = 1000;   //How many seconds to run the simulation
+  output_freq = 100; //How frequently to output data to file (in seconds)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // END USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+
+  if (argc == 4)
+  {
+    printf("The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]);
+    nx_glob = atoi(argv[1]);
+    nz_glob = atoi(argv[2]);
+    sim_time = atoi(argv[3]);
+
+    //init() divides the domain lengths by nx_glob / nz_glob and sizes every allocation
+    //from them, so a zero or negative grid size must be rejected before going further
+    if (nx_glob <= 0 || nz_glob <= 0)
+    {
+      fprintf(stderr, "Error: nx_glob and nz_glob must be positive integers (got %d %d)\n", nx_glob, nz_glob);
+      return 1;
+    }
+  }
+  else
+  {
+    printf("Using default values ...\n");
+  }
+  nvtxRangePushA("Total");
+  init();
+
+  //Output the initial state
+  //output(state, etime);
+
+  ////////////////////////////////////////////////////
+  // MAIN TIME STEP LOOP
+  ////////////////////////////////////////////////////
+
+  nvtxRangePushA("while");
+  while (etime < sim_time)
+  {
+    //If the time step leads to exceeding the simulation time, shorten it for the last step
+    if (etime + dt > sim_time)
+    {
+      dt = sim_time - etime;
+    }
+
+    //Perform a single time step
+    nvtxRangePushA("perform_timestep");
+    perform_timestep(state, state_tmp, flux, tend, dt);
+    nvtxRangePop();
+
+    //Inform the user (etime is printed before it is advanced, i.e. the time at the start of this step)
+
+    printf("Elapsed Time: %lf / %lf\n", etime, sim_time);
+
+    //Update the elapsed time and output counter
+    etime = etime + dt;
+    output_counter = output_counter + dt;
+    //If it's time for output, reset the counter, and do output
+
+    if (output_counter >= output_freq)
+    {
+      output_counter = output_counter - output_freq;
+      //output(state, etime);
+    }
+  }
+  nvtxRangePop(); //closes "while"
+  finalize();
+  nvtxRangePop(); //closes "Total"
+  return 0;
+}
+
+//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator
+//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the
+//order of directions is alternated each time step.
+//The Runge-Kutta method used here is defined as follows:
+// q*     = q[n] + dt/3 * rhs(q[n])
+// q**    = q[n] + dt/2 * rhs(q*  )
+// q[n+1] = q[n] + dt/1 * rhs(q** )
+//state is updated in place; state_tmp holds the intermediate stages; flux and tend are work arrays.
+//The file-level flag direction_switch selects which direction goes first and is flipped on exit.
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt)
+{
+  //Nonzero switch: x sweep then z sweep; zero switch: z sweep then x sweep
+  const int sweep_order[2] = {direction_switch ? DIR_X : DIR_Z,
+                              direction_switch ? DIR_Z : DIR_X};
+
+  for (int sweep = 0; sweep < 2; sweep++)
+  {
+    const int dir = sweep_order[sweep];
+    //Three Runge-Kutta stages in the current direction
+    semi_discrete_step(state, state, state_tmp, dt / 3, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state, dt / 1, dir, flux, tend);
+  }
+
+  //Alternate the sweep order for the next time step
+  direction_switch = direction_switch ? 0 : 1;
+}
+
+//Perform a single semi-discretized step in time with the form:
+//state_out = state_init + dt * rhs(state_forcing)
+//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out
+//dir selects the direction (DIR_X or DIR_Z) whose halos are refreshed and whose tendencies are computed;
+//for any other value no tendencies are recomputed and whatever tend currently holds is applied.
+//flux and tend are work arrays filled by the tendency routines.
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend)
+{
+  if (dir == DIR_Z)
+  {
+    //Set the halo values in the z-direction, then compute the z-direction tendencies
+    set_halo_values_z(state_forcing);
+    compute_tendencies_z(state_forcing, flux, tend);
+  }
+  else if (dir == DIR_X)
+  {
+    //Set the halo values in the x-direction, then compute the x-direction tendencies
+    set_halo_values_x(state_forcing);
+    compute_tendencies_x(state_forcing, flux, tend);
+  }
+
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+  //Apply the tendencies to the fluid state
+
+  //Strides of the haloed state arrays
+  const int row_len = nx + 2 * hs;
+  const int var_len = (nz + 2 * hs) * row_len;
+
+  for (int var = 0; var < NUM_VARS; var++)
+  {
+    for (int kz = 0; kz < nz; kz++)
+    {
+      //Offsets of the first interior cell of this row in the state arrays and in tend
+      const int state_row = var * var_len + (kz + hs) * row_len + hs;
+      const int tend_row = var * nz * nx + kz * nx;
+      for (int ix = 0; ix < nx; ix++)
+      {
+        state_out[state_row + ix] = state_init[state_row + ix] + dt * tend[tend_row + ix];
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the x-direction
+//
+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+//state : fluid state including halo cells (only read here)
+//flux  : work array that receives the interface fluxes
+//tend  : receives one tendency per interior cell and variable
+void compute_tendencies_x(double *state, double *flux, double *tend)
+{
+  //Strides of the haloed state array and of the flux array
+  const int state_row = nx + 2 * hs;
+  const int state_var = (nz + 2 * hs) * state_row;
+  const int flux_row = nx + 1;
+  const int flux_var = (nz + 1) * flux_row;
+  //Compute the hyperviscosity coefficient (dt is the file-level model time step)
+  const double hv_coef = -hv_beta * dx / (16 * dt);
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+  //Compute fluxes in the x-direction for each cell
+
+  for (int kz = 0; kz < nz; kz++)
+  {
+    for (int ix = 0; ix < nx + 1; ix++)
+    {
+      double vals[NUM_VARS], d3_vals[NUM_VARS];
+
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (int var = 0; var < NUM_VARS; var++)
+      {
+        double stencil[4];
+        //Index of the first of the four consecutive x-cells feeding this interface
+        const int first = var * state_var + (kz + hs) * state_row + ix;
+        for (int s = 0; s < sten_size; s++)
+        {
+          stencil[s] = state[first + s];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[var] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
+        d3_vals[var] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      const double r = vals[ID_DENS] + hy_dens_cell[kz + hs];
+      const double u = vals[ID_UMOM] / r;
+      const double w = vals[ID_WMOM] / r;
+      const double t = (vals[ID_RHOT] + hy_dens_theta_cell[kz + hs]) / r;
+      const double p = C0 * pow((r * t), gamm);
+
+      //Compute the flux vector
+      const int face = kz * flux_row + ix;
+      flux[ID_DENS * flux_var + face] = r * u - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * flux_var + face] = r * u * u + p - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * flux_var + face] = r * u * w - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * flux_var + face] = r * u * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+  //Use the fluxes to compute tendencies for each cell
+
+  for (int var = 0; var < NUM_VARS; var++)
+  {
+    for (int kz = 0; kz < nz; kz++)
+    {
+      for (int ix = 0; ix < nx; ix++)
+      {
+        //Left and right interfaces of cell (ix, kz)
+        const int left = var * flux_var + kz * flux_row + ix;
+        const int right = left + 1;
+        tend[var * nz * nx + kz * nx + ix] = -(flux[right] - flux[left]) / dx;
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes; the z-momentum tendency additionally
+//receives the gravity source term -rho' * grav
+//state : fluid state including halo cells (only read here)
+//flux  : work array that receives the interface fluxes
+//tend  : receives one tendency per interior cell and variable
+void compute_tendencies_z(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient.
+  //The stencil below runs along z, so the coefficient is scaled by the z grid spacing dz
+  //(identical to using dx whenever dx == dz, e.g. for the default 40x20 grid)
+  hv_coef = -hv_beta * dz / (16 * dt);
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+  //Compute fluxes in the z-direction for each cell
+
+  for (k = 0; k < nz + 1; k++)
+  {
+    for (i = 0; i < nx; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        for (s = 0; s < sten_size; s++)
+        {
+          //Four vertically consecutive cells of column i (rows k .. k+3 of the haloed array)
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + s) * (nx + 2 * hs) + i + hs;
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      //p has the hydrostatic interface pressure subtracted
+      r = vals[ID_DENS] + hy_dens_int[k];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      p = C0 * pow((r * t), gamm) - hy_pressure_int[k];
+
+      //Compute the flux vector with hyperviscosity
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * u - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * w + p - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+  //Use the fluxes to compute tendencies for each cell
+
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        //Bottom (k) and top (k + 1) interfaces of cell (i, k)
+        indf1 = ll * (nz + 1) * (nx + 1) + (k) * (nx + 1) + i;
+        indf2 = ll * (nz + 1) * (nx + 1) + (k + 1) * (nx + 1) + i;
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dz;
+        if (ll == ID_WMOM)
+        {
+          //Gravity source term, using the density stored in state for this cell
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
+          tend[indt] = tend[indt] - state[inds] * grav;
+        }
+      }
+    }
+  }
+}
+
+//Set the halo values in the x-direction.
+//First the two halo columns on each side are filled periodically from the opposite side of
+//the interior. Then, on rank 0, the left halo cells whose height z lies within zlen/16 of
+//3*zlen/4 get their x-momentum and rho*theta overwritten (wind 50., potential temperature 298.).
+//state : fluid state including halo cells, modified in place
+void set_halo_values_x(double *state)
+{
+  int k, ll, ind_r, ind_u, ind_t, i, row;
+  double z;
+
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      //Offset of the first (halo) cell of interior row k for variable ll
+      row = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs);
+      //Left halo <- last two interior cells, right halo <- first two interior cells
+      state[row + 0] = state[row + nx + hs - 2];
+      state[row + 1] = state[row + nx + hs - 1];
+      state[row + nx + hs] = state[row + hs];
+      state[row + nx + hs + 1] = state[row + hs + 1];
+    }
+  }
+  ////////////////////////////////////////////////////
+
+  if (myrank == 0)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      //Height of the centre of row k; it does not depend on i, so compute it once per row
+      z = (k_beg + k + 0.5) * dz;
+      //fabs, not abs: abs can resolve to the integer function from <stdlib.h>,
+      //which would truncate the distance before the comparison
+      if (fabs(z - 3 * zlen / 4) <= zlen / 16)
+      {
+        for (i = 0; i < hs; i++)
+        {
+          ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          state[ind_u] = (state[ind_r] + hy_dens_cell[k + hs]) * 50.;
+          state[ind_t] = (state[ind_r] + hy_dens_cell[k + hs]) * 298. - hy_dens_theta_cell[k + hs];
+        }
+      }
+    }
+  }
+}
+
+//Set this task's halo values in the z-direction.
+//For every column (halo columns included) the two halo rows below and above the interior
+//are filled: z-momentum is set to zero, every other variable copies the nearest interior row.
+//state : fluid state including halo cells, modified in place
+void set_halo_values_z(double *state)
+{
+  int i, ll, var_base, row_len;
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+
+  row_len = nx + 2 * hs;
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    //Offset of the first cell of variable ll
+    var_base = ll * (nz + 2 * hs) * (nx + 2 * hs);
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      if (ll == ID_WMOM)
+      {
+        //Zero z-momentum in all four halo rows
+        state[var_base + (0) * row_len + i] = 0.;
+        state[var_base + (1) * row_len + i] = 0.;
+        state[var_base + (nz + hs) * row_len + i] = 0.;
+        state[var_base + (nz + hs + 1) * row_len + i] = 0.;
+      }
+      else
+      {
+        //Bottom halo rows copy the first interior row, top halo rows copy the last interior row
+        state[var_base + (0) * row_len + i] = state[var_base + (hs) * row_len + i];
+        state[var_base + (1) * row_len + i] = state[var_base + (hs) * row_len + i];
+        state[var_base + (nz + hs) * row_len + i] = state[var_base + (nz + hs - 1) * row_len + i];
+        state[var_base + (nz + hs + 1) * row_len + i] = state[var_base + (nz + hs - 1) * row_len + i];
+      }
+    }
+  }
+}
+
+//Allocate an array of count doubles; print a message and terminate the program if the
+//allocation fails. what names the array in the error message.
+static double *alloc_doubles_or_die(size_t count, const char *what)
+{
+  double *ptr = (double *)malloc(count * sizeof(double));
+  //malloc may legitimately return NULL for a zero-sized request, so only treat NULL as
+  //a failure when something was actually asked for
+  if (ptr == NULL && count != 0)
+  {
+    fprintf(stderr, "Error: failed to allocate %s (%zu doubles)\n", what, count);
+    exit(EXIT_FAILURE);
+  }
+  return ptr;
+}
+
+//Initialize the simulation: derive the grid spacing and local extents from nx_glob / nz_glob,
+//allocate all model arrays, choose the time step, reset the clocks, print the grid information,
+//and fill the fluid state and the hydrostatic background arrays.
+//Terminates the program if an allocation fails.
+void init()
+{
+  int i, k, ii, kk, ll, inds, i_end;
+  double x, z, r, u, w, t, hr, ht, nper;
+
+  //Set the cell grid size
+  dx = xlen / nx_glob;
+  dz = zlen / nz_glob;
+
+  nranks = 1;
+  myrank = 0;
+
+  // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0;
+
+  nper = ((double)nx_glob) / nranks;
+  i_beg = round(nper * (myrank));
+  i_end = round(nper * ((myrank) + 1)) - 1;
+  nx = i_end - i_beg + 1;
+  left_rank = myrank - 1;
+  if (left_rank == -1)
+    left_rank = nranks - 1;
+  right_rank = myrank + 1;
+  if (right_rank == nranks)
+    right_rank = 0;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+  // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+
+  k_beg = 0;
+  nz = nz_glob;
+
+  //Allocate the model data.
+  //Element counts are computed in size_t so that large grids do not overflow int arithmetic
+  const size_t state_count = (size_t)(nx + 2 * hs) * (size_t)(nz + 2 * hs) * (size_t)NUM_VARS;
+  const size_t flux_count = (size_t)(nx + 1) * (size_t)(nz + 1) * (size_t)NUM_VARS;
+  const size_t tend_count = (size_t)nx * (size_t)nz * (size_t)NUM_VARS;
+  const size_t cell_count = (size_t)(nz + 2 * hs);
+  const size_t int_count = (size_t)(nz + 1);
+
+  state = alloc_doubles_or_die(state_count, "state");
+  state_tmp = alloc_doubles_or_die(state_count, "state_tmp");
+  flux = alloc_doubles_or_die(flux_count, "flux");
+  tend = alloc_doubles_or_die(tend_count, "tend");
+  hy_dens_cell = alloc_doubles_or_die(cell_count, "hy_dens_cell");
+  hy_dens_theta_cell = alloc_doubles_or_die(cell_count, "hy_dens_theta_cell");
+  hy_dens_int = alloc_doubles_or_die(int_count, "hy_dens_int");
+  hy_dens_theta_int = alloc_doubles_or_die(int_count, "hy_dens_theta_int");
+  hy_pressure_int = alloc_doubles_or_die(int_count, "hy_pressure_int");
+
+  //Define the maximum stable time step based on an assumed maximum wind speed
+  dt = dmin(dx, dz) / max_speed * cfl;
+  //Set initial elapsed model time and output_counter to zero
+  etime = 0.;
+  output_counter = 0.;
+
+  // Display grid information
+
+  printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
+  printf("dx,dz: %lf %lf\n", dx, dz);
+  printf("dt: %lf\n", dt);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
+  //////////////////////////////////////////////////////////////////////////
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      //Initialize the state to zero
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state[inds] = 0.;
+      }
+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
+      for (kk = 0; kk < nqpoints; kk++)
+      {
+        for (ii = 0; ii < nqpoints; ii++)
+        {
+          //Compute the x,z location within the global domain based on cell and quadrature index
+          x = (i_beg + i - hs + 0.5) * dx + (qpoints[ii] - 0.5) * dx;
+          z = (k_beg + k - hs + 0.5) * dz + (qpoints[kk] - 0.5) * dz;
+
+          //Set the fluid state based on the user's specification (default is injection in this example)
+          injection(x, z, r, u, w, t, hr, ht);
+
+          //Store into the fluid state array
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + r * qweights[ii] * qweights[kk];
+          inds = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * u * qweights[ii] * qweights[kk];
+          inds = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * w * qweights[ii] * qweights[kk];
+          inds = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk];
+        }
+      }
+      //Start state_tmp as a copy of the initial state
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state_tmp[inds] = state[inds];
+      }
+    }
+  }
+  //Compute the hydrostatic background state over vertical cell averages
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    hy_dens_cell[k] = 0.;
+    hy_dens_theta_cell[k] = 0.;
+    for (kk = 0; kk < nqpoints; kk++)
+    {
+      //Note: the sample height is the cell centre for every quadrature point
+      z = (k_beg + k - hs + 0.5) * dz;
+
+      //Set the fluid state based on the user's specification (default is injection in this example)
+      injection(0., z, r, u, w, t, hr, ht);
+
+      hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr * ht * qweights[kk];
+    }
+  }
+  //Compute the hydrostatic background state at vertical cell interfaces
+  for (k = 0; k < nz + 1; k++)
+  {
+    z = (k_beg + k) * dz;
+
+    //Set the fluid state based on the user's specification (default is injection in this example)
+    injection(0., z, r, u, w, t, hr, ht);
+
+    hy_dens_int[k] = hr;
+    hy_dens_theta_int[k] = hr * ht;
+    hy_pressure_int[k] = C0 * pow((hr * ht), gamm);
+  }
+}
+
+//This test case is initially balanced but injects fast, cold air from the left boundary near the model top
+//(the injection itself is applied in set_halo_values_x; this routine only supplies the balanced initial state)
+//x and z are input coordinates at which to sample (x is not used)
+//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location; all are set to zero
+//hr and ht are output background hydrostatic density and potential temperature at that location
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht)
+{
+  //Background state at height z
+  hydro_const_theta(z, hr, ht);
+
+  //No perturbation on top of the background
+  r = 0.;
+  u = 0.;
+  w = 0.;
+  t = 0.;
+}
+
+//Establish hydrostatic balance using constant potential temperature (thermally neutral atmosphere)
+//z is the input coordinate
+//r and t are the output background hydrostatic density and potential temperature
+void hydro_const_theta(double z, double &r, double &t)
+{
+  const double theta0 = 300.; //Background potential temperature
+  const double exner0 = 1.;   //Surface-level Exner pressure
+
+  //Potential temperature is the same at every height
+  t = theta0;
+
+  //Establish hydrostatic balance first using Exner pressure
+  const double exner = exner0 - grav * z / (cp * theta0); //Exner pressure at z
+  const double p = p0 * pow(exner, (cp / rd));            //Pressure at z
+  const double rt = pow((p / C0), (1. / gamm));           //rho*theta at z
+
+  //Density at z
+  r = rt / t;
+}
+
+//Release every array allocated by init(), in the order they were allocated
+void finalize()
+{
+  double *const buffers[] = {state, state_tmp, flux, tend,
+                             hy_dens_cell, hy_dens_theta_cell,
+                             hy_dens_int, hy_dens_theta_int, hy_pressure_int};
+  const int num_buffers = (int)(sizeof(buffers) / sizeof(buffers[0]));
+
+  for (int b = 0; b < num_buffers; b++)
+  {
+    free(buffers[b]);
+  }
+}

+ 11 - 0
hpc/miniprofiler/English/C/source_code/lab2/Makefile

@@ -0,0 +1,11 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
+
+# Compiler and flags used to build the OpenACC miniWeather lab
+CC := pgc++
+CFLAGS := -O3 -w
+ACCFLAGS := -ta=tesla:managed -Minfo=accel
+
+# "clean" does not produce a file, so mark it phony: it then runs even if a
+# file named "clean" exists in this directory
+.PHONY: clean
+
+miniWeather: miniWeather_openacc.cpp
+	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp 
+
+clean:
+	rm -f *.o miniWeather

+ 645 - 0
hpc/miniprofiler/English/C/source_code/lab2/miniWeather_openacc.cpp

@@ -0,0 +1,645 @@
+//////////////////////////////////////////////////////////////////////////////////////////
+// miniWeather
+// Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
+// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
+// For documentation, please see the attached documentation in the "documentation" folder
+//////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+** Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory.  All rights reserved.
+**
+** Portions Copyright (c) 2020, NVIDIA Corporation.  All rights reserved.
+*/
+
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <nvtx3/nvToolsExt.h>
+
+const double pi = 3.14159265358979323846264338327;   //Pi
+const double grav = 9.8;                             //Gravitational acceleration (m / s^2)
+const double cp = 1004.;                             //Specific heat of dry air at constant pressure
+const double rd = 287.;                              //Dry air constant for equation of state (P=rho*rd*T)
+const double p0 = 1.e5;                              //Standard pressure at the surface in Pascals
+const double C0 = 27.5629410929725921310572974482;   //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma)
+const double gamm = 1.40027894002789400278940027894; //gamma = cp/(cp-rd) = 1004/717; spelled "gamm" because the name "gamma" is already taken (presumably by the math library)
+//Define domain and stability-related constants
+const double xlen = 2.e4;     //Length of the domain in the x-direction (meters)
+const double zlen = 1.e4;     //Length of the domain in the z-direction (meters)
+const double hv_beta = 0.25;  //How strong to diffuse the solution: hv_beta \in [0:1]
+const double cfl = 1.50;      //"Courant, Friedrichs, Lewy" number (for numerical stability)
+const double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
+const int hs = 2;             //"Halo" size: number of cells needed for a full "stencil" of information for reconstruction
+const int sten_size = 4;      //Size of the stencil used for interpolation
+
+//Parameters for indexing and flags
+const int NUM_VARS = 4; //Number of fluid state variables
+const int ID_DENS = 0;  //index for density ("rho")
+const int ID_UMOM = 1;  //index for momentum in the x-direction ("rho * u")
+const int ID_WMOM = 2;  //index for momentum in the z-direction ("rho * w")
+const int ID_RHOT = 3;  //index for density * potential temperature ("rho * theta")
+const int DIR_X = 1;    //Integer constant to express that this operation is in the x-direction
+const int DIR_Z = 2;    //Integer constant to express that this operation is in the z-direction
+
+const int nqpoints = 3; //Number of quadrature points per direction used by init()
+double qpoints[] = {0.112701665379258311482073460022E0, 0.500000000000000000000000000000E0, 0.887298334620741688517926539980E0}; //Gauss-Legendre quadrature points as fractions of a cell width, in [0,1]
+double qweights[] = {0.277777777777777777777777777779E0, 0.444444444444444444444444444444E0, 0.277777777777777777777777777779E0}; //Matching quadrature weights (5/18, 8/18, 5/18); they sum to 1
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are set up in main()/init() and then stay fixed during the simulation (except dt, which main() may shorten for the final step)
+///////////////////////////////////////////////////////////////////////////////////////
+double sim_time;            //total simulation time in seconds
+double output_freq;         //frequency to perform output in seconds
+double dt;                  //Model time step (seconds)
+int nx, nz;                 //Number of local grid cells in the x- and z- dimensions
+double dx, dz;              //Grid space length in x- and z-dimension (meters)
+int nx_glob, nz_glob;       //Number of total grid cells in the x- and z- dimensions
+int i_beg, k_beg;           //beginning index in the x- and z-directions
+int nranks, myrank;         //Number of ranks and this rank's id (init() sets them to 1 and 0)
+int left_rank, right_rank;  //Rank IDs that exist to my left and right in the global domain
+double *hy_dens_cell;       //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+double *hy_dens_int;        //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+double *hy_dens_theta_int;  //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+double *hy_pressure_int;    //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are dynamic (change) over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double etime;          //Elapsed model time
+double output_counter; //Helps determine when it's time to do output
+//Runtime variable arrays
+double *state;     //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *state_tmp; //Intermediate fluid state used between Runge-Kutta stages. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *flux;      //Cell interface fluxes.   Dimensions: (nx+1,nz+1,NUM_VARS)
+double *tend;      //Fluid state tendencies.  Dimensions: (nx,nz,NUM_VARS)
+int num_out = 0;   //The number of outputs performed so far
+int direction_switch = 1; //Nonzero: perform_timestep() sweeps x then z; zero: z then x. Toggled after every time step
+
+//Smaller of two doubles. When a < b is false (equal values, or either operand is NaN) b is returned.
+double dmin(double a, double b)
+{
+  return (a < b) ? a : b;
+}
+
+//Forward declarations of the functions defined after "main"
+void init();
+void finalize();
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht);
+void hydro_const_theta(double z, double &r, double &t);
+void output(double *state, double etime); //Declared only: this file has no definition, and its calls in main() are commented out
+void ncwrap(int ierr, int line);          //Declared only: this file has no definition and no call
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt);
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend);
+void compute_tendencies_x(double *state, double *flux, double *tend);
+void compute_tendencies_z(double *state, double *flux, double *tend);
+void set_halo_values_x(double *state);
+void set_halo_values_z(double *state);
+
+///////////////////////////////////////////////////////////////////////////////////////
+// THE MAIN PROGRAM STARTS HERE
+///////////////////////////////////////////////////////////////////////////////////////
+//Usage: <program> [nx_glob nz_glob sim_time]
+//With exactly three arguments they replace the default grid size and simulated time;
+//with any other argument count the built-in defaults are used.
+//Returns 0 after a completed run, 1 if a supplied grid dimension is not a positive integer.
+//NVTX ranges "Total", "while" and "perform_timestep" bracket the run, the time loop and each step.
+int main(int argc, char **argv)
+{
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // BEGIN USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //The x-direction length is twice as long as the z-direction length
+  //So, you'll want to have nx_glob be twice as large as nz_glob
+  nx_glob = 40;      //Number of total cells in the x-direction
+  nz_glob = 20;      //Number of total cells in the z-direction
+  sim_time = 1000;   //How many seconds to run the simulation
+  output_freq = 100; //How frequently to output data to file (in seconds)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // END USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+
+  if (argc == 4)
+  {
+    printf("The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]);
+    nx_glob = atoi(argv[1]);
+    nz_glob = atoi(argv[2]);
+    sim_time = atoi(argv[3]);
+    //atoi returns 0 for non-numeric text. A zero or negative grid size would make init()
+    //divide by zero and request nonsensical allocations, so stop before any work is done.
+    if (nx_glob <= 0 || nz_glob <= 0)
+    {
+      fprintf(stderr, "Invalid grid size \"%s\" x \"%s\": nx_glob and nz_glob must be positive integers\n", argv[1], argv[2]);
+      return 1;
+    }
+  }
+  else
+  {
+    printf("Using default values ...\n");
+  }
+
+  nvtxRangePushA("Total");
+  init();
+
+  //Output the initial state
+  //output(state, etime);
+
+  ////////////////////////////////////////////////////
+  // MAIN TIME STEP LOOP
+  ////////////////////////////////////////////////////
+
+  nvtxRangePushA("while");
+  while (etime < sim_time)
+  {
+    //If the time step leads to exceeding the simulation time, shorten it for the last step
+    if (etime + dt > sim_time)
+    {
+      dt = sim_time - etime;
+    }
+
+    //Perform a single time step
+    nvtxRangePushA("perform_timestep");
+    perform_timestep(state, state_tmp, flux, tend, dt);
+    nvtxRangePop();
+
+    //Inform the user (the time printed is the one at the start of the step just taken)
+
+    printf("Elapsed Time: %lf / %lf\n", etime, sim_time);
+
+    //Update the elapsed time and output counter
+    etime = etime + dt;
+    output_counter = output_counter + dt;
+    //If it's time for output, reset the counter, and do output
+
+    if (output_counter >= output_freq)
+    {
+      output_counter = output_counter - output_freq;
+
+      //output(state, etime);
+    }
+  }
+  nvtxRangePop();
+
+  finalize();
+  nvtxRangePop();
+  return 0;
+}
+
+//Advance the model by one time step of length dt.
+//The step is dimensionally split: a complete low-storage three-stage Runge-Kutta update is carried out in one
+//direction and then in the other, and the order of the two directions flips on every call
+//(second-order-accurate alternating Strang splitting).
+//The Runge-Kutta stages applied in each direction are:
+// q*     = q[n] + dt/3 * rhs(q[n])
+// q**    = q[n] + dt/2 * rhs(q*  )
+// q[n+1] = q[n] + dt/1 * rhs(q** )
+//state holds q[n] on entry and q[n+1] on return; state_tmp carries q* and q**; flux and tend are overwritten.
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt)
+{
+  //direction_switch != 0 : x sweep, then z sweep.  direction_switch == 0 : z sweep, then x sweep.
+  const int sweep_order[2] = {direction_switch ? DIR_X : DIR_Z,
+                              direction_switch ? DIR_Z : DIR_X};
+  for (int sweep = 0; sweep < 2; sweep++)
+  {
+    const int dir = sweep_order[sweep];
+    semi_discrete_step(state, state, state_tmp, dt / 3, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state, dt / 1, dir, flux, tend);
+  }
+  //Flip the sweep order for the next call
+  direction_switch = direction_switch ? 0 : 1;
+}
+
+//Perform a single semi-discretized step in time with the form:
+//state_out = state_init + dt * rhs(state_forcing)
+//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out. The halo cells of state_forcing are refreshed first, and flux and tend are overwritten. Only interior cells of state_out are written. If dir is neither DIR_X nor DIR_Z no tendencies are computed and tend is applied as it stands.
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend)
+{
+  int i, k, ll, inds, indt;
+  if (dir == DIR_X)
+  {
+    //Set the halo values  in the x-direction
+    set_halo_values_x(state_forcing);
+    //Compute the time tendencies for the fluid state in the x-direction
+    compute_tendencies_x(state_forcing, flux, tend);
+  }
+  else if (dir == DIR_Z)
+  {
+    //Set the halo values  in the z-direction
+    set_halo_values_z(state_forcing);
+    //Compute the time tendencies for the fluid state in the z-direction
+    compute_tendencies_z(state_forcing, flux, tend);
+  }
+
+/////////////////////////////////////////////////
+// OpenACC: the directive is attached to the outer ll loop only (no collapse clause);
+// default(present) declares that the arrays referenced are already available on the device
+//Apply the tendencies to the fluid state
+#pragma acc parallel loop private(inds, indt) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs; //index of interior cell (i,k) in the halo-padded state arrays
+        indt = ll * nz * nx + k * nx + i; //index of the same cell in the unpadded tendency array
+        state_out[inds] = state_init[inds] + dt * tend[indt];
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the x-direction
+
+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes. state is only read (its x halos must already be set); flux and tend are overwritten.
+void compute_tendencies_x(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient (dt here is the global model time step; this function takes no dt argument)
+  hv_coef = -hv_beta * dx / (16 * dt);
+  /////////////////////////////////////////////////
+  // OpenACC: the directive is attached to the k loop; each iteration gets private copies of the scalars and small work arrays
+  /////////////////////////////////////////////////
+  //Compute fluxes in the x-direction at the nx+1 cell interfaces of every row
+#pragma acc parallel loop private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p)
+  for (k = 0; k < nz; k++)
+  {
+    for (i = 0; i < nx + 1; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        for (s = 0; s < sten_size; s++)
+        {
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + s; //padded columns i..i+3, i.e. the two cells on either side of interface i
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      r = vals[ID_DENS] + hy_dens_cell[k + hs];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_cell[k + hs]) / r;
+      p = C0 * pow((r * t), gamm);
+
+      //Compute the flux vector
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * u + p - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * w - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+/////////////////////////////////////////////////
+// OpenACC: the directive is attached to the outer ll loop only
+/////////////////////////////////////////////////
+//Use the fluxes to compute tendencies for each cell: minus the flux difference across the cell, divided by dx
+#pragma acc parallel loop private(indt, indf1, indf2)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i; //flux at the left interface of cell i
+        indf2 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i + 1; //flux at the right interface of cell i
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dx;
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//
+//First, compute the flux vector at each of the nz+1 cell interfaces in the z-direction (including hyperviscosity)
+//Then, difference those fluxes to get the tendency of every cell and add the gravity source term to the w-momentum
+//state : fluid state, only read; its z-direction halos must already be set
+//flux  : overwritten with the interface fluxes
+//tend  : overwritten with the tendencies of the nx * nz interior cells for all NUM_VARS variables
+void compute_tendencies_z(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient. The third derivative it multiplies is taken along z, so it is
+  //scaled with the vertical spacing dz (same value as before whenever dx == dz, as in the default grid).
+  //dt is the global model time step; this function takes no dt argument.
+  hv_coef = -hv_beta * dz / (16 * dt);
+  /////////////////////////////////////////////////
+  // OpenACC: the directive is attached to the k loop; scalars and small work arrays are private
+  /////////////////////////////////////////////////
+  //Compute fluxes in the z-direction at the nz+1 cell interfaces of every column
+#pragma acc parallel loop private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p)
+  for (k = 0; k < nz + 1; k++)
+  {
+    for (i = 0; i < nx; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        for (s = 0; s < sten_size; s++)
+        {
+          //Padded rows k..k+3, i.e. the two cells below and the two cells above interface k
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + s) * (nx + 2 * hs) + i + hs;
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      //The pressure has the hydrostatic interface pressure subtracted, so p is a perturbation
+      r = vals[ID_DENS] + hy_dens_int[k];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      p = C0 * pow((r * t), gamm) - hy_pressure_int[k];
+
+      //Compute the flux vector with hyperviscosity
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * u - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * w + p - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+  /////////////////////////////////////////////////
+  // OpenACC: the directive is attached to the outer ll loop only.
+  // inds is written inside the loop, so it is listed as private together with the other index temporaries.
+  /////////////////////////////////////////////////
+  //Use the fluxes to compute tendencies for each cell
+#pragma acc parallel loop private(inds, indt, indf1, indf2)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + (k) * (nx + 1) + i;     //flux at the lower interface of cell k
+        indf2 = ll * (nz + 1) * (nx + 1) + (k + 1) * (nx + 1) + i; //flux at the upper interface of cell k
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dz;
+        //Gravity acts on the density perturbation and only enters the w-momentum equation
+        if (ll == ID_WMOM)
+        {
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
+          tend[indt] = tend[indt] - state[inds] * grav;
+        }
+      }
+    }
+  }
+}
+
+//Fill the x-direction halo cells of state for the interior rows.
+//1) Periodic wrap for every variable: the two left halo columns receive the last two interior columns,
+//   and the two right halo columns receive the first two interior columns.
+//2) Injection forcing, on rank 0 only: for rows whose cell-centre height lies within zlen/16 of 3*zlen/4,
+//   the left halo columns get x-momentum = total density * 50 and a rho*theta perturbation equal to
+//   total density * 298 minus the hydrostatic background rho*theta.
+void set_halo_values_x(double *state)
+{
+  int k, ll, ind_r, ind_u, ind_t, i;
+  double z;
+
+#pragma acc parallel loop
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + 0] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs - 2];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + 1] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs - 1];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + hs];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs + 1] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + hs + 1];
+    }
+  }
+  ////////////////////////////////////////////////////
+
+  if (myrank == 0)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      //The height of the row does not depend on i, so it is computed and tested once per row
+      z = (k_beg + k + 0.5) * dz;
+      //fabs, not abs: depending on which declarations the headers expose, abs() on a double can bind to
+      //int abs(int) and silently truncate the distance before the comparison
+      if (fabs(z - 3 * zlen / 4) <= zlen / 16)
+      {
+        for (i = 0; i < hs; i++)
+        {
+          ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          state[ind_u] = (state[ind_r] + hy_dens_cell[k + hs]) * 50.;
+          state[ind_t] = (state[ind_r] + hy_dens_cell[k + hs]) * 298. - hy_dens_theta_cell[k + hs];
+        }
+      }
+    }
+  }
+}
+
+//Set the halo values of state in the z-direction, for every column including the x halos.
+//w-momentum is set to zero in the two bottom and two top halo rows.
+//Every other variable copies the nearest interior row: the bottom halo rows take the first interior row,
+//the top halo rows take the last interior row.
+void set_halo_values_z(double *state)
+{
+  int i, ll;
+  /////////////////////////////////////////////////
+  // OpenACC: the directive is attached to the outer ll loop only
+  /////////////////////////////////////////////////
+#pragma acc parallel loop
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      if (ll == ID_WMOM)
+      {
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (0) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (1) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs + 1) * (nx + 2 * hs) + i] = 0.;
+      }
+      else
+      {
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (0) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (hs) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (1) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (hs) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs - 1) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs + 1) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs - 1) * (nx + 2 * hs) + i];
+      }
+    }
+  }
+}
+
+//Set up the grid, allocate the model arrays, and fill in the initial fluid state and the hydrostatic background.
+//Reads nx_glob and nz_glob; sets dx, dz, dt, nx, nz, i_beg, k_beg, the rank bookkeeping, etime and output_counter.
+//Prints the grid information to stdout.
+//If any allocation fails, an error is written to stderr and the program exits with status 1.
+void init()
+{
+  int i, k, ii, kk, ll, inds, i_end;
+  double x, z, r, u, w, t, hr, ht, nper;
+
+  //Set the cell grid size
+  dx = xlen / nx_glob;
+  dz = zlen / nz_glob;
+
+  //Single-process run: one rank that owns the whole domain
+  nranks = 1;
+  myrank = 0;
+
+  // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0;
+
+  nper = ((double)nx_glob) / nranks;
+  i_beg = round(nper * (myrank));
+  i_end = round(nper * ((myrank) + 1)) - 1;
+  nx = i_end - i_beg + 1;
+  left_rank = myrank - 1;
+  if (left_rank == -1)
+    left_rank = nranks - 1;
+  right_rank = myrank + 1;
+  if (right_rank == nranks)
+    right_rank = 0;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+  // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+
+  k_beg = 0;
+  nz = nz_glob;
+
+  //Allocate the model data.
+  //The element counts are formed in size_t so the byte counts are not computed in int arithmetic.
+  const size_t n_state = (size_t)(nx + 2 * hs) * (nz + 2 * hs) * NUM_VARS;
+  const size_t n_flux = (size_t)(nx + 1) * (nz + 1) * NUM_VARS;
+  const size_t n_tend = (size_t)nx * nz * NUM_VARS;
+  const size_t n_cell = (size_t)(nz + 2 * hs);
+  const size_t n_int = (size_t)(nz + 1);
+  state = (double *)malloc(n_state * sizeof(double));
+  state_tmp = (double *)malloc(n_state * sizeof(double));
+  flux = (double *)malloc(n_flux * sizeof(double));
+  tend = (double *)malloc(n_tend * sizeof(double));
+  hy_dens_cell = (double *)malloc(n_cell * sizeof(double));
+  hy_dens_theta_cell = (double *)malloc(n_cell * sizeof(double));
+  hy_dens_int = (double *)malloc(n_int * sizeof(double));
+  hy_dens_theta_int = (double *)malloc(n_int * sizeof(double));
+  hy_pressure_int = (double *)malloc(n_int * sizeof(double));
+  //Everything below writes through these pointers, so stop here rather than dereference NULL
+  if (!state || !state_tmp || !flux || !tend || !hy_dens_cell || !hy_dens_theta_cell ||
+      !hy_dens_int || !hy_dens_theta_int || !hy_pressure_int)
+  {
+    fprintf(stderr, "init: failed to allocate the model arrays for a %d x %d grid\n", nx, nz);
+    exit(1);
+  }
+
+  //Define the maximum stable time step based on an assumed maximum wind speed
+  dt = dmin(dx, dz) / max_speed * cfl;
+  //Set initial elapsed model time and output_counter to zero
+  etime = 0.;
+  output_counter = 0.;
+
+  // Display grid information
+
+  printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
+  printf("dx,dz: %lf %lf\n", dx, dz);
+  printf("dt: %lf\n", dt);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
+  //////////////////////////////////////////////////////////////////////////
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      //Initialize the state to zero
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state[inds] = 0.;
+      }
+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
+      for (kk = 0; kk < nqpoints; kk++)
+      {
+        for (ii = 0; ii < nqpoints; ii++)
+        {
+          //Compute the x,z location within the global domain based on cell and quadrature index
+          x = (i_beg + i - hs + 0.5) * dx + (qpoints[ii] - 0.5) * dx;
+          z = (k_beg + k - hs + 0.5) * dz + (qpoints[kk] - 0.5) * dz;
+
+          //Set the fluid state based on the user's specification (default is injection in this example)
+          injection(x, z, r, u, w, t, hr, ht);
+
+          //Store into the fluid state array
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + r * qweights[ii] * qweights[kk];
+          inds = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * u * qweights[ii] * qweights[kk];
+          inds = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * w * qweights[ii] * qweights[kk];
+          inds = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk];
+        }
+      }
+      //Start the intermediate state as a copy of the initial state
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state_tmp[inds] = state[inds];
+      }
+    }
+  }
+  //Compute the hydrostatic background state over vertical cell averages
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    hy_dens_cell[k] = 0.;
+    hy_dens_theta_cell[k] = 0.;
+    for (kk = 0; kk < nqpoints; kk++)
+    {
+      //Note: the background is sampled at the cell centre for every quadrature point
+      z = (k_beg + k - hs + 0.5) * dz;
+
+      //Set the fluid state based on the user's specification (default is injection in this example)
+      injection(0., z, r, u, w, t, hr, ht);
+
+      hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr * ht * qweights[kk];
+    }
+  }
+  //Compute the hydrostatic background state at vertical cell interfaces
+  for (k = 0; k < nz + 1; k++)
+  {
+    z = (k_beg + k) * dz;
+
+    //Set the fluid state based on the user's specification (default is injection in this example)
+    injection(0., z, r, u, w, t, hr, ht);
+
+    hy_dens_int[k] = hr;
+    hy_dens_theta_int[k] = hr * ht;
+    hy_pressure_int[k] = C0 * pow((hr * ht), gamm);
+  }
+}
+
+//Sample the initial condition of the "injection" test case at one point.
+//x and z are the input coordinates; x is accepted for interface uniformity but is never read.
+//r,u,w,t are the output density, u-wind, w-wind, and potential temperature there: all four are set to zero.
+//hr and ht are the output background hydrostatic density and potential temperature, taken from hydro_const_theta(z).
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht)
+{
+  //Background state first; it depends on height only
+  hydro_const_theta(z, hr, ht);
+  //Then clear all four sampled fields
+  r = t = u = w = 0.;
+}
+
+//Hydrostatically balanced background for a constant potential temperature (thermally neutral) atmosphere
+//z is the input height
+//r and t are the output background density and potential temperature at that height
+void hydro_const_theta(double z, double &r, double &t)
+{
+  const double theta0 = 300.; //Background potential temperature
+  const double exner0 = 1.;   //Surface-level Exner pressure
+  //Potential temperature is the same at every height
+  t = theta0;
+  //Exner pressure falls off linearly with z
+  const double exner_z = exner0 - grav * z / (cp * theta0);
+  //Exner pressure -> pressure
+  const double pressure_z = p0 * pow(exner_z, (cp / rd));
+  //Pressure -> rho*theta, inverting P = C0*(rho*theta)**gamma
+  const double rho_theta_z = pow((pressure_z / C0), (1. / gamm));
+  //Divide out the potential temperature to get density
+  r = rho_theta_z / t;
+}
+
+//Release every model array allocated by init() and reset each pointer to NULL.
+//The pointers are file-scope globals that outlive this call; nulling them means a second
+//call (free(NULL) is a no-op) or a stray later access cannot touch freed memory.
+void finalize()
+{
+  double **const owned[] = {&state, &state_tmp, &flux, &tend,
+                            &hy_dens_cell, &hy_dens_theta_cell,
+                            &hy_dens_int, &hy_dens_theta_int, &hy_pressure_int};
+  for (size_t n = 0; n < sizeof(owned) / sizeof(owned[0]); n++)
+  {
+    free(*owned[n]);
+    *owned[n] = NULL;
+  }
+}

+ 11 - 0
hpc/miniprofiler/English/C/source_code/lab3/Makefile

@@ -0,0 +1,11 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
+
+# PGI C++ compiler; -w suppresses warning messages
+CC := pgc++
+CFLAGS := -O3 -w
+# OpenACC offload to NVIDIA GPUs using CUDA managed memory; -Minfo=accel prints accelerator compile feedback
+ACCFLAGS := -ta=tesla:managed -Minfo=accel
+
+# "clean" does not name a file, so it must run even if a file called "clean" exists
+.PHONY: clean
+
+miniWeather: miniWeather_openacc.cpp
+	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp 
+
+clean:
+	rm -f *.o miniWeather

+ 645 - 0
hpc/miniprofiler/English/C/source_code/lab3/miniWeather_openacc.cpp

@@ -0,0 +1,645 @@
+//////////////////////////////////////////////////////////////////////////////////////////
+// miniWeather
+// Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
+// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
+// For documentation, please see the attached documentation in the "documentation" folder
+//////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+** Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory.  All rights reserved.
+**
+** Portions Copyright (c) 2020, NVIDIA Corporation.  All rights reserved.
+*/
+
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <nvtx3/nvToolsExt.h>
+
+const double pi = 3.14159265358979323846264338327;   //Pi
+const double grav = 9.8;                             //Gravitational acceleration (m / s^2)
+const double cp = 1004.;                             //Specific heat of dry air at constant pressure
+const double rd = 287.;                              //Dry air constant for equation of state (P=rho*rd*T)
+const double p0 = 1.e5;                              //Standard pressure at the surface in Pascals
+const double C0 = 27.5629410929725921310572974482;   //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma)
+const double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much)
+//Define domain and stability-related constants
+const double xlen = 2.e4;     //Length of the domain in the x-direction (meters)
+const double zlen = 1.e4;     //Length of the domain in the z-direction (meters)
+const double hv_beta = 0.25;  //How strong to diffuse the solution: hv_beta \in [0:1]
+const double cfl = 1.50;      //"Courant, Friedrichs, Lewy" number (for numerical stability)
+const double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
+const int hs = 2;             //"Halo" size: number of cells needed for a full "stencil" of information for reconstruction
+const int sten_size = 4;      //Size of the stencil used for interpolation
+
+//Parameters for indexing and flags
+const int NUM_VARS = 4; //Number of fluid state variables
+const int ID_DENS = 0;  //index for density ("rho")
+const int ID_UMOM = 1;  //index for momentum in the x-direction ("rho * u")
+const int ID_WMOM = 2;  //index for momentum in the z-direction ("rho * w")
+const int ID_RHOT = 3;  //index for density * potential temperature ("rho * theta")
+const int DIR_X = 1;    //Integer constant to express that this operation is in the x-direction
+const int DIR_Z = 2;    //Integer constant to express that this operation is in the z-direction
+
+const int nqpoints = 3;
+double qpoints[] = {0.112701665379258311482073460022E0, 0.500000000000000000000000000000E0, 0.887298334620741688517926539980E0};
+double qweights[] = {0.277777777777777777777777777779E0, 0.444444444444444444444444444444E0, 0.277777777777777777777777777779E0};
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are initialized but remain static over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double sim_time;            //total simulation time in seconds
+double output_freq;         //frequency to perform output in seconds
+double dt;                  //Model time step (seconds)
+int nx, nz;                 //Number of local grid cells in the x- and z- dimensions
+double dx, dz;              //Grid space length in x- and z-dimension (meters)
+int nx_glob, nz_glob;       //Number of total grid cells in the x- and z- dimensions
+int i_beg, k_beg;           //beginning index in the x- and z-directions
+int nranks, myrank;         //my rank id
+int left_rank, right_rank;  //Rank IDs that exist to my left and right in the global domain
+double *hy_dens_cell;       //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+double *hy_dens_int;        //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+double *hy_dens_theta_int;  //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+double *hy_pressure_int;    //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are dynamics over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double etime;          //Elapsed model time
+double output_counter; //Helps determine when it's time to do output
+//Runtime variable arrays
+double *state;     //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *state_tmp; //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *flux;      //Cell interface fluxes.   Dimensions: (nx+1,nz+1,NUM_VARS)
+double *tend;      //Fluid state tendencies.  Dimensions: (nx,nz,NUM_VARS)
+int num_out = 0;   //The number of outputs performed so far
+int direction_switch = 1;
+
+//Minimum of two doubles; b is returned whenever a is not strictly smaller than b
+double dmin(double a, double b)
+{
+  return (a < b) ? a : b;
+}
+
+//Declaring the functions defined after "main"
+void init();
+void finalize();
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht);
+void hydro_const_theta(double z, double &r, double &t);
+void output(double *state, double etime);
+void ncwrap(int ierr, int line);
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt);
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend);
+void compute_tendencies_x(double *state, double *flux, double *tend);
+void compute_tendencies_z(double *state, double *flux, double *tend);
+void set_halo_values_x(double *state);
+void set_halo_values_z(double *state);
+
+///////////////////////////////////////////////////////////////////////////////////////
+// THE MAIN PROGRAM STARTS HERE
+///////////////////////////////////////////////////////////////////////////////////////
+//Program entry point: configure the run, initialize the model, advance it to sim_time, release memory.
+//With exactly three command-line arguments they are read as nx_glob, nz_glob and sim_time (integers);
+//with any other argument count the defaults below are used.
+//Returns 1 if a supplied grid size is not positive, 0 otherwise.
+int main(int argc, char **argv)
+{
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // BEGIN USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //The x-direction length is twice as long as the z-direction length
+  //So, you'll want to have nx_glob be twice as large as nz_glob
+  nx_glob = 40;      //Number of total cells in the x-direction
+  nz_glob = 20;      //Number of total cells in the z-direction
+  sim_time = 1000;   //How many seconds to run the simulation
+  output_freq = 100; //How frequently to output data to file (in seconds)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // END USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+
+  if (argc == 4)
+  {
+    printf("The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]);
+    nx_glob = atoi(argv[1]);
+    nz_glob = atoi(argv[2]);
+    sim_time = atoi(argv[3]);
+    //atoi yields 0 for non-numeric text; a zero or negative grid size would later be used as a
+    //divisor for the grid spacing and as an allocation size, so refuse it here
+    if (nx_glob <= 0 || nz_glob <= 0)
+    {
+      fprintf(stderr, "Error: nx_glob and nz_glob must be positive integers\n");
+      return 1;
+    }
+  }
+  else
+  {
+    printf("Using default values ...\n");
+  }
+
+  //NVTX ranges mark the phases of the run for the profiler; every push is matched by a pop below
+  nvtxRangePushA("Total");
+  init();
+
+  //Output of the initial state is disabled in this version
+  //output(state, etime);
+
+  ////////////////////////////////////////////////////
+  // MAIN TIME STEP LOOP
+  ////////////////////////////////////////////////////
+
+  nvtxRangePushA("while");
+  while (etime < sim_time)
+  {
+    //If the time step leads to exceeding the simulation time, shorten it for the last step
+    if (etime + dt > sim_time)
+    {
+      dt = sim_time - etime;
+    }
+
+    //Perform a single time step
+    nvtxRangePushA("perform_timestep");
+    perform_timestep(state, state_tmp, flux, tend, dt);
+    nvtxRangePop();
+
+    //Inform the user (the time printed is the one at the start of the step just taken)
+
+    printf("Elapsed Time: %lf / %lf\n", etime, sim_time);
+
+    //Update the elapsed time and output counter
+    etime = etime + dt;
+    output_counter = output_counter + dt;
+    //If it's time for output, reset the counter (the output call itself is disabled)
+
+    if (output_counter >= output_freq)
+    {
+      output_counter = output_counter - output_freq;
+
+      //output(state, etime);
+    }
+  }
+  nvtxRangePop();
+
+  finalize();
+  nvtxRangePop();
+  return 0;
+}
+
+//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator
+//The order of the two directional sweeps is alternated each time step (tracked by direction_switch).
+//The Runge-Kutta method applied to each direction is:
+// q*     = q[n] + dt/3 * rhs(q[n])
+// q**    = q[n] + dt/2 * rhs(q*  )
+// q[n+1] = q[n] + dt/1 * rhs(q** )
+//state holds q[n] on entry and q[n+1] on exit; state_tmp holds the intermediate stages
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt)
+{
+  //x then z when direction_switch is set, z then x otherwise
+  const int sweep_order[2] = {direction_switch ? DIR_X : DIR_Z,
+                              direction_switch ? DIR_Z : DIR_X};
+  for (int n = 0; n < 2; n++)
+  {
+    const int dir = sweep_order[n];
+    semi_discrete_step(state, state, state_tmp, dt / 3, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state, dt / 1, dir, flux, tend);
+  }
+  //Flip the sweep order for the next time step
+  direction_switch = !direction_switch;
+}
+
+//Advance one semi-discrete stage: state_out = state_init + dt * rhs(state_forcing)
+//dir selects the sweep (DIR_X or DIR_Z); for any other value no halo/tendency update is done and tend is used as it stands
+//flux and tend are work arrays handed to compute_tendencies_x / compute_tendencies_z; state_forcing gets its halo cells refreshed
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend)
+{
+  int i, k, ll, inds, indt;
+  if (dir == DIR_X)
+  {
+    //Set the halo values in the x-direction
+    set_halo_values_x(state_forcing);
+    //Compute the time tendencies for the fluid state in the x-direction
+    compute_tendencies_x(state_forcing, flux, tend);
+  }
+  else if (dir == DIR_Z)
+  {
+    //Set the halo values in the z-direction
+    set_halo_values_z(state_forcing);
+    //Compute the time tendencies for the fluid state in the z-direction
+    compute_tendencies_z(state_forcing, flux, tend);
+  }
+
+//Apply the tendencies to the interior cells of the fluid state; halo cells of state_out are not written here.
+//The state arrays are indexed on the haloed grid (offset by hs in x and z), tend on the interior grid.
+//inds and indt are scratch indices recomputed in every iteration, hence the private clause.
+//default(present): arrays used in the region are treated as already present on the device
+#pragma acc parallel loop private(inds, indt) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs; //haloed-grid index of cell (i,k), variable ll
+        indt = ll * nz * nx + k * nx + i;                                              //interior-grid index of the same cell
+        state_out[inds] = state_init[inds] + dt * tend[indt];
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the x-direction
+
+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+//Then, compute the tendencies as the difference of the fluxes on either side of each cell; flux and tend are overwritten
+void compute_tendencies_x(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient; dt here is the file-scope model time step (this function takes no dt argument)
+  hv_coef = -hv_beta * dx / (16 * dt);
+  //Flux loop: one iteration per x-interface, nx + 1 interfaces in each of the nz rows.
+  //The scalars and the small work arrays listed in the private clause are scratch storage
+  //that every iteration fills before reading.
+  //Compute fluxes in the x-direction for each cell
+#pragma acc parallel loop private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p)
+  for (k = 0; k < nz; k++)
+  {
+    for (i = 0; i < nx + 1; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        for (s = 0; s < sten_size; s++)
+        {
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + s; //haloed columns i .. i+3 of row k
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      r = vals[ID_DENS] + hy_dens_cell[k + hs]; //stored density is a perturbation: add the hydrostatic background
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_cell[k + hs]) / r;
+      p = C0 * pow((r * t), gamm);
+
+      //Compute the flux vector (advective flux, plus pressure for the u-momentum, minus the hyperviscosity term)
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * u + p - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * w - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+//Tendency loop: one iteration per variable and interior cell.
+//Each cell reads the flux at its left (indf1) and right (indf2) interface.
+//indt, indf1 and indf2 are scratch indices recomputed in every iteration.
+//Use the fluxes to compute tendencies for each cell
+#pragma acc parallel loop private(indt, indf1, indf2)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i;
+        indf2 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i + 1;
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dx;
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies from the flux differences and add the gravity source term to the w-momentum
+//state is read on the haloed grid; flux and tend are overwritten
+void compute_tendencies_z(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient. The third derivative below is taken along z, so the
+  //coefficient scales with the vertical grid spacing dz (identical to dx on the default 2:1 grid).
+  //dt here is the file-scope model time step (this function takes no dt argument).
+  hv_coef = -hv_beta * dz / (16 * dt);
+//Flux loop: one iteration per z-interface, nz + 1 interfaces in each of the nx columns.
+//The scalars and small work arrays in the private clause are scratch storage filled before being read.
+//Compute fluxes in the z-direction for each cell
+#pragma acc parallel loop private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p)
+  for (k = 0; k < nz + 1; k++)
+  {
+    for (i = 0; i < nx; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        for (s = 0; s < sten_size; s++)
+        {
+          //Haloed rows k .. k+3 of column i
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + s) * (nx + 2 * hs) + i + hs;
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      //The hydrostatic background at the interface is added to the density and rho*theta,
+      //and the hydrostatic pressure is subtracted so that p is the pressure perturbation
+      r = vals[ID_DENS] + hy_dens_int[k];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      p = C0 * pow((r * t), gamm) - hy_pressure_int[k];
+
+      //Compute the flux vector with hyperviscosity
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * u - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * w + p - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+//Tendency loop: each cell reads the flux at its lower (indf1) and upper (indf2) interface.
+//inds is listed as private together with the other scratch indices because it is assigned inside the loop.
+//Use the fluxes to compute tendencies for each cell
+#pragma acc parallel loop private(indt, indf1, indf2, inds)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + (k) * (nx + 1) + i;
+        indf2 = ll * (nz + 1) * (nx + 1) + (k + 1) * (nx + 1) + i;
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dz;
+        if (ll == ID_WMOM)
+        {
+          //Gravity source term: density perturbation of the cell times grav
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
+          tend[indt] = tend[indt] - state[inds] * grav;
+        }
+      }
+    }
+  }
+}
+
+//Fill the x-direction halo cells of "state" for the interior rows.
+//1) Periodic wrap: the two left halo columns copy the two right-most interior columns,
+//   and the two right halo columns copy the two left-most interior columns.
+//2) Injection: on rank 0, left halo cells whose height lies within zlen/16 of 3*zlen/4 get
+//   their momentum and rho*theta overwritten to impose a u-wind of 50 and a potential temperature of 298.
+void set_halo_values_x(double *state)
+{
+  int k, ll, ind_r, ind_u, ind_t, i;
+  double z;
+
+#pragma acc parallel loop
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      //Index of the first (left-most halo) cell of row k for variable ll
+      const int row = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs);
+      state[row + 0] = state[row + nx + hs - 2];
+      state[row + 1] = state[row + nx + hs - 1];
+      state[row + nx + hs] = state[row + hs];
+      state[row + nx + hs + 1] = state[row + hs + 1];
+    }
+  }
+  ////////////////////////////////////////////////////
+
+  //This loop is not an OpenACC region, so it runs on the host
+  if (myrank == 0)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      //Height of the cell centre; it depends on k only, so it is computed once per row
+      z = (k_beg + k + 0.5) * dz;
+      //fabs is the floating-point absolute value; plain abs() is only correct for doubles when a
+      //C++ overload happens to be visible and would truncate to int otherwise
+      if (fabs(z - 3 * zlen / 4) <= zlen / 16)
+      {
+        for (i = 0; i < hs; i++)
+        {
+          ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          //Full density = perturbation + hydrostatic background
+          state[ind_u] = (state[ind_r] + hy_dens_cell[k + hs]) * 50.;
+          state[ind_t] = (state[ind_r] + hy_dens_cell[k + hs]) * 298. - hy_dens_theta_cell[k + hs];
+        }
+      }
+    }
+  }
+}
+
+//Set the halo values of "state" in the z-direction (two rows below and two rows above the interior).
+//The w-momentum is set to zero in the halo rows; every other variable copies the nearest interior row
+//(the first interior row for the bottom halo, the last interior row for the top halo).
+//All nx + 2*hs columns are processed, so the x-halo columns are covered as well.
+void set_halo_values_z(double *state)
+{
+  int i, ll;
+#pragma acc parallel loop
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      //Index of column i in haloed row 0 of variable ll, and the distance between consecutive rows
+      const int base = ll * (nz + 2 * hs) * (nx + 2 * hs) + i;
+      const int stride = nx + 2 * hs;
+      if (ll == ID_WMOM)
+      {
+        state[base + (0) * stride] = 0.;
+        state[base + (1) * stride] = 0.;
+        state[base + (nz + hs) * stride] = 0.;
+        state[base + (nz + hs + 1) * stride] = 0.;
+      }
+      else
+      {
+        state[base + (0) * stride] = state[base + (hs) * stride];
+        state[base + (1) * stride] = state[base + (hs) * stride];
+        state[base + (nz + hs) * stride] = state[base + (nz + hs - 1) * stride];
+        state[base + (nz + hs + 1) * stride] = state[base + (nz + hs - 1) * stride];
+      }
+    }
+  }
+}
+
+//Allocate an array of "count" doubles; print a message and exit if the allocation fails
+static double *alloc_doubles(size_t count, const char *name)
+{
+  double *ptr = (double *)malloc(count * sizeof(double));
+  //malloc may legitimately return NULL for a zero-sized request, so only a non-empty request counts as a failure
+  if (ptr == NULL && count > 0)
+  {
+    fprintf(stderr, "Error: unable to allocate %lu doubles for %s\n", (unsigned long)count, name);
+    exit(EXIT_FAILURE);
+  }
+  return ptr;
+}
+
+//Set up the run: grid spacing and (single-rank) domain decomposition, allocation of the model arrays,
+//the time step, the initial fluid state and the hydrostatic background profiles.
+//Reads nx_glob and nz_glob; writes the file-scope grid, time and array variables.
+//Exits the program if an array cannot be allocated.
+void init()
+{
+  int i, k, ii, kk, ll, inds, i_end;
+  double x, z, r, u, w, t, hr, ht, nper;
+
+  //Set the cell grid size
+  dx = xlen / nx_glob;
+  dz = zlen / nz_glob;
+
+  nranks = 1;
+  myrank = 0;
+
+  // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0;
+
+  nper = ((double)nx_glob) / nranks;
+  i_beg = round(nper * (myrank));
+  i_end = round(nper * ((myrank) + 1)) - 1;
+  nx = i_end - i_beg + 1;
+  left_rank = myrank - 1;
+  if (left_rank == -1)
+    left_rank = nranks - 1;
+  right_rank = myrank + 1;
+  if (right_rank == nranks)
+    right_rank = 0;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+  // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+
+  k_beg = 0;
+  nz = nz_glob;
+
+  //Allocate the model data. Element counts are computed in size_t so that the products
+  //cannot overflow int before being turned into a byte count.
+  const size_t state_len = (size_t)(nx + 2 * hs) * (size_t)(nz + 2 * hs) * (size_t)NUM_VARS;
+  const size_t flux_len = (size_t)(nx + 1) * (size_t)(nz + 1) * (size_t)NUM_VARS;
+  const size_t tend_len = (size_t)nx * (size_t)nz * (size_t)NUM_VARS;
+  state = alloc_doubles(state_len, "state");
+  state_tmp = alloc_doubles(state_len, "state_tmp");
+  flux = alloc_doubles(flux_len, "flux");
+  tend = alloc_doubles(tend_len, "tend");
+  hy_dens_cell = alloc_doubles((size_t)(nz + 2 * hs), "hy_dens_cell");
+  hy_dens_theta_cell = alloc_doubles((size_t)(nz + 2 * hs), "hy_dens_theta_cell");
+  hy_dens_int = alloc_doubles((size_t)(nz + 1), "hy_dens_int");
+  hy_dens_theta_int = alloc_doubles((size_t)(nz + 1), "hy_dens_theta_int");
+  hy_pressure_int = alloc_doubles((size_t)(nz + 1), "hy_pressure_int");
+
+  //Define the maximum stable time step based on an assumed maximum wind speed
+  dt = dmin(dx, dz) / max_speed * cfl;
+  //Set initial elapsed model time and output_counter to zero
+  etime = 0.;
+  output_counter = 0.;
+
+  // Display grid information
+
+  printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
+  printf("dx,dz: %lf %lf\n", dx, dz);
+  printf("dt: %lf\n", dt);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
+  //////////////////////////////////////////////////////////////////////////
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      //Initialize the state to zero
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state[inds] = 0.;
+      }
+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
+      for (kk = 0; kk < nqpoints; kk++)
+      {
+        for (ii = 0; ii < nqpoints; ii++)
+        {
+          //Compute the x,z location within the global domain based on cell and quadrature index
+          x = (i_beg + i - hs + 0.5) * dx + (qpoints[ii] - 0.5) * dx;
+          z = (k_beg + k - hs + 0.5) * dz + (qpoints[kk] - 0.5) * dz;
+
+          //Set the fluid state based on the user's specification (default is injection in this example)
+          injection(x, z, r, u, w, t, hr, ht);
+
+          //Store into the fluid state array (density and rho*theta are stored as perturbations
+          //from the hydrostatic background, momenta use the full density)
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + r * qweights[ii] * qweights[kk];
+          inds = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * u * qweights[ii] * qweights[kk];
+          inds = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * w * qweights[ii] * qweights[kk];
+          inds = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk];
+        }
+      }
+      //Start the temporary state as a copy of the initial state
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state_tmp[inds] = state[inds];
+      }
+    }
+  }
+  //Compute the hydrostatic background state over vertical cell averages
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    hy_dens_cell[k] = 0.;
+    hy_dens_theta_cell[k] = 0.;
+    for (kk = 0; kk < nqpoints; kk++)
+    {
+      //Note: the sample height is the cell centre for every quadrature weight
+      z = (k_beg + k - hs + 0.5) * dz;
+
+      //Set the fluid state based on the user's specification (default is injection in this example)
+      injection(0., z, r, u, w, t, hr, ht);
+
+      hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr * ht * qweights[kk];
+    }
+  }
+  //Compute the hydrostatic background state at vertical cell interfaces
+  for (k = 0; k < nz + 1; k++)
+  {
+    z = (k_beg + k) * dz;
+
+    //Set the fluid state based on the user's specification (default is injection in this example)
+    injection(0., z, r, u, w, t, hr, ht);
+
+    hy_dens_int[k] = hr;
+    hy_dens_theta_int[k] = hr * ht;
+    hy_pressure_int[k] = C0 * pow((hr * ht), gamm);
+  }
+}
+
+//"Injection" test case sampled at a single point. The initial state is balanced; the fast, cold
+//air is imposed later through the left boundary by set_halo_values_x.
+//x and z are the input coordinates (x does not influence the result)
+//r,u,w,t are the output density, u-wind, w-wind and potential temperature: all returned as zero
+//hr and ht are the output hydrostatic background density and potential temperature at height z
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht)
+{
+  //Background profile first: it only depends on the height
+  hydro_const_theta(z, hr, ht);
+  //No initial perturbation and no initial wind
+  u = 0.;
+  w = 0.;
+  r = 0.;
+  t = 0.;
+}
+
+//Establish hydrostatic balance using constant potential temperature (thermally neutral atmosphere)
+//z is the input height
+//r and t are the output background hydrostatic density and potential temperature at z
+void hydro_const_theta(double z, double &r, double &t)
+{
+  const double theta0 = 300.; //Background potential temperature
+  const double exner0 = 1.;   //Surface-level Exner pressure
+  //Exner pressure at z
+  const double exner_z = exner0 - grav * z / (cp * theta0);
+  //Pressure at z recovered from the Exner pressure
+  const double pres_z = p0 * pow(exner_z, (cp / rd));
+  //rho*theta at z, from inverting P = C0*(rho*theta)**gamma
+  const double rho_theta_z = pow((pres_z / C0), (1. / gamm));
+  //Potential temperature is constant with height; density follows from rho*theta
+  t = theta0;
+  r = rho_theta_z / t;
+}
+
+//Release every model array allocated in init().
+//Every pointer is reset to NULL after being freed so that calling finalize() twice,
+//or freeing again elsewhere, is harmless (free(NULL) is a no-op).
+void finalize()
+{
+  free(state);
+  state = NULL;
+  free(state_tmp);
+  state_tmp = NULL;
+  free(flux);
+  flux = NULL;
+  free(tend);
+  tend = NULL;
+  free(hy_dens_cell);
+  hy_dens_cell = NULL;
+  free(hy_dens_theta_cell);
+  hy_dens_theta_cell = NULL;
+  free(hy_dens_int);
+  hy_dens_int = NULL;
+  free(hy_dens_theta_int);
+  hy_dens_theta_int = NULL;
+  free(hy_pressure_int);
+  hy_pressure_int = NULL;
+}

+ 11 - 0
hpc/miniprofiler/English/C/source_code/lab4/Makefile

@@ -0,0 +1,11 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
+
+CC := pgc++
+CFLAGS := -O3 -w
+ACCFLAGS := -ta=tesla:managed -Minfo=accel
+
+miniWeather: miniWeather_openacc.cpp
+	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp 
+
+clean:
+	rm -f *.o miniWeather

+ 645 - 0
hpc/miniprofiler/English/C/source_code/lab4/miniWeather_openacc.cpp

@@ -0,0 +1,645 @@
+//////////////////////////////////////////////////////////////////////////////////////////
+// miniWeather
+// Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
+// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
+// For documentation, please see the attached documentation in the "documentation" folder
+//////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+** Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory.  All rights reserved.
+**
+** Portions Copyright (c) 2020, NVIDIA Corporation.  All rights reserved.
+*/
+
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <nvtx3/nvToolsExt.h>
+
+//Physical constants
+const double pi = 3.14159265358979323846264338327;   //Pi
+const double grav = 9.8;                             //Gravitational acceleration (m / s^2)
+const double cp = 1004.;                             //Specific heat of dry air at constant pressure
+const double rd = 287.;                              //Dry air constant for equation of state (P=rho*rd*T)
+const double p0 = 1.e5;                              //Standard pressure at the surface in Pascals
+const double C0 = 27.5629410929725921310572974482;   //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma)
+const double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , named "gamm" because the identifier "gamma" is already taken
+//Define domain and stability-related constants
+const double xlen = 2.e4;     //Length of the domain in the x-direction (meters)
+const double zlen = 1.e4;     //Length of the domain in the z-direction (meters)
+const double hv_beta = 0.25;  //How strong to diffuse the solution: hv_beta \in [0:1]
+const double cfl = 1.50;      //"Courant, Friedrichs, Lewy" number (for numerical stability)
+const double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
+const int hs = 2;             //"Halo" size: number of cells needed for a full "stencil" of information for reconstruction
+const int sten_size = 4;      //Size of the stencil used for interpolation
+
+//Parameters for indexing and flags
+const int NUM_VARS = 4; //Number of fluid state variables
+const int ID_DENS = 0;  //index for density ("rho")
+const int ID_UMOM = 1;  //index for momentum in the x-direction ("rho * u")
+const int ID_WMOM = 2;  //index for momentum in the z-direction ("rho * w")
+const int ID_RHOT = 3;  //index for density * potential temperature ("rho * theta")
+const int DIR_X = 1;    //Integer constant to express that this operation is in the x-direction
+const int DIR_Z = 2;    //Integer constant to express that this operation is in the z-direction
+
+//Gauss-Legendre quadrature tables used by init(): number of points, their positions as a
+//fraction of a cell width (init() offsets them by -0.5 from the cell centre), and their weights
+const int nqpoints = 3;
+double qpoints[] = {0.112701665379258311482073460022E0, 0.500000000000000000000000000000E0, 0.887298334620741688517926539980E0};
+double qweights[] = {0.277777777777777777777777777779E0, 0.444444444444444444444444444444E0, 0.277777777777777777777777777779E0};
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are initialized but remain static over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double sim_time;            //total simulation time in seconds
+double output_freq;         //frequency to perform output in seconds
+double dt;                  //Model time step (seconds); main() shortens it for the final step
+int nx, nz;                 //Number of local grid cells in the x- and z- dimensions
+double dx, dz;              //Grid space length in x- and z-dimension (meters)
+int nx_glob, nz_glob;       //Number of total grid cells in the x- and z- dimensions
+int i_beg, k_beg;           //beginning index in the x- and z-directions
+int nranks, myrank;         //Number of ranks and my rank id (init() sets these to 1 and 0)
+int left_rank, right_rank;  //Rank IDs that exist to my left and right in the global domain
+double *hy_dens_cell;       //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+double *hy_dens_int;        //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+double *hy_dens_theta_int;  //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+double *hy_pressure_int;    //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are dynamic over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double etime;          //Elapsed model time
+double output_counter; //Helps determine when it's time to do output
+//Runtime variable arrays
+double *state;     //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *state_tmp; //Scratch fluid state for the intermediate Runge-Kutta stages. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *flux;      //Cell interface fluxes.   Dimensions: (nx+1,nz+1,NUM_VARS)
+double *tend;      //Fluid state tendencies.  Dimensions: (nx,nz,NUM_VARS)
+int num_out = 0;   //The number of outputs performed so far
+int direction_switch = 1; //Nonzero: the next time step sweeps x before z; zero: z before x (flipped by perform_timestep)
+
+//Minimum of two doubles: yields a only when a is strictly less than b, otherwise b
+double dmin(double a, double b)
+{
+  return (a < b) ? a : b;
+}
+
+//Declaring the functions defined after "main"
+void init();
+void finalize();
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht);
+void hydro_const_theta(double z, double &r, double &t);
+void output(double *state, double etime);
+void ncwrap(int ierr, int line);
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt);
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend);
+void compute_tendencies_x(double *state, double *flux, double *tend);
+void compute_tendencies_z(double *state, double *flux, double *tend);
+void set_halo_values_x(double *state);
+void set_halo_values_z(double *state);
+
+///////////////////////////////////////////////////////////////////////////////////////
+// THE MAIN PROGRAM STARTS HERE
+///////////////////////////////////////////////////////////////////////////////////////
+//Usage: miniWeather [nx_glob nz_glob sim_time]
+//With exactly three arguments they override the defaults below; with any other argument count
+//the defaults are used. Returns 0 after a completed run and 1 when the grid size is unusable.
+int main(int argc, char **argv)
+{
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // BEGIN USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //The x-direction length is twice as long as the z-direction length
+  //So, you'll want to have nx_glob be twice as large as nz_glob
+  nx_glob = 40;      //Number of total cells in the x-direction
+  nz_glob = 20;      //Number of total cells in the z-direction
+  sim_time = 1000;   //How many seconds to run the simulation
+  output_freq = 100; //How frequently to output data to file (in seconds)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // END USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+
+  if (argc == 4)
+  {
+    printf("The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]);
+    nx_glob = atoi(argv[1]);
+    nz_glob = atoi(argv[2]);
+    sim_time = atoi(argv[3]);
+  }
+  else
+  {
+    printf("Using default values ...\n");
+  }
+
+  //atoi() yields 0 for non-numeric text, and init() divides by nx_glob/nz_glob and sizes its
+  //allocations from them, so refuse unusable grid sizes before any work is started
+  if (nx_glob <= 0 || nz_glob <= 0)
+  {
+    fprintf(stderr, "Invalid grid size %d x %d: nx_glob and nz_glob must be positive integers\n", nx_glob, nz_glob);
+    return 1;
+  }
+
+  //NVTX range covering initialization, the time loop and finalization
+  nvtxRangePushA("Total");
+  init();
+
+  //Output the initial state (disabled in this version)
+  //output(state, etime);
+
+  ////////////////////////////////////////////////////
+  // MAIN TIME STEP LOOP
+  ////////////////////////////////////////////////////
+
+  nvtxRangePushA("while");
+  while (etime < sim_time)
+  {
+    //If the time step leads to exceeding the simulation time, shorten it for the last step
+    if (etime + dt > sim_time)
+    {
+      dt = sim_time - etime;
+    }
+
+    //Perform a single time step
+    nvtxRangePushA("perform_timestep");
+    perform_timestep(state, state_tmp, flux, tend, dt);
+    nvtxRangePop();
+
+    //Inform the user (printed before etime is advanced, so it shows the time at the start of the step)
+
+    printf("Elapsed Time: %lf / %lf\n", etime, sim_time);
+
+    //Update the elapsed time and output counter
+    etime = etime + dt;
+    output_counter = output_counter + dt;
+    //If it's time for output, reset the counter, and do output
+
+    if (output_counter >= output_freq)
+    {
+      output_counter = output_counter - output_freq;
+
+      //output(state, etime);
+    }
+  }
+  nvtxRangePop();
+
+  finalize();
+  nvtxRangePop();
+  return 0;
+}
+
+//Advances the model by one dimensionally split time step using a simple low-storage
+//three-stage Runge-Kutta time integrator.
+//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the
+//order of directions is alternated each time step (tracked by the global direction_switch).
+//The Runge-Kutta method applied to each direction is:
+// q*     = q[n] + dt/3 * rhs(q[n])
+// q**    = q[n] + dt/2 * rhs(q*  )
+// q[n+1] = q[n] + dt/1 * rhs(q** )
+//state holds q[n] on entry and q[n+1] on exit; state_tmp holds q* and q**.
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt)
+{
+  //Sweep order for this step: x then z when direction_switch is set, otherwise z then x
+  const int sweep_order[2] = {direction_switch ? DIR_X : DIR_Z,
+                              direction_switch ? DIR_Z : DIR_X};
+
+  for (int n = 0; n < 2; n++)
+  {
+    const int dir = sweep_order[n];
+    semi_discrete_step(state, state, state_tmp, dt / 3, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state, dt / 1, dir, flux, tend);
+  }
+
+  //Flip the sweep order for the next time step
+  direction_switch = direction_switch ? 0 : 1;
+}
+
+//Perform one semi-discretized step in time of the form
+//  state_out = state_init + dt * rhs(state_forcing)
+//i.e. the step starts from state_init, evaluates the right-hand side from state_forcing and
+//stores the result in state_out.
+//  dir        : DIR_X or DIR_Z, the direction whose halos and tendencies are computed
+//  flux, tend : work arrays filled by compute_tendencies_x / compute_tendencies_z
+//Only the halo cells of state_forcing and the interior cells of state_out are written.
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend)
+{
+  int i, k, ll, cell, slot;
+
+  //Refresh the halos of the forcing state for the requested direction, then build the tendencies from it
+  switch (dir)
+  {
+  case DIR_X:
+    set_halo_values_x(state_forcing);
+    compute_tendencies_x(state_forcing, flux, tend);
+    break;
+  case DIR_Z:
+    set_halo_values_z(state_forcing);
+    compute_tendencies_z(state_forcing, flux, tend);
+    break;
+  default:
+    //Any other value recomputes nothing; tend is applied as it currently stands
+    break;
+  }
+
+//Apply the tendencies to the interior cells of the fluid state
+#pragma acc parallel loop private(cell, slot) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        //slot addresses the unpadded tendency array, cell the halo-padded state arrays
+        slot = ll * nz * nx + k * nx + i;
+        cell = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
+        state_out[cell] = state_init[cell] + dt * tend[slot];
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the x-direction
+//
+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+//  state : input fluid state, halo-padded; element (ll,k,i) is at ll*(nz+2*hs)*(nx+2*hs) + k*(nx+2*hs) + i
+//  flux  : work array for interface fluxes;  element (ll,k,i) is at ll*(nz+1)*(nx+1) + k*(nx+1) + i
+//  tend  : output tendencies;                element (ll,k,i) is at ll*nz*nx + k*nx + i
+void compute_tendencies_x(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient
+  //(dt here is the file-scope time step: this function has no dt parameter)
+  hv_coef = -hv_beta * dx / (16 * dt);
+  //Compute fluxes in the x-direction at the nx+1 interfaces of every row k.
+  //The per-iteration scalars and the small work arrays are listed as private.
+#pragma acc parallel loop private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p)
+  for (k = 0; k < nz; k++)
+  {
+    for (i = 0; i < nx + 1; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        //Gather the four cells around interface i: padded columns i .. i+3 of row k+hs
+        for (s = 0; s < sten_size; s++)
+        {
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + s;
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      //The state stores perturbations of density and rho*theta, so the hydrostatic cell averages are added back
+      r = vals[ID_DENS] + hy_dens_cell[k + hs];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_cell[k + hs]) / r;
+      p = C0 * pow((r * t), gamm);
+
+      //Compute the flux vector (advective flux minus the hyperviscosity term)
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * u + p - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * w - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+//Use the fluxes to compute tendencies for each cell:
+//the negative difference of the right (i+1) and left (i) interface fluxes divided by dx
+#pragma acc parallel loop private(indt, indf1, indf2)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i;
+        indf2 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i + 1;
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dx;
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes, adding the gravity source term to the w-momentum
+//  state : input fluid state, halo-padded; element (ll,k,i) is at ll*(nz+2*hs)*(nx+2*hs) + k*(nx+2*hs) + i
+//  flux  : work array for interface fluxes;  element (ll,k,i) is at ll*(nz+1)*(nx+1) + k*(nx+1) + i
+//  tend  : output tendencies;                element (ll,k,i) is at ll*nz*nx + k*nx + i
+void compute_tendencies_z(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient
+  //(dt here is the file-scope time step: this function has no dt parameter)
+  //NOTE(review): this uses dx, exactly like the x sweep; dz looks like the natural spacing for a
+  //z sweep. The two agree whenever dx == dz (true for the default grid) -- confirm the intent.
+  hv_coef = -hv_beta * dx / (16 * dt);
+//Compute fluxes in the z-direction at the nz+1 interfaces of every column i
+#pragma acc parallel loop private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p)
+  for (k = 0; k < nz + 1; k++)
+  {
+    for (i = 0; i < nx; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        //Gather the four cells around interface k: padded rows k .. k+3 of column i+hs
+        for (s = 0; s < sten_size; s++)
+        {
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + s) * (nx + 2 * hs) + i + hs;
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      //The hydrostatic interface values are added back; p is kept as the departure from hydrostatic pressure
+      r = vals[ID_DENS] + hy_dens_int[k];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      p = C0 * pow((r * t), gamm) - hy_pressure_int[k];
+
+      //Compute the flux vector with hyperviscosity
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * u - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * w + p - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+//Use the fluxes to compute tendencies for each cell:
+//the negative difference of the upper (k+1) and lower (k) interface fluxes divided by dz.
+//inds is written inside the loop as well, so it is listed as private together with the other
+//index scalars instead of relying on the compiler to privatize it.
+#pragma acc parallel loop private(indt, indf1, indf2, inds)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + (k) * (nx + 1) + i;
+        indf2 = ll * (nz + 1) * (nx + 1) + (k + 1) * (nx + 1) + i;
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dz;
+        //Gravity source term: the w-momentum tendency loses (density perturbation) * grav
+        if (ll == ID_WMOM)
+        {
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
+          tend[indt] = tend[indt] - state[inds] * grav;
+        }
+      }
+    }
+  }
+}
+
+//Set the halo values of state in the x-direction.
+//The domain is periodic in x: the two left halo columns are copied from the last two interior
+//columns and the two right halo columns from the first two interior columns.
+//On rank 0 the left halo is then overwritten inside the injection band
+//(|z - 3*zlen/4| <= zlen/16) with the inflow momentum and rho*theta perturbation.
+void set_halo_values_x(double *state)
+{
+  int k, ll, ind_r, ind_u, ind_t, i;
+  double z;
+
+#pragma acc parallel loop
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + 0] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs - 2];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + 1] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs - 1];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + hs];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs + 1] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + hs + 1];
+    }
+  }
+  ////////////////////////////////////////////////////
+
+  //Injection at the left boundary (this loop has no acc directive)
+  if (myrank == 0)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      //The height of row k does not depend on i, so evaluate the band test once per row.
+      //fabs() is used deliberately: abs() on a double resolves to the integer version when
+      //only the C declaration is visible, which would truncate the distance before comparing.
+      z = (k_beg + k + 0.5) * dz;
+      if (fabs(z - 3 * zlen / 4) <= zlen / 16)
+      {
+        for (i = 0; i < hs; i++)
+        {
+          ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          //Full density (perturbation + background) times a u-wind of 50
+          state[ind_u] = (state[ind_r] + hy_dens_cell[k + hs]) * 50.;
+          //Full density times a potential temperature of 298, stored as a departure from the background rho*theta
+          state[ind_t] = (state[ind_r] + hy_dens_cell[k + hs]) * 298. - hy_dens_theta_cell[k + hs];
+        }
+      }
+    }
+  }
+}
+
+//Set the halo values of state in the z-direction.
+//For every variable and every padded column i, the two bottom halo rows (0, 1) and the two top
+//halo rows (nz+hs, nz+hs+1) are filled:
+//  - w-momentum is set to zero in all four halo rows;
+//  - every other variable copies the nearest interior row (row hs at the bottom, row nz+hs-1 at the top).
+void set_halo_values_z(double *state)
+{
+  int i, ll;
+#pragma acc parallel loop
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      if (ll == ID_WMOM)
+      {
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (0) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (1) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs + 1) * (nx + 2 * hs) + i] = 0.;
+      }
+      else
+      {
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (0) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (hs) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (1) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (hs) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs - 1) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs + 1) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs - 1) * (nx + 2 * hs) + i];
+      }
+    }
+  }
+}
+
+//Set up the grid, allocate the model arrays and fill in the initial and background states.
+//Reads nx_glob and nz_glob (which must be positive) and writes the grid spacing, the rank and
+//index bookkeeping, dt, etime, output_counter, state, state_tmp and the hydrostatic arrays.
+//flux and tend are allocated but not initialized here.
+//Terminates the program with EXIT_FAILURE if any allocation fails.
+void init()
+{
+  int i, k, ii, kk, ll, inds, i_end;
+  double x, z, r, u, w, t, hr, ht, nper;
+
+  //Set the cell grid size
+  dx = xlen / nx_glob;
+  dz = zlen / nz_glob;
+
+  //This version runs as a single rank
+  nranks = 1;
+  myrank = 0;
+
+  // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0;
+
+  //Split the global x-range evenly over the ranks and wrap the neighbour ids periodically
+  nper = ((double)nx_glob) / nranks;
+  i_beg = round(nper * (myrank));
+  i_end = round(nper * ((myrank) + 1)) - 1;
+  nx = i_end - i_beg + 1;
+  left_rank = myrank - 1;
+  if (left_rank == -1)
+    left_rank = nranks - 1;
+  right_rank = myrank + 1;
+  if (right_rank == nranks)
+    right_rank = 0;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+  // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+
+  k_beg = 0;
+  nz = nz_glob;
+
+  //Element counts are formed in size_t so the products cannot overflow int before the
+  //multiplication by sizeof(double)
+  const size_t n_state = (size_t)(nx + 2 * hs) * (nz + 2 * hs) * NUM_VARS;
+  const size_t n_flux = (size_t)(nx + 1) * (nz + 1) * NUM_VARS;
+  const size_t n_tend = (size_t)nx * nz * NUM_VARS;
+  const size_t n_cell = (size_t)(nz + 2 * hs);
+  const size_t n_int = (size_t)(nz + 1);
+
+  //Allocate the model data
+  state = (double *)malloc(n_state * sizeof(double));
+  state_tmp = (double *)malloc(n_state * sizeof(double));
+  flux = (double *)malloc(n_flux * sizeof(double));
+  tend = (double *)malloc(n_tend * sizeof(double));
+  hy_dens_cell = (double *)malloc(n_cell * sizeof(double));
+  hy_dens_theta_cell = (double *)malloc(n_cell * sizeof(double));
+  hy_dens_int = (double *)malloc(n_int * sizeof(double));
+  hy_dens_theta_int = (double *)malloc(n_int * sizeof(double));
+  hy_pressure_int = (double *)malloc(n_int * sizeof(double));
+
+  //Every array is written below, so stop here instead of dereferencing a NULL pointer
+  if (state == NULL || state_tmp == NULL || flux == NULL || tend == NULL ||
+      hy_dens_cell == NULL || hy_dens_theta_cell == NULL || hy_dens_int == NULL ||
+      hy_dens_theta_int == NULL || hy_pressure_int == NULL)
+  {
+    fprintf(stderr, "init: failed to allocate the model arrays for a %d x %d grid\n", nx, nz);
+    exit(EXIT_FAILURE);
+  }
+
+  //Define the maximum stable time step based on an assumed maximum wind speed
+  dt = dmin(dx, dz) / max_speed * cfl;
+  //Set initial elapsed model time and output_counter to zero
+  etime = 0.;
+  output_counter = 0.;
+
+  // Display grid information
+
+  printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
+  printf("dx,dz: %lf %lf\n", dx, dz);
+  printf("dt: %lf\n", dt);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
+  //////////////////////////////////////////////////////////////////////////
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      //Initialize the state to zero
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state[inds] = 0.;
+      }
+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
+      for (kk = 0; kk < nqpoints; kk++)
+      {
+        for (ii = 0; ii < nqpoints; ii++)
+        {
+          //Compute the x,z location within the global domain based on cell and quadrature index
+          x = (i_beg + i - hs + 0.5) * dx + (qpoints[ii] - 0.5) * dx;
+          z = (k_beg + k - hs + 0.5) * dz + (qpoints[kk] - 0.5) * dz;
+
+          //Set the fluid state based on the user's specification (default is injection in this example)
+          injection(x, z, r, u, w, t, hr, ht);
+
+          //Store into the fluid state array (density and rho*theta are stored as perturbations)
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + r * qweights[ii] * qweights[kk];
+          inds = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * u * qweights[ii] * qweights[kk];
+          inds = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * w * qweights[ii] * qweights[kk];
+          inds = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk];
+        }
+      }
+      //Start the Runge-Kutta scratch state as a copy of the initial state
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state_tmp[inds] = state[inds];
+      }
+    }
+  }
+  //Compute the hydrostatic background state over vertical cell averages
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    hy_dens_cell[k] = 0.;
+    hy_dens_theta_cell[k] = 0.;
+    for (kk = 0; kk < nqpoints; kk++)
+    {
+      //NOTE(review): z is the cell centre for every kk (no qpoints[kk] offset), so each
+      //quadrature weight multiplies the same sample -- confirm this is intended
+      z = (k_beg + k - hs + 0.5) * dz;
+
+      //Set the fluid state based on the user's specification (default is injection in this example)
+      injection(0., z, r, u, w, t, hr, ht);
+
+      hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr * ht * qweights[kk];
+    }
+  }
+  //Compute the hydrostatic background state at vertical cell interfaces
+  for (k = 0; k < nz + 1; k++)
+  {
+    z = (k_beg + k) * dz;
+
+    //Set the fluid state based on the user's specification (default is injection in this example)
+    injection(0., z, r, u, w, t, hr, ht);
+
+    hy_dens_int[k] = hr;
+    hy_dens_theta_int[k] = hr * ht;
+    hy_pressure_int[k] = C0 * pow((hr * ht), gamm);
+  }
+}
+
+//Initial condition for the "injection" test case, which starts balanced and injects fast, cold
+//air from the left boundary near the model top. This routine only supplies the balanced initial
+//state: every perturbation output is zero and the background comes from hydro_const_theta().
+//  x, z    : input coordinates at which to sample (x is not used by this test case)
+//  r,u,w,t : output density, u-wind, w-wind and potential temperature (all set to zero)
+//  hr, ht  : output background hydrostatic density and potential temperature at height z
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht)
+{
+  //Background state first, then clear every perturbation field
+  hydro_const_theta(z, hr, ht);
+  r = u = w = t = 0.;
+}
+
+//Establish hydrostatic balance for a constant potential temperature (thermally neutral) atmosphere
+//  z : input height coordinate
+//  r : output background hydrostatic density at z
+//  t : output background potential temperature at z (always the constant background value)
+void hydro_const_theta(double z, double &r, double &t)
+{
+  const double theta_bg = 300.; //Background potential temperature
+  const double exner_sfc = 1.;  //Surface-level Exner pressure
+
+  //Potential temperature is uniform with height
+  t = theta_bg;
+
+  //Hydrostatic balance is expressed through the Exner pressure, which falls off linearly with z
+  const double exner_z = exner_sfc - grav * z / (cp * theta_bg);
+  //Convert Exner pressure to pressure, then pressure to rho*theta
+  const double pressure_z = p0 * pow(exner_z, (cp / rd));
+  const double rho_theta = pow((pressure_z / C0), (1. / gamm));
+
+  //Density follows by dividing out the potential temperature
+  r = rho_theta / t;
+}
+
+//Release the model arrays allocated by init().
+//Each pointer is reset to NULL after it is freed: the pointers are file-scope and stay visible
+//for the rest of the program, so this makes a repeated finalize() call harmless (free(NULL) is
+//a no-op) and keeps a stale access from silently touching freed memory.
+void finalize()
+{
+  free(state);
+  state = NULL;
+  free(state_tmp);
+  state_tmp = NULL;
+  free(flux);
+  flux = NULL;
+  free(tend);
+  tend = NULL;
+  free(hy_dens_cell);
+  hy_dens_cell = NULL;
+  free(hy_dens_theta_cell);
+  hy_dens_theta_cell = NULL;
+  free(hy_dens_int);
+  hy_dens_int = NULL;
+  free(hy_dens_theta_int);
+  hy_dens_theta_int = NULL;
+  free(hy_pressure_int);
+  hy_pressure_int = NULL;
+}

+ 11 - 0
hpc/miniprofiler/English/C/source_code/lab5/Makefile

@@ -0,0 +1,11 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved. 
+
+# Builds the OpenACC miniWeather binary with the PGI C++ compiler.
+CC := pgc++
+CFLAGS := -O3 -w
+ACCFLAGS := -ta=tesla:managed -Minfo=accel
+
+# "clean" names an action, not a file; declaring it phony keeps a stray file called
+# "clean" from making the target look up to date.
+.PHONY: clean
+
+miniWeather: miniWeather_openacc.cpp
+	${CC} ${CFLAGS} ${ACCFLAGS} -o miniWeather miniWeather_openacc.cpp 
+
+clean:
+	rm -f *.o miniWeather

+ 649 - 0
hpc/miniprofiler/English/C/source_code/lab5/miniWeather_openacc.cpp

@@ -0,0 +1,649 @@
+//////////////////////////////////////////////////////////////////////////////////////////
+// miniWeather
+// Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
+// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
+// For documentation, please see the attached documentation in the "documentation" folder
+//////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+** Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory.  All rights reserved.
+**
+** Portions Copyright (c) 2020, NVIDIA Corporation.  All rights reserved.
+*/
+
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <nvtx3/nvToolsExt.h>
+
+//Physical constants
+const double pi = 3.14159265358979323846264338327;   //Pi
+const double grav = 9.8;                             //Gravitational acceleration (m / s^2)
+const double cp = 1004.;                             //Specific heat of dry air at constant pressure
+const double rd = 287.;                              //Dry air constant for equation of state (P=rho*rd*T)
+const double p0 = 1.e5;                              //Standard pressure at the surface in Pascals
+const double C0 = 27.5629410929725921310572974482;   //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma)
+const double gamm = 1.40027894002789400278940027894; //gamma; the value equals cp/(cp-rd) = 1004/717, not cp/rd. Named "gamm" because the name "gamma" is already taken
+//Define domain and stability-related constants
+const double xlen = 2.e4;     //Length of the domain in the x-direction (meters)
+const double zlen = 1.e4;     //Length of the domain in the z-direction (meters)
+const double hv_beta = 0.25;  //How strong to diffuse the solution: hv_beta \in [0:1]
+const double cfl = 1.50;      //"Courant, Friedrichs, Lewy" number (for numerical stability)
+const double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
+const int hs = 2;             //"Halo" size: number of cells needed for a full "stencil" of information for reconstruction
+const int sten_size = 4;      //Size of the stencil used for interpolation
+
+//Parameters for indexing and flags
+const int NUM_VARS = 4; //Number of fluid state variables
+const int ID_DENS = 0;  //index for density ("rho")
+const int ID_UMOM = 1;  //index for momentum in the x-direction ("rho * u")
+const int ID_WMOM = 2;  //index for momentum in the z-direction ("rho * w")
+const int ID_RHOT = 3;  //index for density * potential temperature ("rho * theta")
+const int DIR_X = 1;    //Integer constant to express that this operation is in the x-direction
+const int DIR_Z = 2;    //Integer constant to express that this operation is in the z-direction
+
+//Three-point quadrature rule on the unit interval [0,1], used by init() to form cell averages:
+//qpoints are the sample locations within a cell, qweights the matching weights (they sum to 1)
+const int nqpoints = 3;
+double qpoints[] = {0.112701665379258311482073460022E0, 0.500000000000000000000000000000E0, 0.887298334620741688517926539980E0};
+double qweights[] = {0.277777777777777777777777777779E0, 0.444444444444444444444444444444E0, 0.277777777777777777777777777779E0};
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are initialized but remain static over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double sim_time;            //total simulation time in seconds
+double output_freq;         //frequency to perform output in seconds
+double dt;                  //Model time step (seconds); set in init() and shortened by main() for the final step
+int nx, nz;                 //Number of local grid cells in the x- and z- dimensions
+double dx, dz;              //Grid space length in x- and z-dimension (meters)
+int nx_glob, nz_glob;       //Number of total grid cells in the x- and z- dimensions
+int i_beg, k_beg;           //beginning index in the x- and z-directions
+int nranks, myrank;         //Number of ranks and this rank's id (init() sets them to 1 and 0)
+int left_rank, right_rank;  //Rank IDs that exist to my left and right in the global domain
+double *hy_dens_cell;       //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+double *hy_dens_int;        //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+double *hy_dens_theta_int;  //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+double *hy_pressure_int;    //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are dynamic over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double etime;          //Elapsed model time
+double output_counter; //Helps determine when it's time to do output
+//Runtime variable arrays (stored as flat arrays; x is the fastest-varying index, then z, then the variable id)
+double *state;     //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *state_tmp; //Scratch fluid state used between Runge-Kutta stages. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *flux;      //Cell interface fluxes.   Dimensions: (nx+1,nz+1,NUM_VARS)
+double *tend;      //Fluid state tendencies.  Dimensions: (nx,nz,NUM_VARS)
+int num_out = 0;   //The number of outputs performed so far
+int direction_switch = 1; //Non-zero: perform_timestep() sweeps x then z; zero: z then x. Toggled every time step
+
+//How is this not in the standard?!
+//Return the smaller of two doubles; when a is not strictly less than b, b is returned
+double dmin(double a, double b)
+{
+  return (a < b) ? a : b;
+}
+
+//Declaring the functions defined after "main"
+void init();
+void finalize();
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht);
+void hydro_const_theta(double z, double &r, double &t);
+void output(double *state, double etime);
+void ncwrap(int ierr, int line);
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt);
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend);
+void compute_tendencies_x(double *state, double *flux, double *tend);
+void compute_tendencies_z(double *state, double *flux, double *tend);
+void set_halo_values_x(double *state);
+void set_halo_values_z(double *state);
+
+///////////////////////////////////////////////////////////////////////////////////////
+// THE MAIN PROGRAM STARTS HERE
+///////////////////////////////////////////////////////////////////////////////////////
+//Program entry point.
+//Usage: miniWeather [nx_glob nz_glob sim_time]
+//With exactly three arguments the grid size and simulated time are taken from the command line;
+//otherwise the defaults below are used. Returns 0 on success and 1 if the grid size is not positive.
+int main(int argc, char **argv)
+{
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // BEGIN USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //The x-direction length is twice as long as the z-direction length
+  //So, you'll want to have nx_glob be twice as large as nz_glob
+  nx_glob = 40;      //Number of total cells in the x-direction
+  nz_glob = 20;      //Number of total cells in the z-direction
+  sim_time = 10;     //How many seconds to run the simulation
+  output_freq = 100; //How frequently to output data to file (in seconds)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // END USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+
+  if (argc == 4)
+  {
+    printf("The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]);
+    nx_glob = atoi(argv[1]);
+    nz_glob = atoi(argv[2]);
+    sim_time = atoi(argv[3]);
+  }
+  else
+  {
+    printf("Using default values ...\n");
+  }
+
+  //atoi() yields 0 for non-numeric text; a non-positive grid would give an infinite grid spacing
+  //and meaningless array sizes in init(), so stop here instead
+  if (nx_glob <= 0 || nz_glob <= 0)
+  {
+    fprintf(stderr, "Error: nx_glob and nz_glob must be positive (got %d %d)\n", nx_glob, nz_glob);
+    return 1;
+  }
+
+  nvtxRangePushA("Total");
+  init();
+
+  //Keep the model arrays on the device for the whole time loop.
+  //Every array section is written as [start:length] so it is unambiguously the full array
+  //(a bare arr[N] reads as a reference to element N, which is one past the end).
+#pragma acc data copyin(state_tmp[0:(nz + 2 * hs) * (nx + 2 * hs) * NUM_VARS], hy_dens_cell[0:nz + 2 * hs], hy_dens_theta_cell[0:nz + 2 * hs], hy_dens_int[0:nz + 1], hy_dens_theta_int[0:nz + 1], hy_pressure_int[0:nz + 1]) \
+    create(flux[0:(nz + 1) * (nx + 1) * NUM_VARS], tend[0:nz * nx * NUM_VARS])                                                                                                                                                \
+        copy(state[0:(nz + 2 * hs) * (nx + 2 * hs) * NUM_VARS])
+  {
+    //Output the initial state
+    //output(state, etime);
+
+    ////////////////////////////////////////////////////
+    // MAIN TIME STEP LOOP
+    ////////////////////////////////////////////////////
+
+    nvtxRangePushA("while");
+    while (etime < sim_time)
+    {
+      //If the time step leads to exceeding the simulation time, shorten it for the last step
+      if (etime + dt > sim_time)
+      {
+        dt = sim_time - etime;
+      }
+
+      //Perform a single time step
+      nvtxRangePushA("perform_timestep");
+      perform_timestep(state, state_tmp, flux, tend, dt);
+      nvtxRangePop();
+
+      //Inform the user (the time printed is the one at the start of the step just taken)
+
+      printf("Elapsed Time: %lf / %lf\n", etime, sim_time);
+
+      //Update the elapsed time and output counter
+      etime = etime + dt;
+      output_counter = output_counter + dt;
+      //If it's time for output, reset the counter, and do output
+
+      if (output_counter >= output_freq)
+      {
+        output_counter = output_counter - output_freq;
+        //Bring the current state back to the host; the file output call itself is disabled here
+#pragma acc update host(state[0:(nz + 2 * hs) * (nx + 2 * hs) * NUM_VARS])
+        //output(state, etime);
+      }
+    }
+    nvtxRangePop();
+  }
+  finalize();
+  nvtxRangePop();
+  return 0;
+}
+
+//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator
+//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the
+//order of directions is alternated each time step.
+//The Runge-Kutta method used here is defined as follows:
+// q*     = q[n] + dt/3 * rhs(q[n])
+// q**    = q[n] + dt/2 * rhs(q*  )
+// q[n+1] = q[n] + dt/1 * rhs(q** )
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt)
+{
+  //Sweep order for this step: x then z while the switch is set, z then x otherwise
+  const int first_dir = direction_switch ? DIR_X : DIR_Z;
+  const int second_dir = direction_switch ? DIR_Z : DIR_X;
+  const int sweep[2] = {first_dir, second_dir};
+
+  for (int n = 0; n < 2; n++)
+  {
+    const int dir = sweep[n];
+    //Three Runge-Kutta stages in this direction; every stage restarts from "state",
+    //and only the last one writes its result back into "state"
+    semi_discrete_step(state, state, state_tmp, dt / 3, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, dir, flux, tend);
+    semi_discrete_step(state, state_tmp, state, dt / 1, dir, flux, tend);
+  }
+
+  //Alternate the sweep order for the next time step
+  direction_switch = direction_switch ? 0 : 1;
+}
+
+//Perform a single semi-discretized step in time with the form:
+//state_out = state_init + dt * rhs(state_forcing)
+//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend)
+{
+  //Fill the halo cells of the forcing state and compute its tendencies for the requested direction.
+  //Any other value of dir leaves flux and tend untouched.
+  switch (dir)
+  {
+  case DIR_X:
+    set_halo_values_x(state_forcing);
+    compute_tendencies_x(state_forcing, flux, tend);
+    break;
+  case DIR_Z:
+    set_halo_values_z(state_forcing);
+    compute_tendencies_z(state_forcing, flux, tend);
+    break;
+  default:
+    break;
+  }
+
+  //Apply the tendencies to the interior cells: state_out = state_init + dt * tend.
+  //The index temporaries are declared inside the loop body, so each iteration has its own copy.
+#pragma acc parallel loop collapse(3) default(present)
+  for (int var = 0; var < NUM_VARS; var++)
+  {
+    for (int kz = 0; kz < nz; kz++)
+    {
+      for (int ix = 0; ix < nx; ix++)
+      {
+        //Position in the haloed state arrays (interior cells are shifted by hs in both directions)
+        const int cell = var * (nz + 2 * hs) * (nx + 2 * hs) + (kz + hs) * (nx + 2 * hs) + ix + hs;
+        //Position in the halo-free tendency array
+        const int tcell = var * nz * nx + kz * nx + ix;
+        state_out[cell] = state_init[cell] + dt * tend[tcell];
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the x-direction
+
+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+//state: haloed fluid state to differentiate (read only here)
+//flux:  scratch array that receives the fluxes at the nx+1 x-interfaces of each of the nz rows
+//tend:  receives the x-direction tendency of every interior cell for all NUM_VARS variables
+//Both loop nests run as OpenACC parallel loops and expect all arrays to be present on the device.
+void compute_tendencies_x(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient (dt here is the file-scope time step; this function has no dt parameter)
+  hv_coef = -hv_beta * dx / (16 * dt);
+  //Compute fluxes in the x-direction for each cell interface.
+  //The scratch scalars and small arrays are listed as private so each iteration works on its own copy.
+#pragma acc parallel loop collapse(2) private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p) default(present)
+  for (k = 0; k < nz; k++)
+  {
+    for (i = 0; i < nx + 1; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        //Gather the sten_size consecutive cells starting at haloed column i of row k+hs
+        for (s = 0; s < sten_size; s++)
+        {
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + s;
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      //The stored state holds perturbations, so the hydrostatic cell averages are added back for density and rho*theta
+      r = vals[ID_DENS] + hy_dens_cell[k + hs];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_cell[k + hs]) / r;
+      p = C0 * pow((r * t), gamm);
+
+      //Compute the flux vector (the hv_coef terms are the hyperviscosity contribution)
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * u + p - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * w - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+//Use the fluxes to compute tendencies for each cell:
+//the tendency is minus the difference of the right (i+1) and left (i) interface fluxes divided by dx
+#pragma acc parallel loop collapse(3) private(indt, indf1, indf2) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i;
+        indf2 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i + 1;
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dx;
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+//state: haloed fluid state to differentiate (read only here)
+//flux:  scratch array that receives the fluxes at the nz+1 z-interfaces of each of the nx columns
+//tend:  receives the z-direction tendency of every interior cell for all NUM_VARS variables
+//Both loop nests run as OpenACC parallel loops and expect all arrays to be present on the device.
+void compute_tendencies_z(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient.
+  //The z-fluxes are differenced over dz below, so the coefficient is scaled with dz (not dx);
+  //the two are identical only when the grid spacing is the same in both directions.
+  hv_coef = -hv_beta * dz / (16 * dt);
+//Compute fluxes in the z-direction for each cell interface
+#pragma acc parallel loop collapse(2) private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p) default(present)
+  for (k = 0; k < nz + 1; k++)
+  {
+    for (i = 0; i < nx; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        //Gather the sten_size consecutive cells starting at haloed row k of column i+hs
+        for (s = 0; s < sten_size; s++)
+        {
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + s) * (nx + 2 * hs) + i + hs;
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      //Interface values of the hydrostatic background are added back; p is the pressure perturbation
+      r = vals[ID_DENS] + hy_dens_int[k];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      p = C0 * pow((r * t), gamm) - hy_pressure_int[k];
+
+      //Compute the flux vector with hyperviscosity
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * u - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * w + p - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+//Use the fluxes to compute tendencies for each cell.
+//inds is assigned inside the loop body, so it is listed as private together with the other index temporaries.
+#pragma acc parallel loop collapse(3) private(indt, indf1, indf2, inds) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + (k) * (nx + 1) + i;
+        indf2 = ll * (nz + 1) * (nx + 1) + (k + 1) * (nx + 1) + i;
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dz;
+        //Gravity source term: acts on the vertical momentum only and uses the density perturbation of the cell
+        if (ll == ID_WMOM)
+        {
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
+          tend[indt] = tend[indt] - state[inds] * grav;
+        }
+      }
+    }
+  }
+}
+
+//Fill the hs halo columns on the left and right of every interior row of "state".
+//First the halos are filled periodically from the opposite side of the domain; then, on rank 0,
+//the left halo is overwritten with an injected inflow inside a band of heights around 3/4 of the domain height.
+void set_halo_values_x(double *state)
+{
+  int k, ll, ind_r, ind_u, ind_t, i;
+  double z;
+
+  //Periodic copy: left halo <- last two interior columns, right halo <- first two interior columns
+#pragma acc parallel loop collapse(2) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      //Start of haloed row k+hs for variable ll
+      const int row = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs);
+      state[row + 0] = state[row + nx + hs - 2];
+      state[row + 1] = state[row + nx + hs - 1];
+      state[row + nx + hs] = state[row + hs];
+      state[row + nx + hs + 1] = state[row + hs + 1];
+    }
+  }
+  ////////////////////////////////////////////////////
+
+  //NOTE(review): this loop runs on the host and touches "state" directly; it appears to rely on the
+  //managed-memory build flag in the accompanying Makefile to see the device-side values - confirm
+  if (myrank == 0)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      //Height of the centre of row k; it does not depend on i, so compute it once per row
+      z = (k_beg + k + 0.5) * dz;
+      //fabs() is used explicitly: abs() may resolve to the integer overload and truncate the distance
+      if (fabs(z - 3 * zlen / 4) <= zlen / 16)
+      {
+        for (i = 0; i < hs; i++)
+        {
+          ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          //Full density (perturbation + background) times a wind of 50 gives the x-momentum
+          state[ind_u] = (state[ind_r] + hy_dens_cell[k + hs]) * 50.;
+          //Full density times a potential temperature of 298, stored as a perturbation from the background rho*theta
+          state[ind_t] = (state[ind_r] + hy_dens_cell[k + hs]) * 298. - hy_dens_theta_cell[k + hs];
+        }
+      }
+    }
+  }
+}
+
+//Set this task's halo values in the z-direction.
+//There is no domain decomposition in the vertical direction, so only the bottom and top boundary rows are filled.
+//Fill the hs halo rows below and above the domain for every column of "state" (x halos included).
+//Vertical momentum is set to zero in the halo rows; every other variable copies the nearest interior row.
+void set_halo_values_z(double *state)
+{
+  int i, ll;
+  //Every (variable, column) pair is independent, so both loops are collapsed into one parallel loop
+#pragma acc parallel loop collapse(2) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      const int stride = nx + 2 * hs;                               //Length of one haloed row
+      const int col = ll * (nz + 2 * hs) * (nx + 2 * hs) + i;       //Column i in row 0 of variable ll
+      if (ll == ID_WMOM)
+      {
+        state[col + (0) * stride] = 0.;
+        state[col + (1) * stride] = 0.;
+        state[col + (nz + hs) * stride] = 0.;
+        state[col + (nz + hs + 1) * stride] = 0.;
+      }
+      else
+      {
+        //Bottom halo rows take the first interior row (hs); top halo rows take the last interior row (nz+hs-1)
+        state[col + (0) * stride] = state[col + (hs) * stride];
+        state[col + (1) * stride] = state[col + (hs) * stride];
+        state[col + (nz + hs) * stride] = state[col + (nz + hs - 1) * stride];
+        state[col + (nz + hs + 1) * stride] = state[col + (nz + hs - 1) * stride];
+      }
+    }
+  }
+}
+
+//Set up the simulation: grid spacing, the (single-rank) domain decomposition, the model arrays,
+//the time step, the initial fluid state and the hydrostatic background profiles.
+//Reads nx_glob and nz_glob, which must already be set. Terminates the program if an allocation fails.
+void init()
+{
+  int i, k, ii, kk, ll, inds, i_end;
+  double x, z, r, u, w, t, hr, ht, nper;
+
+  //Set the cell grid size
+  dx = xlen / nx_glob;
+  dz = zlen / nz_glob;
+
+  //This version runs as a single rank
+  nranks = 1;
+  myrank = 0;
+
+  // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0;
+
+  nper = ((double)nx_glob) / nranks;
+  i_beg = round(nper * (myrank));
+  i_end = round(nper * ((myrank) + 1)) - 1;
+  nx = i_end - i_beg + 1;
+  left_rank = myrank - 1;
+  if (left_rank == -1)
+    left_rank = nranks - 1;
+  right_rank = myrank + 1;
+  if (right_rank == nranks)
+    right_rank = 0;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+  // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+
+  k_beg = 0;
+  nz = nz_glob;
+
+  //Element counts are formed in size_t so that large grids cannot overflow int arithmetic
+  const size_t n_state = (size_t)(nx + 2 * hs) * (size_t)(nz + 2 * hs) * NUM_VARS;
+  const size_t n_flux = (size_t)(nx + 1) * (size_t)(nz + 1) * NUM_VARS;
+  const size_t n_tend = (size_t)nx * (size_t)nz * NUM_VARS;
+  const size_t n_cell = (size_t)(nz + 2 * hs);
+  const size_t n_int = (size_t)(nz + 1);
+
+  //Allocate the model data
+  state = (double *)malloc(n_state * sizeof(double));
+  state_tmp = (double *)malloc(n_state * sizeof(double));
+  flux = (double *)malloc(n_flux * sizeof(double));
+  tend = (double *)malloc(n_tend * sizeof(double));
+  hy_dens_cell = (double *)malloc(n_cell * sizeof(double));
+  hy_dens_theta_cell = (double *)malloc(n_cell * sizeof(double));
+  hy_dens_int = (double *)malloc(n_int * sizeof(double));
+  hy_dens_theta_int = (double *)malloc(n_int * sizeof(double));
+  hy_pressure_int = (double *)malloc(n_int * sizeof(double));
+
+  //Stop with a clear message instead of dereferencing a null pointer further down
+  if (!state || !state_tmp || !flux || !tend || !hy_dens_cell || !hy_dens_theta_cell ||
+      !hy_dens_int || !hy_dens_theta_int || !hy_pressure_int)
+  {
+    fprintf(stderr, "Error: failed to allocate the model arrays\n");
+    exit(EXIT_FAILURE);
+  }
+
+  //Define the maximum stable time step based on an assumed maximum wind speed
+  dt = dmin(dx, dz) / max_speed * cfl;
+  //Set initial elapsed model time and output_counter to zero
+  etime = 0.;
+  output_counter = 0.;
+
+  // Display grid information
+
+  printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
+  printf("dx,dz: %lf %lf\n", dx, dz);
+  printf("dt: %lf\n", dt);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
+  //////////////////////////////////////////////////////////////////////////
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      //Initialize the state to zero
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state[inds] = 0.;
+      }
+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
+      for (kk = 0; kk < nqpoints; kk++)
+      {
+        for (ii = 0; ii < nqpoints; ii++)
+        {
+          //Compute the x,z location within the global domain based on cell and quadrature index
+          x = (i_beg + i - hs + 0.5) * dx + (qpoints[ii] - 0.5) * dx;
+          z = (k_beg + k - hs + 0.5) * dz + (qpoints[kk] - 0.5) * dz;
+
+          //Set the fluid state based on the user's specification (default is injection in this example)
+          injection(x, z, r, u, w, t, hr, ht);
+
+          //Store into the fluid state array: each variable accumulates its weighted sample.
+          //Density and rho*theta are stored as perturbations from the hydrostatic background.
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + r * qweights[ii] * qweights[kk];
+          inds = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * u * qweights[ii] * qweights[kk];
+          inds = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * w * qweights[ii] * qweights[kk];
+          inds = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk];
+        }
+      }
+      //The scratch state starts as a copy of the initial state
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state_tmp[inds] = state[inds];
+      }
+    }
+  }
+  //Compute the hydrostatic background state over vertical cell averages
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    hy_dens_cell[k] = 0.;
+    hy_dens_theta_cell[k] = 0.;
+    for (kk = 0; kk < nqpoints; kk++)
+    {
+      //NOTE(review): z is the cell centre for every quadrature point (qpoints[kk] is not applied here) - confirm this is intended
+      z = (k_beg + k - hs + 0.5) * dz;
+
+      //Set the fluid state based on the user's specification (default is injection in this example)
+      injection(0., z, r, u, w, t, hr, ht);
+
+      hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr * ht * qweights[kk];
+    }
+  }
+  //Compute the hydrostatic background state at vertical cell interfaces
+  for (k = 0; k < nz + 1; k++)
+  {
+    z = (k_beg + k) * dz;
+
+    //Set the fluid state based on the user's specification (default is injection in this example)
+    injection(0., z, r, u, w, t, hr, ht);
+
+    hy_dens_int[k] = hr;
+    hy_dens_theta_int[k] = hr * ht;
+    hy_pressure_int[k] = C0 * pow((hr * ht), gamm);
+  }
+}
+
+//This test case is initially balanced but injects fast, cold air from the left boundary near the model top
+//x and z are input coordinates at which to sample
+//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+//hr and ht are output background hydrostatic density and potential temperature at that location
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht)
+{
+  //Background state: depends on height only, so x is not consulted here
+  hydro_const_theta(z, hr, ht);
+  //No perturbation on top of the background: density, winds and potential temperature anomalies are all zero
+  r = u = w = t = 0.;
+}
+
+//Establish hydrostatic balance using constant potential temperature (thermally neutral atmosphere)
+//z is the input coordinate
+//r and t are the output background hydrostatic density and potential temperature
+void hydro_const_theta(double z, double &r, double &t)
+{
+  const double theta0 = 300.; //Background potential temperature
+  const double exner0 = 1.;   //Surface-level Exner pressure
+
+  //Potential temperature is uniform with height
+  t = theta0;
+
+  //Integrate hydrostatic balance in terms of Exner pressure, then convert step by step to density
+  const double exner_z = exner0 - grav * z / (cp * theta0);       //Exner pressure at z
+  const double pressure_z = p0 * pow(exner_z, (cp / rd));         //Pressure at z
+  const double rho_theta_z = pow((pressure_z / C0), (1. / gamm)); //rho*theta at z
+
+  r = rho_theta_z / t; //Density at z
+}
+
+//Release every model array allocated by init().
+//Each file-scope pointer is reset to NULL after it is freed so that a stale
+//pointer is never left behind and a repeated call is harmless (free(NULL) is a no-op).
+void finalize()
+{
+  free(state);
+  state = NULL;
+  free(state_tmp);
+  state_tmp = NULL;
+  free(flux);
+  flux = NULL;
+  free(tend);
+  tend = NULL;
+  free(hy_dens_cell);
+  hy_dens_cell = NULL;
+  free(hy_dens_theta_cell);
+  hy_dens_theta_cell = NULL;
+  free(hy_dens_int);
+  hy_dens_int = NULL;
+  free(hy_dens_theta_int);
+  hy_dens_theta_int = NULL;
+  free(hy_pressure_int);
+  hy_pressure_int = NULL;
+}

+ 11 - 0
hpc/miniprofiler/English/C/source_code/solutions/Makefile

@@ -0,0 +1,11 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
+
+# Build the OpenACC miniWeather executable with the PGI C++ compiler.
+CC := pgc++
+# -w silences compiler warnings
+CFLAGS := -O3 -w
+# Target NVIDIA GPUs with managed memory and print accelerator compilation feedback
+ACCFLAGS := -ta=tesla:managed -Minfo=accel
+
+# "clean" does not produce a file; mark it phony so a stray file named "clean" cannot disable it
+.PHONY: clean
+
+miniWeather: miniWeather_openacc.cpp
+	${CC} ${CFLAGS} ${ACCFLAGS} -o $@ $<
+
+clean:
+	rm -f *.o miniWeather

+ 649 - 0
hpc/miniprofiler/English/C/source_code/solutions/miniWeather_openacc.cpp

@@ -0,0 +1,649 @@
+//////////////////////////////////////////////////////////////////////////////////////////
+// miniWeather
+// Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
+// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
+// For documentation, please see the attached documentation in the "documentation" folder
+//////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+** Copyright (c) 2018, National Center for Computational Sciences, Oak Ridge National Laboratory.  All rights reserved.
+**
+** Portions Copyright (c) 2020, NVIDIA Corporation.  All rights reserved.
+*/
+
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <nvtx3/nvToolsExt.h>
+
+const double pi = 3.14159265358979323846264338327;   //Pi
+const double grav = 9.8;                             //Gravitational acceleration (m / s^2)
+const double cp = 1004.;                             //Specific heat of dry air at constant pressure
+const double rd = 287.;                              //Dry air constant for equation of state (P=rho*rd*T)
+const double p0 = 1.e5;                              //Standard pressure at the surface in Pascals
+const double C0 = 27.5629410929725921310572974482;   //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma)
+const double gamm = 1.40027894002789400278940027894; //gamma = cp/(cp-rd) (numerically 1004/717); named "gamm" because the name "gamma" is already taken
+//Define domain and stability-related constants
+const double xlen = 2.e4;     //Length of the domain in the x-direction (meters)
+const double zlen = 1.e4;     //Length of the domain in the z-direction (meters)
+const double hv_beta = 0.25;  //How strong to diffuse the solution: hv_beta \in [0:1]
+const double cfl = 1.50;      //"Courant, Friedrichs, Lewy" number (for numerical stability)
+const double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
+const int hs = 2;             //"Halo" size: number of cells needed for a full "stencil" of information for reconstruction
+const int sten_size = 4;      //Size of the stencil used for interpolation
+
+//Parameters for indexing and flags
+const int NUM_VARS = 4; //Number of fluid state variables
+const int ID_DENS = 0;  //index for density ("rho")
+const int ID_UMOM = 1;  //index for momentum in the x-direction ("rho * u")
+const int ID_WMOM = 2;  //index for momentum in the z-direction ("rho * w")
+const int ID_RHOT = 3;  //index for density * potential temperature ("rho * theta")
+const int DIR_X = 1;    //Integer constant to express that this operation is in the x-direction
+const int DIR_Z = 2;    //Integer constant to express that this operation is in the z-direction
+//Quadrature rule used by init(): nqpoints sample positions inside a unit cell and their weights
+const int nqpoints = 3; //Number of quadrature points per direction
+double qpoints[] = {0.112701665379258311482073460022E0, 0.500000000000000000000000000000E0, 0.887298334620741688517926539980E0};
+double qweights[] = {0.277777777777777777777777777779E0, 0.444444444444444444444444444444E0, 0.277777777777777777777777777779E0};
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are set during initialization and (apart from dt) stay fixed during the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double sim_time;            //total simulation time in seconds
+double output_freq;         //frequency to perform output in seconds
+double dt;                  //Model time step (seconds); main() shortens it for the final step
+int nx, nz;                 //Number of local grid cells in the x- and z- dimensions
+double dx, dz;              //Grid space length in x- and z-dimension (meters)
+int nx_glob, nz_glob;       //Number of total grid cells in the x- and z- dimensions
+int i_beg, k_beg;           //beginning index in the x- and z-directions
+int nranks, myrank;         //Number of ranks and this rank's id (init() sets nranks = 1, myrank = 0)
+int left_rank, right_rank;  //Rank IDs that exist to my left and right in the global domain
+double *hy_dens_cell;       //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+double *hy_dens_int;        //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+double *hy_dens_theta_int;  //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+double *hy_pressure_int;    //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are dynamic over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+double etime;          //Elapsed model time
+double output_counter; //Helps determine when it's time to do output
+//Runtime variable arrays
+double *state;     //Fluid state.             Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *state_tmp; //Intermediate fluid state used between Runge-Kutta stages. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS)
+double *flux;      //Cell interface fluxes.   Dimensions: (nx+1,nz+1,NUM_VARS)
+double *tend;      //Fluid state tendencies.  Dimensions: (nx,nz,NUM_VARS)
+int num_out = 0;   //The number of outputs performed so far
+int direction_switch = 1; //Nonzero: perform_timestep sweeps x before z; it flips this flag after every step
+
+//Minimum of two doubles: yields a only when a < b, otherwise b
+double dmin(double a, double b)
+{
+  return (a < b) ? a : b;
+}
+
+//Forward declarations of the functions defined after "main"
+void init();
+void finalize();
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht);
+void hydro_const_theta(double z, double &r, double &t);
+void output(double *state, double etime); //declared only: no definition in this file, and its calls in main() are commented out
+void ncwrap(int ierr, int line);          //declared only: no definition or call in this file
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt);
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend);
+void compute_tendencies_x(double *state, double *flux, double *tend);
+void compute_tendencies_z(double *state, double *flux, double *tend);
+void set_halo_values_x(double *state);
+void set_halo_values_z(double *state);
+
+///////////////////////////////////////////////////////////////////////////////////////
+// THE MAIN PROGRAM STARTS HERE
+///////////////////////////////////////////////////////////////////////////////////////
+//Usage: miniWeather [nx_glob nz_glob sim_time]
+//With exactly three arguments the grid size and simulated time are taken from the command line;
+//with any other argument count the defaults below are used.
+//The whole run is wrapped in NVTX ranges ("Total", "while", "perform_timestep").
+int main(int argc, char **argv)
+{
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // BEGIN USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //The x-direction length is twice as long as the z-direction length
+  //So, you'll want to have nx_glob be twice as large as nz_glob
+  nx_glob = 40;      //Number of total cells in the x-direction
+  nz_glob = 20;      //Number of total cells in the z-direction
+  sim_time = 1000;   //How many seconds to run the simulation
+  output_freq = 100; //How frequently to output data to file (in seconds)
+  ///////////////////////////////////////////////////////////////////////////////////////
+  // END USER-CONFIGURABLE PARAMETERS
+  ///////////////////////////////////////////////////////////////////////////////////////
+
+  if (argc == 4)
+  {
+    printf("The arguments supplied are %s %s %s\n", argv[1], argv[2], argv[3]);
+    nx_glob = atoi(argv[1]);
+    nz_glob = atoi(argv[2]);
+    sim_time = atoi(argv[3]);
+  }
+  else
+  {
+    printf("Using default values ...\n");
+  }
+
+  nvtxRangePushA("Total");
+  init();
+
+  //Keep every model array on the device for the whole time loop.
+  //Each array section is written as [start:length]; a bare name[n] would denote the single
+  //element n rather than the first n elements.
+#pragma acc data copyin(state_tmp[0:(nz + 2 * hs) * (nx + 2 * hs) * NUM_VARS], hy_dens_cell[0:nz + 2 * hs], hy_dens_theta_cell[0:nz + 2 * hs], hy_dens_int[0:nz + 1], hy_dens_theta_int[0:nz + 1], hy_pressure_int[0:nz + 1]) \
+    create(flux[0:(nz + 1) * (nx + 1) * NUM_VARS], tend[0:nz * nx * NUM_VARS])                                                                                                                                                 \
+    copy(state[0:(nz + 2 * hs) * (nx + 2 * hs) * NUM_VARS])
+  {
+    //Output the initial state
+    //output(state, etime);
+
+    ////////////////////////////////////////////////////
+    // MAIN TIME STEP LOOP
+    ////////////////////////////////////////////////////
+
+    nvtxRangePushA("while");
+    while (etime < sim_time)
+    {
+      //If the time step leads to exceeding the simulation time, shorten it for the last step
+      if (etime + dt > sim_time)
+      {
+        dt = sim_time - etime;
+      }
+
+      //Perform a single time step
+      nvtxRangePushA("perform_timestep");
+      perform_timestep(state, state_tmp, flux, tend, dt);
+      nvtxRangePop();
+
+      //Inform the user (etime is printed before it is advanced below)
+
+      printf("Elapsed Time: %lf / %lf\n", etime, sim_time);
+
+      //Update the elapsed time and output counter
+      etime = etime + dt;
+      output_counter = output_counter + dt;
+      //If it's time for output, reset the counter, and do output
+
+      if (output_counter >= output_freq)
+      {
+        output_counter = output_counter - output_freq;
+        //Bring the current state back to the host, where output() would read it
+#pragma acc update host(state[0:(nz + 2 * hs) * (nx + 2 * hs) * NUM_VARS])
+        //output(state, etime);
+      }
+    }
+    nvtxRangePop();
+  }
+  finalize();
+  nvtxRangePop();
+  return 0;
+}
+
+//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator
+//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the
+//order of directions is alternated each time step.
+//The Runge-Kutta method applied in each direction is:
+// q*     = q[n] + dt/3 * rhs(q[n])
+// q**    = q[n] + dt/2 * rhs(q*  )
+// q[n+1] = q[n] + dt/1 * rhs(q** )
+//state is updated in place; state_tmp holds q* and q**; flux and tend are scratch arrays.
+void perform_timestep(double *state, double *state_tmp, double *flux, double *tend, double dt)
+{
+  //Sweep order for this step: x then z while direction_switch is set, z then x otherwise
+  int sweep[2];
+  sweep[0] = direction_switch ? DIR_X : DIR_Z;
+  sweep[1] = direction_switch ? DIR_Z : DIR_X;
+
+  for (int pass = 0; pass < 2; pass++)
+  {
+    const int dir = sweep[pass];
+    semi_discrete_step(state, state, state_tmp, dt / 3, dir, flux, tend);     //q*
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, dir, flux, tend); //q**
+    semi_discrete_step(state, state_tmp, state, dt / 1, dir, flux, tend);     //q[n+1]
+  }
+
+  //Use the opposite sweep order on the next call
+  direction_switch = !direction_switch;
+}
+
+//Perform a single semi-discretized step in time with the form:
+//  state_out = state_init + dt * rhs(state_forcing)
+//The halo of state_forcing is refreshed and the tendencies are evaluated along dir
+//(DIR_X or DIR_Z); the result is then written to the interior cells of state_out.
+//flux and tend are scratch arrays overwritten by the tendency routines.
+void semi_discrete_step(double *state_init, double *state_forcing, double *state_out, double dt, int dir, double *flux, double *tend)
+{
+  int i, k, ll, inds, indt;
+
+  switch (dir)
+  {
+  case DIR_X:
+    //Halo exchange and tendencies in the x-direction
+    set_halo_values_x(state_forcing);
+    compute_tendencies_x(state_forcing, flux, tend);
+    break;
+  case DIR_Z:
+    //Halo exchange and tendencies in the z-direction
+    set_halo_values_z(state_forcing);
+    compute_tendencies_z(state_forcing, flux, tend);
+    break;
+  default:
+    //Any other value: tend is used as it currently stands
+    break;
+  }
+
+  //Apply the tendencies to the fluid state, one device iteration per (variable, k, i) cell
+#pragma acc parallel loop collapse(3) private(inds, indt) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        //inds addresses the halo-padded state arrays, indt the unpadded tendency array
+        inds = (ll * (nz + 2 * hs) + (k + hs)) * (nx + 2 * hs) + i + hs;
+        indt = (ll * nz + k) * nx + i;
+        state_out[inds] = state_init[inds] + dt * tend[indt];
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the x-direction
+//state is read (including its x-halo columns), flux is overwritten as scratch, tend receives the result
+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+void compute_tendencies_x(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient (dt here is the file-level model time step)
+  hv_coef = -hv_beta * dx / (16 * dt);
+  /////////////////////////////////////////////////
+  // Offloaded kernel: one iteration per (k,i) x-interface, nx+1 interfaces per row
+  /////////////////////////////////////////////////
+  //Compute fluxes in the x-direction for each cell
+#pragma acc parallel loop collapse(2) private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p) default(present)
+  for (k = 0; k < nz; k++)
+  {
+    for (i = 0; i < nx + 1; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        for (s = 0; s < sten_size; s++)
+        {
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + s; //padded columns i..i+3 of row k+hs
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      r = vals[ID_DENS] + hy_dens_cell[k + hs];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_cell[k + hs]) / r;
+      p = C0 * pow((r * t), gamm);
+
+      //Compute the flux vector
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * u + p - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * w - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * u * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+/////////////////////////////////////////////////
+// Offloaded kernel: one iteration per (variable, k, i) cell
+/////////////////////////////////////////////////
+//Use the fluxes to compute tendencies for each cell
+#pragma acc parallel loop collapse(3) private(indt, indf1, indf2) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i;     //flux at interface i of cell i
+        indf2 = ll * (nz + 1) * (nx + 1) + k * (nx + 1) + i + 1; //flux at interface i+1 of cell i
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dx;
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//state is read (including its z-halo rows), flux is overwritten as scratch, tend receives the result
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes, adding the gravity source term to the w-momentum
+void compute_tendencies_z(double *state, double *flux, double *tend)
+{
+  int i, k, ll, s, inds, indf1, indf2, indt;
+  double r, u, w, t, p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
+  //Compute the hyperviscosity coefficient.
+  //The stencil below runs along z, so the grid spacing that scales it is dz
+  //(this equals the x-direction value whenever dx == dz, e.g. with the default 2:1 grid).
+  hv_coef = -hv_beta * dz / (16 * dt);
+
+  //Compute fluxes in the z-direction at each of the nz+1 vertical interfaces of every column
+#pragma acc parallel loop collapse(2) private(ll, s, inds, stencil, vals, d3_vals, r, u, w, t, p) default(present)
+  for (k = 0; k < nz + 1; k++)
+  {
+    for (i = 0; i < nx; i++)
+    {
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        for (s = 0; s < sten_size; s++)
+        {
+          //Padded rows k..k+3 of column i+hs
+          inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + s) * (nx + 2 * hs) + i + hs;
+          stencil[s] = state[inds];
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0] / 12 + 7 * stencil[1] / 12 + 7 * stencil[2] / 12 - stencil[3] / 12;
+        //First-order-accurate interpolation of the third spatial derivative of the state
+        d3_vals[ll] = -stencil[0] + 3 * stencil[1] - 3 * stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      //The interface-valued hydrostatic background is added back; p is the departure from hydrostatic pressure
+      r = vals[ID_DENS] + hy_dens_int[k];
+      u = vals[ID_UMOM] / r;
+      w = vals[ID_WMOM] / r;
+      t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      p = C0 * pow((r * t), gamm) - hy_pressure_int[k];
+
+      //Compute the flux vector with hyperviscosity
+      flux[ID_DENS * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w - hv_coef * d3_vals[ID_DENS];
+      flux[ID_UMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * u - hv_coef * d3_vals[ID_UMOM];
+      flux[ID_WMOM * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * w + p - hv_coef * d3_vals[ID_WMOM];
+      flux[ID_RHOT * (nz + 1) * (nx + 1) + k * (nx + 1) + i] = r * w * t - hv_coef * d3_vals[ID_RHOT];
+    }
+  }
+
+  //Use the fluxes to compute tendencies for each cell.
+  //inds is assigned inside the loop, so it is listed as private along with the other index temporaries.
+#pragma acc parallel loop collapse(3) private(indt, indf1, indf2, inds) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < nx; i++)
+      {
+        indt = ll * nz * nx + k * nx + i;
+        indf1 = ll * (nz + 1) * (nx + 1) + (k) * (nx + 1) + i;
+        indf2 = ll * (nz + 1) * (nx + 1) + (k + 1) * (nx + 1) + i;
+        tend[indt] = -(flux[indf2] - flux[indf1]) / dz;
+        if (ll == ID_WMOM)
+        {
+          //Gravity source term: density perturbation of this cell times grav
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i + hs;
+          tend[indt] = tend[indt] - state[inds] * grav;
+        }
+      }
+    }
+  }
+}
+
+//Set the halo values of state in the x-direction.
+//1) Periodic wrap: the two left halo columns copy the last two interior columns and the
+//   two right halo columns copy the first two interior columns, for every variable and row.
+//2) Injection: on rank 0, inside the band |z - 3*zlen/4| <= zlen/16, the left halo columns get
+//   an x-momentum of (total density)*50 and a rho*theta perturbation corresponding to theta = 298.
+void set_halo_values_x(double *state)
+{
+  int k, ll, ind_r, ind_u, ind_t, i;
+  double z;
+
+#pragma acc parallel loop collapse(2) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + 0] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs - 2];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + 1] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs - 1];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + hs];
+      state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + nx + hs + 1] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + hs + 1];
+    }
+  }
+  ////////////////////////////////////////////////////
+
+  //NOTE(review): this loop carries no OpenACC directive, so it executes on the host although
+  //state is otherwise kept in a device data region; presumably it relies on the managed-memory
+  //build - confirm before building without managed memory.
+  if (myrank == 0)
+  {
+    for (k = 0; k < nz; k++)
+    {
+      for (i = 0; i < hs; i++)
+      {
+        z = (k_beg + k + 0.5) * dz;
+        //fabs, not abs: the argument is a double and must not be routed to the integer overload
+        if (fabs(z - 3 * zlen / 4) <= zlen / 16)
+        {
+          ind_r = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_u = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          ind_t = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + (k + hs) * (nx + 2 * hs) + i;
+          state[ind_u] = (state[ind_r] + hy_dens_cell[k + hs]) * 50.;
+          state[ind_t] = (state[ind_r] + hy_dens_cell[k + hs]) * 298. - hy_dens_theta_cell[k + hs];
+        }
+      }
+    }
+  }
+}
+
+//Set the halo values of state in the z-direction.
+//For the w-momentum the two bottom and two top halo rows are set to zero; for every other
+//variable they copy the nearest interior row (row hs at the bottom, row nz+hs-1 at the top).
+//All nx + 2*hs columns are treated, including the x-halo columns.
+void set_halo_values_z(double *state)
+{
+  int i, ll;
+  //Both loops are collapsed because the outer one has only NUM_VARS iterations;
+  //every (ll, i) pair writes its own halo cells and reads interior rows only.
+#pragma acc parallel loop collapse(2) default(present)
+  for (ll = 0; ll < NUM_VARS; ll++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      if (ll == ID_WMOM)
+      {
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (0) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (1) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs) * (nx + 2 * hs) + i] = 0.;
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs + 1) * (nx + 2 * hs) + i] = 0.;
+      }
+      else
+      {
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (0) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (hs) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (1) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (hs) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs - 1) * (nx + 2 * hs) + i];
+        state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs + 1) * (nx + 2 * hs) + i] = state[ll * (nz + 2 * hs) * (nx + 2 * hs) + (nz + hs - 1) * (nx + 2 * hs) + i];
+      }
+    }
+  }
+}
+
+//Set up the grid, allocate the model arrays and fill in the initial and background states.
+//Reads nx_glob and nz_glob (set in main) and defines dx, dz, dt, nx, nz, the rank bookkeeping,
+//etime and output_counter, then initializes state, state_tmp and the hydrostatic background arrays.
+//The program exits with EXIT_FAILURE if any allocation fails.
+void init()
+{
+  int i, k, ii, kk, ll, inds, i_end;
+  double x, z, r, u, w, t, hr, ht, nper;
+
+  //Set the cell grid size
+  dx = xlen / nx_glob;
+  dz = zlen / nz_glob;
+
+  //Single rank: this process owns the whole domain
+  nranks = 1;
+  myrank = 0;
+
+  // For simpler version, replace i_beg = 0, nx = nx_glob, left_rank = 0, right_rank = 0;
+
+  nper = ((double)nx_glob) / nranks;
+  i_beg = round(nper * (myrank));
+  i_end = round(nper * ((myrank) + 1)) - 1;
+  nx = i_end - i_beg + 1;
+  left_rank = myrank - 1;
+  if (left_rank == -1)
+    left_rank = nranks - 1;
+  right_rank = myrank + 1;
+  if (right_rank == nranks)
+    right_rank = 0;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+  // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE
+  ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
+
+  k_beg = 0;
+  nz = nz_glob;
+
+  //Allocate the model data.
+  //Element counts are formed in size_t so the byte counts handed to malloc are not computed in int arithmetic.
+  const size_t n_state = (size_t)(nx + 2 * hs) * (size_t)(nz + 2 * hs) * NUM_VARS;
+  const size_t n_flux = (size_t)(nx + 1) * (size_t)(nz + 1) * NUM_VARS;
+  const size_t n_tend = (size_t)nx * (size_t)nz * NUM_VARS;
+  const size_t n_cell = (size_t)(nz + 2 * hs);
+  const size_t n_int = (size_t)(nz + 1);
+
+  state = (double *)malloc(n_state * sizeof(double));
+  state_tmp = (double *)malloc(n_state * sizeof(double));
+  flux = (double *)malloc(n_flux * sizeof(double));
+  tend = (double *)malloc(n_tend * sizeof(double));
+  hy_dens_cell = (double *)malloc(n_cell * sizeof(double));
+  hy_dens_theta_cell = (double *)malloc(n_cell * sizeof(double));
+  hy_dens_int = (double *)malloc(n_int * sizeof(double));
+  hy_dens_theta_int = (double *)malloc(n_int * sizeof(double));
+  hy_pressure_int = (double *)malloc(n_int * sizeof(double));
+
+  //Stop here rather than dereferencing a null pointer in the loops below
+  if (!state || !state_tmp || !flux || !tend || !hy_dens_cell || !hy_dens_theta_cell ||
+      !hy_dens_int || !hy_dens_theta_int || !hy_pressure_int)
+  {
+    fprintf(stderr, "init: failed to allocate the model arrays\n");
+    exit(EXIT_FAILURE);
+  }
+
+  //Define the maximum stable time step based on an assumed maximum wind speed
+  dt = dmin(dx, dz) / max_speed * cfl;
+  //Set initial elapsed model time and output_counter to zero
+  etime = 0.;
+  output_counter = 0.;
+
+  // Display grid information
+
+  printf("nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
+  printf("dx,dz: %lf %lf\n", dx, dz);
+  printf("dt: %lf\n", dt);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
+  //////////////////////////////////////////////////////////////////////////
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    for (i = 0; i < nx + 2 * hs; i++)
+    {
+      //Initialize the state to zero
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state[inds] = 0.;
+      }
+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
+      for (kk = 0; kk < nqpoints; kk++)
+      {
+        for (ii = 0; ii < nqpoints; ii++)
+        {
+          //Compute the x,z location within the global domain based on cell and quadrature index
+          x = (i_beg + i - hs + 0.5) * dx + (qpoints[ii] - 0.5) * dx;
+          z = (k_beg + k - hs + 0.5) * dz + (qpoints[kk] - 0.5) * dz;
+
+          //Set the fluid state based on the user's specification (default is injection in this example)
+          injection(x, z, r, u, w, t, hr, ht);
+
+          //Store into the fluid state array
+          inds = ID_DENS * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + r * qweights[ii] * qweights[kk];
+          inds = ID_UMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * u * qweights[ii] * qweights[kk];
+          inds = ID_WMOM * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + (r + hr) * w * qweights[ii] * qweights[kk];
+          inds = ID_RHOT * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+          state[inds] = state[inds] + ((r + hr) * (t + ht) - hr * ht) * qweights[ii] * qweights[kk];
+        }
+      }
+      //state_tmp starts out as a copy of state
+      for (ll = 0; ll < NUM_VARS; ll++)
+      {
+        inds = ll * (nz + 2 * hs) * (nx + 2 * hs) + k * (nx + 2 * hs) + i;
+        state_tmp[inds] = state[inds];
+      }
+    }
+  }
+  //Compute the hydrostatic background state over vertical cell averages
+  for (k = 0; k < nz + 2 * hs; k++)
+  {
+    hy_dens_cell[k] = 0.;
+    hy_dens_theta_cell[k] = 0.;
+    for (kk = 0; kk < nqpoints; kk++)
+    {
+      //Note: z is the cell-centre height for every kk; it does not depend on qpoints[kk]
+      z = (k_beg + k - hs + 0.5) * dz;
+
+      //Set the fluid state based on the user's specification (default is injection in this example)
+      injection(0., z, r, u, w, t, hr, ht);
+
+      hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr * ht * qweights[kk];
+    }
+  }
+  //Compute the hydrostatic background state at vertical cell interfaces
+  for (k = 0; k < nz + 1; k++)
+  {
+    z = (k_beg + k) * dz;
+
+    //Set the fluid state based on the user's specification (default is injection in this example)
+    injection(0., z, r, u, w, t, hr, ht);
+
+    hy_dens_int[k] = hr;
+    hy_dens_theta_int[k] = hr * ht;
+    hy_pressure_int[k] = C0 * pow((hr * ht), gamm);
+  }
+}
+
+//Injection test case: the initial state is balanced; the fast, cold inflow from the left
+//boundary near the model top is imposed separately by set_halo_values_x.
+//x, z    : input coordinates at which to sample (x is not used by this routine)
+//r,u,w,t : output density, u-wind, w-wind and potential temperature at that location (all set to zero here)
+//hr, ht  : output background hydrostatic density and potential temperature at that location
+void injection(double x, double z, double &r, double &u, double &w, double &t, double &hr, double &ht)
+{
+  //The background profile comes from the constant-potential-temperature atmosphere
+  hydro_const_theta(z, hr, ht);
+  //The sampled fields themselves start out as zero everywhere
+  r = t = u = w = 0.;
+}
+
+//Establish hydrostatic balance using constant potential temperature (thermally neutral atmosphere)
+//z : input vertical coordinate
+//r : output background hydrostatic density at z
+//t : output background potential temperature at z (always theta0)
+void hydro_const_theta(double z, double &r, double &t)
+{
+  const double theta0 = 300.; //Background potential temperature
+  const double exner0 = 1.;   //Surface-level Exner pressure
+
+  //Potential temperature does not vary with height
+  t = theta0;
+  //Hydrostatic balance expressed through the Exner pressure, then converted to pressure
+  const double exner_z = exner0 - grav * z / (cp * theta0);
+  const double pres_z = p0 * pow(exner_z, (cp / rd));
+  //Invert P = C0*(rho*theta)**gamm for rho*theta, then divide out theta to get the density
+  const double rho_theta = pow((pres_z / C0), (1. / gamm));
+  r = rho_theta / t;
+}
+
+//Release every model array allocated by init().
+//Each pointer is reset to NULL after being freed so that a repeated call to
+//finalize() (free(NULL) is a no-op) or a late use cannot double-free or read freed memory.
+void finalize()
+{
+  free(state);
+  state = NULL;
+  free(state_tmp);
+  state_tmp = NULL;
+  free(flux);
+  flux = NULL;
+  free(tend);
+  tend = NULL;
+  free(hy_dens_cell);
+  hy_dens_cell = NULL;
+  free(hy_dens_theta_cell);
+  hy_dens_theta_cell = NULL;
+  free(hy_dens_int);
+  hy_dens_int = NULL;
+  free(hy_dens_theta_int);
+  hy_dens_theta_int = NULL;
+  free(hy_pressure_int);
+  hy_pressure_int = NULL;
+}

+ 0 - 0
hpc/miniprofiler/English/C/source_code/solutions/miniWeather_openacc_exr2.cpp


Some files were not shown because too many files changed in this diff