jacobi_memcpy.cu

/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <sstream>

#include <omp.h>
#include <nvToolsExt.h>

#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32

#define CUDA_RT_CALL(call)                                                         \
    {                                                                              \
        cudaError_t cudaStatus = call;                                             \
        if (cudaSuccess != cudaStatus)                                             \
            fprintf(stderr,                                                        \
                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "     \
                    "with "                                                        \
                    "%s (%d).\n",                                                  \
                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus),     \
                    cudaStatus);                                                   \
    }
constexpr int MAX_NUM_DEVICES = 32;
constexpr float tol = 1.0e-8;

const float PI = 2.0 * std::asin(1.0);

__global__ void initialize_boundaries(float* a_new, float* a, const float pi, const int offset,
                                      const int nx, const int my_ny, const int ny) {
    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
        a[iy * nx + 0] = y0;
        a[iy * nx + (nx - 1)] = y0;
        a_new[iy * nx + 0] = y0;
        a_new[iy * nx + (nx - 1)] = y0;
    }
}
__global__ void jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start,
                              const int iy_end, const int nx) {
    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
    __shared__ float block_l2_sum[BLOCK_DIM_X * BLOCK_DIM_Y];
    unsigned thread_index = threadIdx.y * BLOCK_DIM_X + threadIdx.x;

    if (iy < iy_end && ix < (nx - 1)) {
        // Update grid point
        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
        a_new[iy * nx + ix] = new_val;
        float residue = new_val - a[iy * nx + ix];
        // Set block-level L2 norm value for this grid point
        block_l2_sum[thread_index] = residue * residue;
    } else {
        block_l2_sum[thread_index] = 0;
    }

    // Reduce L2 norm for the block in parallel
    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
        __syncthreads();
        if ((thread_index) % (2 * stride) == 0) {
            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
        }
    }

    // Atomically update global L2 norm with block-reduced L2 norm
    if (thread_index == 0) {
        atomicAdd(l2_norm, block_l2_sum[0]);
    }
}
int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
    int argval = default_val;
    char** itr = std::find(begin, end, arg);
    if (itr != end && ++itr != end) {
        std::istringstream inbuf(*itr);
        inbuf >> argval;
    }
    return argval;
}
int get_parsed_vals(char** begin, char** end, int* devices, const std::string& arg,
                    const int default_val) {
    int numGPUs = default_val;
    char** itr = std::find(begin, end, arg);
    if (itr != end && ++itr != end) {
        // Parse a comma-separated list of device IDs, e.g. "-gpus 0,2,3"
        numGPUs = 0;
        std::string dev_ids(*itr);
        int currpos = 0, nextpos = 0;
        do {
            nextpos = dev_ids.find_first_of(",", currpos);
            devices[numGPUs] = std::stoi(dev_ids.substr(currpos, nextpos - currpos));
            numGPUs++;
            currpos = nextpos + 1;
        } while (nextpos != std::string::npos);
    } else {
        // No mapping supplied: use the first default_val devices in order
        for (int i = 0; i < numGPUs; i++) {
            devices[i] = i;
        }
    }
    return numGPUs;
}
bool get_arg(char** begin, char** end, const std::string& arg) {
    char** itr = std::find(begin, end, arg);
    if (itr != end) {
        return true;
    }
    return false;
}
double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);

int main(int argc, char* argv[]) {
    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
    const bool p2p = get_arg(argv, argv + argc, "-p2p");

    // Get GPU mapping from runtime arguments
    int available_devices = 0;
    CUDA_RT_CALL(cudaGetDeviceCount(&available_devices));
    int devices[MAX_NUM_DEVICES];
    int num_devices = get_parsed_vals(argv, argv + argc, devices, "-gpus", available_devices);

    float* a[MAX_NUM_DEVICES];
    float* a_new[MAX_NUM_DEVICES];
    float* a_ref_h;
    float* a_h;
    double runtime_serial = 0.0;
    float* l2_norm_d[MAX_NUM_DEVICES];
    float* l2_norm_h[MAX_NUM_DEVICES];
    int iy_start[MAX_NUM_DEVICES];
    int iy_end[MAX_NUM_DEVICES];
    int chunk_size[MAX_NUM_DEVICES];

    // Compute chunk size and allocate memory on GPUs
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
        CUDA_RT_CALL(cudaFree(0));
        if (0 == dev_id) {
            // Allocate memory on host and record single-GPU timings
            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
        }
        // The ny - 2 interior rows are distributed amongst `size` ranks (here, `size` is
        // num_devices) in such a way that each rank gets either (ny - 2) / size or
        // (ny - 2) / size + 1 rows. This optimizes load balancing when (ny - 2) % size != 0
        int chunk_size_low = (ny - 2) / num_devices;
        int chunk_size_high = chunk_size_low + 1;
        // To calculate the number of ranks that need to compute an extra row,
        // the following formula is derived from this equation:
        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);
        if (dev_id < num_ranks_low)
            chunk_size[dev_id] = chunk_size_low;
        else
            chunk_size[dev_id] = chunk_size_high;
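        // Worked example (hypothetical values): with ny = 12 and 3 devices there are
        // ny - 2 = 10 interior rows, so chunk_size_low = 10 / 3 = 3, chunk_size_high = 4,
        // and num_ranks_low = 3 * 3 + 3 - 10 = 2. Devices 0 and 1 take 3 rows each and
        // device 2 takes 4 rows (3 + 3 + 4 = 10 rows in total).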
        // Allocate memory on GPU
        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));

        // Calculate local domain boundaries
        int iy_start_global;  // My start index in the global array
        if (dev_id < num_ranks_low) {
            iy_start_global = dev_id * chunk_size_low + 1;
        } else {
            iy_start_global =
                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
        }
        iy_start[dev_id] = 1;
        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];

        // Set Dirichlet boundary conditions on the left and right borders
        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
        CUDA_RT_CALL(cudaGetLastError());
        CUDA_RT_CALL(cudaDeviceSynchronize());

        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
        if (p2p == true) {
            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
            int canAccessPeer = 0;
            // TODO: Part 2- Check whether GPU "devices[dev_id]" can access peer "devices[top]".
            // Fill and uncomment the line below
            //CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, /*Fill me*/, /*Fill me*/));
            if (canAccessPeer) {
                // TODO: Part 2- Enable peer access from GPU "devices[dev_id]" to "devices[top]".
                // Fill and uncomment the line below
                //CUDA_RT_CALL(cudaDeviceEnablePeerAccess(/*Fill me*/, 0));
            }
            const int bottom = (dev_id + 1) % num_devices;
            if (top != bottom) {
                canAccessPeer = 0;
                // TODO: Part 2- Check and enable peer access from GPU "devices[dev_id]" to
                // "devices[bottom]", whenever possible. Fill and uncomment the lines below
                //CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, /*Fill me*/, /*Fill me*/));
                if (canAccessPeer) {
                    //CUDA_RT_CALL(cudaDeviceEnablePeerAccess(/*Fill me*/, 0));
                }
            }
        }
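        // For reference, a generic peer-access setup between two hypothetical device
        // ordinals devA and devB (illustration only, not the exercise fill-in values)
        // follows this pattern:
        //     int can = 0;
        //     CUDA_RT_CALL(cudaDeviceCanAccessPeer(&can, devA, devB));
        //     if (can) CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devB, 0));
        // Note that cudaDeviceEnablePeerAccess() acts on the currently selected device.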
        CUDA_RT_CALL(cudaDeviceSynchronize());
    }

    // Share initial top and bottom local grid-point values between neighbours
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
        const int bottom = (dev_id + 1) % num_devices;
        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
                                a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
                                cudaMemcpyDeviceToDevice));
        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
                                nx * sizeof(float), cudaMemcpyDeviceToDevice));
    }
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
        CUDA_RT_CALL(cudaDeviceSynchronize());
    }
    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);

    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
    int iter = 0;
    float l2_norm = 1.0;
    double start = omp_get_wtime();
    nvtxRangePush("Jacobi solve");
    while (l2_norm > tol && iter < iter_max) {
        // Launch device kernel on each GPU
        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            // TODO: Part 1- Set current GPU to be "devices[dev_id]"
            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
            CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float)));
            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
            // TODO: Part 1- Call the Jacobi kernel with "dim_grid" blocks in the grid and
            // "dim_block" threads per block. The "dev_id" variable indexes the memory
            // allocated for the current GPU.
            jacobi_kernel<<</*Fill me*/, /*Fill me*/>>>(/*Fill me*/);
            // TODO: Part 1- Copy the GPU-local L2 norm "l2_norm_d" back to the CPU buffer "l2_norm_h"
            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, sizeof(float), /*Fill me*/));
        }
        // Launch async memory copy operations for halo exchange and
        // for copying local-grid L2 norm from each GPU to the host
        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
            const int bottom = (dev_id + 1) % num_devices;
            // TODO: Part 1- Set current GPU
            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
            // TODO: Part 1- Implement halo exchange with the top neighbour "top"
            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float), /*Fill me*/));
            // TODO: Part 1- Implement halo exchange with the bottom neighbour "bottom"
            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float), /*Fill me*/));
        }
        l2_norm = 0.0;
        // Synchronize devices and compute global L2 norm
        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            // TODO: Part 1- Set current GPU and call cudaDeviceSynchronize()
            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
            CUDA_RT_CALL(/*Fill me*/);
            l2_norm += *(l2_norm_h[dev_id]);
        }
        l2_norm = std::sqrt(l2_norm);

        iter++;
        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);

        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            std::swap(a_new[dev_id], a[dev_id]);
        }
    }
    nvtxRangePop();
    double stop = omp_get_wtime();

    int offset = nx;
    // Copy computed grid back to host from each GPU
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(
            cudaMemcpy(a_h + offset, a[dev_id] + nx,
                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
                       cudaMemcpyDeviceToHost));
        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
    }
    // Compare against single-GPU execution for correctness
    bool result_correct = true;
    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
                fprintf(stderr,
                        "ERROR: a[%d * %d + %d] = %f does not match %f "
                        "(reference)\n",
                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
                result_correct = false;
            }
        }
    }
    if (result_correct) {
        printf("Num GPUs: %d. Using GPU ID: ", num_devices);
        for (int i = 0; i < num_devices; i++) {
            printf("%d, ", devices[i]);
        }
        printf(
            "\n%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
            "efficiency: %8.2f \n",
            ny, nx, runtime_serial, num_devices, (stop - start),
            runtime_serial / (stop - start),
            runtime_serial / (num_devices * (stop - start)) * 100);
    }
    // Free device and host allocations in reverse device order
    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
        CUDA_RT_CALL(cudaFree(a[dev_id]));
        if (0 == dev_id) {
            CUDA_RT_CALL(cudaFreeHost(a_h));
            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
        }
    }
    return result_correct ? 0 : 1;
}
double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
    float* a;
    float* a_new;
    float* l2_norm_d;
    float* l2_norm_h;

    int iy_start = 1;
    int iy_end = (ny - 1);

    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));

    // Set Dirichlet boundary conditions on the left and right borders
    nvtxRangePush("Init boundaries");
    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
    CUDA_RT_CALL(cudaGetLastError());
    CUDA_RT_CALL(cudaDeviceSynchronize());
    nvtxRangePop();

    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
    CUDA_RT_CALL(cudaDeviceSynchronize());

    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);

    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

    int iter = 0;
    float l2_norm = 1.0;
    double start = omp_get_wtime();
    nvtxRangePush("Jacobi Solve");
    while (l2_norm > tol && iter < iter_max) {
        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));

        // Compute grid points for this iteration
        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
        CUDA_RT_CALL(cudaGetLastError());
        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));

        // Apply periodic boundary conditions
        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
                                cudaMemcpyDeviceToDevice));
        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
                                cudaMemcpyDeviceToDevice));

        CUDA_RT_CALL(cudaDeviceSynchronize());
        l2_norm = *l2_norm_h;
        l2_norm = std::sqrt(l2_norm);

        iter++;
        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);

        std::swap(a_new, a);
    }
    nvtxRangePop();
    double stop = omp_get_wtime();

    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));

    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
    CUDA_RT_CALL(cudaFree(l2_norm_d));
    CUDA_RT_CALL(cudaFree(a_new));
    CUDA_RT_CALL(cudaFree(a));
    return (stop - start);
}