@@ -31,36 +31,10 @@
#include <sstream>

#include <omp.h>
-
-#ifdef HAVE_CUB
-#include <cub/block/block_reduce.cuh>
-#endif // HAVE_CUB
-
-#ifdef USE_NVTX
#include <nvToolsExt.h>

-const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff,
-                           0x0000ffff, 0x00ff0000, 0x00ffffff};
-const int num_colors = sizeof(colors) / sizeof(uint32_t);
-
-#define PUSH_RANGE(name, cid) \
-    { \
-        int color_id = cid; \
-        color_id = color_id % num_colors; \
-        nvtxEventAttributes_t eventAttrib = {0}; \
-        eventAttrib.version = NVTX_VERSION; \
-        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
-        eventAttrib.colorType = NVTX_COLOR_ARGB; \
-        eventAttrib.color = colors[color_id]; \
-        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
-        eventAttrib.message.ascii = name; \
-        nvtxRangePushEx(&eventAttrib); \
-    }
-#define POP_RANGE nvtxRangePop();
-#else
-#define PUSH_RANGE(name, cid)
-#define POP_RANGE
-#endif
+#define BLOCK_DIM_X 32
+#define BLOCK_DIM_Y 32
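+// Thread block shape used by jacobi_kernel; its shared-memory reduction buffer holds BLOCK_DIM_X * BLOCK_DIM_Y partial sums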

#define CUDA_RT_CALL(call) \
    { \
@@ -75,16 +49,14 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);

constexpr int MAX_NUM_DEVICES = 32;

-typedef float real;
-constexpr real tol = 1.0e-8;
+constexpr float tol = 1.0e-8;

-const real PI = 2.0 * std::asin(1.0);
+const float PI = 2.0 * std::asin(1.0);

-__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a,
-                                      const real pi, const int offset, const int nx,
-                                      const int my_ny, const int ny) {
+__global__ void initialize_boundaries(float* a_new, float* a, const float pi, const int offset,
+                                      const int nx, const int my_ny, const int ny) {
    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
-        const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
+        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
        a[iy * nx + 0] = y0;
        a[iy * nx + (nx - 1)] = y0;
        a_new[iy * nx + 0] = y0;
@@ -92,44 +64,40 @@ __global__ void initialize_boundaries(real* __restrict__ const a_new, real* __re
    }
}

-template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a,
-                              real* __restrict__ const l2_norm, const int iy_start,
-                              const int iy_end, const int nx, const bool calculate_norm) {
-#ifdef HAVE_CUB
-    typedef cub::BlockReduce<real, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
-        BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-#endif // HAVE_CUB
+__global__ void jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start,
+                              const int iy_end, const int nx) {
    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
-    real local_l2_norm = 0.0;
+    __shared__ float block_l2_sum[BLOCK_DIM_X*BLOCK_DIM_Y];
+    unsigned thread_index = threadIdx.y*BLOCK_DIM_X + threadIdx.x;

    if (iy < iy_end && ix < (nx - 1)) {
-        const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
+        // Update grid point
+        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
        a_new[iy * nx + ix] = new_val;
-        if (calculate_norm) {
-            real residue = new_val - a[iy * nx + ix];
-            local_l2_norm += residue * residue;
-        }
+        float residue = new_val - a[iy * nx + ix];
+        // Set block-level L2 norm value for this grid point
+        block_l2_sum[thread_index] = residue * residue;
+    }
+    else {
+        block_l2_sum[thread_index] = 0;
    }

-    if (calculate_norm) {
-#ifdef HAVE_CUB
-        real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm);
-        if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm);
-#else
-        atomicAdd(l2_norm, local_l2_norm);
-#endif // HAVE_CUB
+    // Reduce L2 norm for the block in parallel
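+    // (tree reduction over shared memory; assumes BLOCK_DIM_X * BLOCK_DIM_Y is a power of two so the sum folds down to block_l2_sum[0])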
+    for (unsigned stride = 1; stride < BLOCK_DIM_X*BLOCK_DIM_Y; stride *= 2) {
+        __syncthreads();
+        if ((thread_index) % (2*stride) == 0) {
+            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
+        }
+    }
+    // Atomically update global L2 norm with block-reduced L2 norm
+    if (thread_index == 0) {
+        atomicAdd(l2_norm, block_l2_sum[0]);
    }
}

-double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
-                  const int nccheck, const bool print);
-
-template <typename T>
-T get_argval(char** begin, char** end, const std::string& arg, const T default_val) {
-    T argval = default_val;
+int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
+    int argval = default_val;
    char** itr = std::find(begin, end, arg);
    if (itr != end && ++itr != end) {
        std::istringstream inbuf(*itr);
@@ -146,18 +114,18 @@ bool get_arg(char** begin, char** end, const std::string& arg) {
    return false;
}

+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);
+
int main(int argc, char* argv[]) {
-    const int iter_max = get_argval<int>(argv, argv + argc, "-niter", 1000);
-    const int nccheck = get_argval<int>(argv, argv + argc, "-nccheck", 1);
-    const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
-    const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
-    const bool csv = get_arg(argv, argv + argc, "-csv");
-    const bool nop2p = get_arg(argv, argv + argc, "-nop2p");
-
-    real* a[MAX_NUM_DEVICES];
-    real* a_new[MAX_NUM_DEVICES];
-    real* a_ref_h;
-    real* a_h;
+    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
+    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
+    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
+    const bool p2p = get_arg(argv, argv + argc, "-p2p");
+
+    float* a[MAX_NUM_DEVICES];
+    float* a_new[MAX_NUM_DEVICES];
+    float* a_ref_h;
+    float* a_h;
    double runtime_serial = 0.0;

    cudaStream_t compute_stream[MAX_NUM_DEVICES];
@@ -167,8 +135,8 @@ int main(int argc, char* argv[]) {
    cudaEvent_t push_top_done[2][MAX_NUM_DEVICES];
    cudaEvent_t push_bottom_done[2][MAX_NUM_DEVICES];

-    real* l2_norm_d[MAX_NUM_DEVICES];
-    real* l2_norm_h[MAX_NUM_DEVICES];
+    float* l2_norm_d[MAX_NUM_DEVICES];
+    float* l2_norm_h[MAX_NUM_DEVICES];

    int iy_start[MAX_NUM_DEVICES];
    int iy_end[MAX_NUM_DEVICES];
@@ -182,9 +150,9 @@ int main(int argc, char* argv[]) {
        CUDA_RT_CALL(cudaFree(0));

        if (0 == dev_id) {
-            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
-            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real)));
-            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv);
+            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
+            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
+            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
        }

        // ny - 2 rows are distributed amongst `size` ranks in such a way
@@ -192,21 +160,22 @@ int main(int argc, char* argv[]) {
        // This optimizes load balancing when (ny - 2) % size != 0
        int chunk_size_low = (ny - 2) / num_devices;
        int chunk_size_high = chunk_size_low + 1;
+
        // To calculate the number of ranks that need to compute an extra row,
        // the following formula is derived from this equation:
-        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2
-        int num_ranks_low = num_devices * chunk_size_low + num_devices -
-                            (ny - 2); // Number of ranks with chunk_size = chunk_size_low
-        if (dev_id < num_ranks_low)
+        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
+        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);
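+        // Example: ny - 2 = 10 rows over num_devices = 4 gives chunk_size_low = 2 and num_ranks_low = 4 * 2 + 4 - 10 = 2, so two devices take 2 rows and two take 3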
+
+        if (dev_id < num_ranks_low)
            chunk_size[dev_id] = chunk_size_low;
        else
            chunk_size[dev_id] = chunk_size_high;

-        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(real)));
-        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(real)));
+        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));

-        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(real)));
-        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(real)));
+        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
+        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));

        // Calculate local domain boundaries
        int iy_start_global; // My start index in the global array
@@ -229,18 +198,18 @@ int main(int argc, char* argv[]) {
        CUDA_RT_CALL(cudaStreamCreate(compute_stream + dev_id));
        CUDA_RT_CALL(cudaStreamCreate(push_top_stream + dev_id));
        CUDA_RT_CALL(cudaStreamCreate(push_bottom_stream + dev_id));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(compute_done + dev_id, cudaEventDisableTiming));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[0] + dev_id, cudaEventDisableTiming));
+        CUDA_RT_CALL(cudaEventCreate(compute_done + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[0] + dev_id));
        CUDA_RT_CALL(
-            cudaEventCreateWithFlags(push_bottom_done[0] + dev_id, cudaEventDisableTiming));
-        CUDA_RT_CALL(cudaEventCreateWithFlags(push_top_done[1] + dev_id, cudaEventDisableTiming));
+            cudaEventCreate(push_bottom_done[0] + dev_id));
+        CUDA_RT_CALL(cudaEventCreate(push_top_done[1] + dev_id));
        CUDA_RT_CALL(
-            cudaEventCreateWithFlags(push_bottom_done[1] + dev_id, cudaEventDisableTiming));
+            cudaEventCreate(push_bottom_done[1] + dev_id));

-        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(real)));
-        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(real)));
+        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
+        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));

-        if (!nop2p) {
+        if (p2p == true) {
            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
            int canAccessPeer = 0;
            CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, dev_id, top));
@@ -264,31 +233,25 @@ int main(int argc, char* argv[]) {
        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
        const int bottom = (dev_id + 1) % num_devices;
        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
-                                a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(real),
-                                cudaMemcpyDeviceToDevice));
+                                a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
+                                cudaMemcpyDeviceToDevice));
        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
-                                nx * sizeof(real), cudaMemcpyDeviceToDevice));
+                                nx * sizeof(float), cudaMemcpyDeviceToDevice));
    }

    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(cudaSetDevice(dev_id));
-        CUDA_RT_CALL(cudaDeviceSynchronize());
+        CUDA_RT_CALL(cudaDeviceSynchronize());
    }

-    if (!csv)
-        printf(
-            "Jacobi relaxation: %d iterations on %d x %d mesh with norm check "
-            "every %d iterations\n",
-            iter_max, ny, nx, nccheck);
+    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);

-    constexpr int dim_block_x = 32;
-    constexpr int dim_block_y = 32;
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
    int iter = 0;
-    bool calculate_norm;
-    real l2_norm = 1.0;
+    float l2_norm = 1.0;

    double start = omp_get_wtime();
-    PUSH_RANGE("Jacobi solve", 0)
+    nvtxRangePush("Jacobi solve");
    while (l2_norm > tol && iter < iter_max) {
        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
@@ -296,59 +259,59 @@ int main(int argc, char* argv[]) {
            CUDA_RT_CALL(cudaSetDevice(dev_id));

            CUDA_RT_CALL(
-                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(real), compute_stream[dev_id]));
+                cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float), compute_stream[dev_id]));

            CUDA_RT_CALL(
                cudaStreamWaitEvent(compute_stream[dev_id], push_top_done[(iter % 2)][bottom], 0));
            CUDA_RT_CALL(
                cudaStreamWaitEvent(compute_stream[dev_id], push_bottom_done[(iter % 2)][top], 0));

-            calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);
-            dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x,
-                          (chunk_size[dev_id] + dim_block_y - 1) / dim_block_y, 1);
+            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
+                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);

-            jacobi_kernel<dim_block_x, dim_block_y>
-                <<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, compute_stream[dev_id]>>>(
+            jacobi_kernel<<<dim_grid, dim_block, 0, compute_stream[dev_id]>>>(
                a_new[dev_id], a[dev_id], l2_norm_d[dev_id], iy_start[dev_id], iy_end[dev_id],
-                nx, calculate_norm);
-            CUDA_RT_CALL(cudaGetLastError());
+                nx);
+
            CUDA_RT_CALL(cudaEventRecord(compute_done[dev_id], compute_stream[dev_id]));

-            if (calculate_norm) {
-                CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(real),
-                                             cudaMemcpyDeviceToHost, compute_stream[dev_id]));
-            }
+            CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h[dev_id], l2_norm_d[dev_id], sizeof(float),
+                                         cudaMemcpyDeviceToHost, compute_stream[dev_id]));
+        }
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
+            const int bottom = (dev_id + 1) % num_devices;
+            CUDA_RT_CALL(cudaSetDevice(dev_id));

            // Apply periodic boundary conditions
            CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream[dev_id], compute_done[dev_id], 0));
            CUDA_RT_CALL(cudaMemcpyAsync(a_new[top] + (iy_end[top] * nx),
-                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(real),
+                                         a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
                                         cudaMemcpyDeviceToDevice, push_top_stream[dev_id]));
            CUDA_RT_CALL(
                cudaEventRecord(push_top_done[((iter + 1) % 2)][dev_id], push_top_stream[dev_id]));

            CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream[dev_id], compute_done[dev_id], 0));
            CUDA_RT_CALL(cudaMemcpyAsync(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
-                                         nx * sizeof(real), cudaMemcpyDeviceToDevice,
+                                         nx * sizeof(float), cudaMemcpyDeviceToDevice,
                                         push_bottom_stream[dev_id]));
            CUDA_RT_CALL(cudaEventRecord(push_bottom_done[((iter + 1) % 2)][dev_id],
                                         push_bottom_stream[dev_id]));
        }
-        if (calculate_norm) {
-            l2_norm = 0.0;
-            for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
-                CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
-                l2_norm += *(l2_norm_h[dev_id]);
-            }
-
-            l2_norm = std::sqrt(l2_norm);
-            if (!csv && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+        l2_norm = 0.0;
+        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
+            CUDA_RT_CALL(cudaStreamSynchronize(compute_stream[dev_id]));
+            l2_norm += *(l2_norm_h[dev_id]);
        }

+        l2_norm = std::sqrt(l2_norm);
+
+        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            std::swap(a_new[dev_id], a[dev_id]);
        }
-        iter++;
    }

    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
@@ -356,14 +319,14 @@ int main(int argc, char* argv[]) {
        CUDA_RT_CALL(cudaDeviceSynchronize());
    }

-    POP_RANGE
+    nvtxRangePop();
    double stop = omp_get_wtime();

    int offset = nx;
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(
            cudaMemcpy(a_h + offset, a[dev_id] + nx,
-                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(real),
+                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
                       cudaMemcpyDeviceToHost));
        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
    }
@@ -382,18 +345,13 @@ int main(int argc, char* argv[]) {
    }

    if (result_correct) {
-        if (csv) {
-            printf("single_threaded_copy, %d, %d, %d, %d, %d, %d, %f, %f\n", nx, ny, iter_max,
-                   nccheck, num_devices, nop2p ? 0 : 1, (stop - start), runtime_serial);
-        } else {
-            printf("Num GPUs: %d.\n", num_devices);
-            printf(
-                "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
-                "efficiency: %8.2f \n",
-                ny, nx, runtime_serial, num_devices, (stop - start),
-                runtime_serial / (stop - start),
-                runtime_serial / (num_devices * (stop - start)) * 100);
-        }
+        printf("Num GPUs: %d.\n", num_devices);
+        printf(
+            "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
+            "efficiency: %8.2f \n",
+            ny, nx, runtime_serial, num_devices, (stop - start),
+            runtime_serial / (stop - start),
+            runtime_serial / (num_devices * (stop - start)) * 100);
    }

    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
@@ -421,85 +379,71 @@ int main(int argc, char* argv[]) {
    return result_correct ? 0 : 1;
}

-double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h,
-                  const int nccheck, const bool print) {
-    real* a;
-    real* a_new;
+double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
+    float* a;
+    float* a_new;

-    real* l2_norm_d;
-    real* l2_norm_h;
+    float* l2_norm_d;
+    float* l2_norm_h;

    int iy_start = 1;
    int iy_end = (ny - 1);

-    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real)));
-    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real)));
+    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));

-    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real)));
-    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real)));
+    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
+    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));

    // Set diriclet boundary conditions on left and right boarder
+    nvtxRangePush("Init boundaries");
    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
    CUDA_RT_CALL(cudaGetLastError());
    CUDA_RT_CALL(cudaDeviceSynchronize());
+    nvtxRangePop();

-    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real)));
-    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real)));
+    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
+    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));

    CUDA_RT_CALL(cudaDeviceSynchronize());

-    if (print)
-        printf(
-            "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with "
-            "norm "
-            "check every %d iterations\n",
-            iter_max, ny, nx, nccheck);
+    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);

-    constexpr int dim_block_x = 32;
-    constexpr int dim_block_y = 32;
-    dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, (ny + dim_block_y - 1) / dim_block_y, 1);
+    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
+    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

    int iter = 0;
-    bool calculate_norm;
-    real l2_norm = 1.0;
+    float l2_norm = 1.0;

    double start = omp_get_wtime();
-    PUSH_RANGE("Jacobi solve", 0)
+    nvtxRangePush("Jacobi Solve");
    while (l2_norm > tol && iter < iter_max) {
-        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(real)));
+        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));

-        calculate_norm = (iter % nccheck) == 0 || (print && ((iter % 100) == 0));
-        jacobi_kernel<dim_block_x, dim_block_y>
-            <<<dim_grid, {dim_block_x, dim_block_y, 1}, 0, 0>>>(
-                a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm);
+        // Compute grid points for this iteration
+        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
        CUDA_RT_CALL(cudaGetLastError());
-
-        if (calculate_norm) {
-            CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost));
-        }
+        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));

        // Apply periodic boundary conditions
-
-        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real),
+        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
                                cudaMemcpyDeviceToDevice));
-        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real),
+        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
                                cudaMemcpyDeviceToDevice));

-        if (calculate_norm) {
-            CUDA_RT_CALL(cudaDeviceSynchronize());
-            //CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
-            l2_norm = *l2_norm_h;
-            l2_norm = std::sqrt(l2_norm);
-            if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
-        }
+        CUDA_RT_CALL(cudaDeviceSynchronize());
+        l2_norm = *l2_norm_h;
+        l2_norm = std::sqrt(l2_norm);

-        std::swap(a_new, a);
        iter++;
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);
+
+        std::swap(a_new, a);
    }
-    POP_RANGE
+    nvtxRangePop();
    double stop = omp_get_wtime();

-    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost));
+    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));

    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
    CUDA_RT_CALL(cudaFree(l2_norm_d));
@@ -508,3 +452,4 @@ double single_gpu(const int nx, const int ny, const int iter_max, real* const a_
    CUDA_RT_CALL(cudaFree(a));
    return (stop - start);
}
+