jacobi_memcpy.cu

/* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <sstream>

#include <omp.h>
#include <nvToolsExt.h>

#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32

#define CUDA_RT_CALL(call)                                                         \
    {                                                                              \
        cudaError_t cudaStatus = call;                                             \
        if (cudaSuccess != cudaStatus)                                             \
            fprintf(stderr,                                                        \
                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "     \
                    "with "                                                        \
                    "%s (%d).\n",                                                  \
                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus),     \
                    cudaStatus);                                                   \
    }
constexpr int MAX_NUM_DEVICES = 32;
constexpr float tol = 1.0e-8;

const float PI = 2.0 * std::asin(1.0);

__global__ void initialize_boundaries(float* a_new, float* a, const float pi, const int offset,
                                      const int nx, const int my_ny, const int ny) {
    for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) {
        const float y0 = sin(2.0 * pi * (offset + iy) / (ny - 1));
        a[iy * nx + 0] = y0;
        a[iy * nx + (nx - 1)] = y0;
        a_new[iy * nx + 0] = y0;
        a_new[iy * nx + (nx - 1)] = y0;
    }
}
__global__ void jacobi_kernel(float* a_new, const float* a, float* l2_norm, const int iy_start,
                              const int iy_end, const int nx) {
    int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start;
    int ix = blockIdx.x * blockDim.x + threadIdx.x + 1;
    __shared__ float block_l2_sum[BLOCK_DIM_X * BLOCK_DIM_Y];
    unsigned thread_index = threadIdx.y * BLOCK_DIM_X + threadIdx.x;

    if (iy < iy_end && ix < (nx - 1)) {
        // Update grid point
        const float new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
                                      a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
        a_new[iy * nx + ix] = new_val;
        float residue = new_val - a[iy * nx + ix];
        // Set block-level L2 norm value for this grid point
        block_l2_sum[thread_index] = residue * residue;
    } else {
        block_l2_sum[thread_index] = 0;
    }

    // Reduce L2 norm for the block in parallel
    for (unsigned stride = 1; stride < BLOCK_DIM_X * BLOCK_DIM_Y; stride *= 2) {
        __syncthreads();
        if ((thread_index) % (2 * stride) == 0) {
            block_l2_sum[thread_index] += block_l2_sum[thread_index + stride];
        }
    }

    // Atomically update global L2 norm with block-reduced L2 norm
    if (thread_index == 0) {
        atomicAdd(l2_norm, block_l2_sum[0]);
    }
}
int get_argval(char** begin, char** end, const std::string& arg, const int default_val) {
    int argval = default_val;
    char** itr = std::find(begin, end, arg);
    if (itr != end && ++itr != end) {
        std::istringstream inbuf(*itr);
        inbuf >> argval;
    }
    return argval;
}
int get_parsed_vals(char** begin, char** end, int* devices, const std::string& arg,
                    const int default_val) {
    int numGPUs = default_val;
    char** itr = std::find(begin, end, arg);
    if (itr != end && ++itr != end) {
        // Parse a comma-separated list of device IDs, e.g. "-gpus 0,2,3"
        numGPUs = 0;
        std::string dev_ids(*itr);
        int currpos = 0, nextpos = 0;
        do {
            nextpos = dev_ids.find_first_of(",", currpos);
            devices[numGPUs] = std::stoi(dev_ids.substr(currpos, nextpos - currpos));
            numGPUs++;
            currpos = nextpos + 1;
        } while (nextpos != std::string::npos);
    } else {
        // No mapping supplied: use the first default_val devices in order
        for (int i = 0; i < numGPUs; i++) {
            devices[i] = i;
        }
    }
    return numGPUs;
}
bool get_arg(char** begin, char** end, const std::string& arg) {
    char** itr = std::find(begin, end, arg);
    if (itr != end) {
        return true;
    }
    return false;
}
double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h);

int main(int argc, char* argv[]) {
    const int iter_max = get_argval(argv, argv + argc, "-niter", 1000);
    const int nx = get_argval(argv, argv + argc, "-nx", 16384);
    const int ny = get_argval(argv, argv + argc, "-ny", 16384);
    const bool p2p = get_arg(argv, argv + argc, "-p2p");

    // Get GPU mapping from runtime arguments
    int available_devices = 0;
    CUDA_RT_CALL(cudaGetDeviceCount(&available_devices));
    int devices[MAX_NUM_DEVICES];
    int num_devices = get_parsed_vals(argv, argv + argc, devices, "-gpus", available_devices);

    float* a[MAX_NUM_DEVICES];
    float* a_new[MAX_NUM_DEVICES];
    float* a_ref_h;
    float* a_h;
    double runtime_serial = 0.0;
    float* l2_norm_d[MAX_NUM_DEVICES];
    float* l2_norm_h[MAX_NUM_DEVICES];
    int iy_start[MAX_NUM_DEVICES];
    int iy_end[MAX_NUM_DEVICES];
    int chunk_size[MAX_NUM_DEVICES];

    // Compute chunk size and allocate memory on GPUs
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
        CUDA_RT_CALL(cudaFree(0));
        if (0 == dev_id) {
            // Allocate memory on host and record single-GPU timings
            CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(float)));
            CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(float)));
            runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h);
        }
        // The ny - 2 interior rows are distributed amongst `size` ranks (here, `size` is
        // num_devices) in such a way that each rank gets either (ny - 2) / size or
        // (ny - 2) / size + 1 rows. This optimizes load balancing when (ny - 2) % size != 0
        int chunk_size_low = (ny - 2) / num_devices;
        int chunk_size_high = chunk_size_low + 1;
        // To calculate the number of ranks that need to compute an extra row,
        // the following formula is derived from this equation:
        // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = (ny - 2)
        int num_ranks_low = num_devices * chunk_size_low + num_devices - (ny - 2);
        if (dev_id < num_ranks_low)
            chunk_size[dev_id] = chunk_size_low;
        else
            chunk_size[dev_id] = chunk_size_high;
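        // Worked example (hypothetical values): with ny = 12 and 3 devices there are
        // ny - 2 = 10 interior rows, so chunk_size_low = 10 / 3 = 3, chunk_size_high = 4,
        // and num_ranks_low = 3 * 3 + 3 - 10 = 2. Devices 0 and 1 take 3 rows each and
        // device 2 takes 4 rows (3 + 3 + 4 = 10 rows in total).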
        // Allocate memory on GPU
        CUDA_RT_CALL(cudaMalloc(a + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
        CUDA_RT_CALL(cudaMalloc(a_new + dev_id, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
        CUDA_RT_CALL(cudaMemset(a[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));
        CUDA_RT_CALL(cudaMemset(a_new[dev_id], 0, nx * (chunk_size[dev_id] + 2) * sizeof(float)));

        // Calculate local domain boundaries
        int iy_start_global;  // My start index in the global array
        if (dev_id < num_ranks_low) {
            iy_start_global = dev_id * chunk_size_low + 1;
        } else {
            iy_start_global =
                num_ranks_low * chunk_size_low + (dev_id - num_ranks_low) * chunk_size_high + 1;
        }
        iy_start[dev_id] = 1;
        iy_end[dev_id] = iy_start[dev_id] + chunk_size[dev_id];

        // Set Dirichlet boundary conditions on the left and right borders
        initialize_boundaries<<<(ny / num_devices) / 128 + 1, 128>>>(
            a[dev_id], a_new[dev_id], PI, iy_start_global - 1, nx, (chunk_size[dev_id] + 2), ny);
        CUDA_RT_CALL(cudaGetLastError());
        CUDA_RT_CALL(cudaDeviceSynchronize());

        CUDA_RT_CALL(cudaMalloc(l2_norm_d + dev_id, sizeof(float)));
        CUDA_RT_CALL(cudaMallocHost(l2_norm_h + dev_id, sizeof(float)));
        if (p2p == true) {
            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
            int canAccessPeer = 0;
            // TODO: Part 2- Check whether GPU "devices[dev_id]" can access peer "devices[top]".
            // Fill and uncomment the line below
            //CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, /*Fill me*/, /*Fill me*/));
            if (canAccessPeer) {
                // TODO: Part 2- Enable peer access from GPU "devices[dev_id]" to "devices[top]".
                // Fill and uncomment the line below
                //CUDA_RT_CALL(cudaDeviceEnablePeerAccess(/*Fill me*/, 0));
            }
            const int bottom = (dev_id + 1) % num_devices;
            if (top != bottom) {
                canAccessPeer = 0;
                // TODO: Part 2- Check and enable peer access from GPU "devices[dev_id]" to
                // "devices[bottom]", whenever possible. Fill and uncomment the lines below
                //CUDA_RT_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, /*Fill me*/, /*Fill me*/));
                if (canAccessPeer) {
                    //CUDA_RT_CALL(cudaDeviceEnablePeerAccess(/*Fill me*/, 0));
                }
            }
        }
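        // For reference, a generic peer-access setup between two hypothetical device
        // ordinals devA and devB (illustration only, not the exercise fill-in values)
        // follows this pattern:
        //     int can = 0;
        //     CUDA_RT_CALL(cudaDeviceCanAccessPeer(&can, devA, devB));
        //     if (can) CUDA_RT_CALL(cudaDeviceEnablePeerAccess(devB, 0));
        // Note that cudaDeviceEnablePeerAccess() acts on the currently selected device.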
        CUDA_RT_CALL(cudaDeviceSynchronize());
    }

    // Share initial top and bottom local grid-point values between neighbours
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
        const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
        const int bottom = (dev_id + 1) % num_devices;
        CUDA_RT_CALL(cudaMemcpy(a_new[top] + (iy_end[top] * nx),
                                a_new[dev_id] + iy_start[dev_id] * nx, nx * sizeof(float),
                                cudaMemcpyDeviceToDevice));
        CUDA_RT_CALL(cudaMemcpy(a_new[bottom], a_new[dev_id] + (iy_end[dev_id] - 1) * nx,
                                nx * sizeof(float), cudaMemcpyDeviceToDevice));
    }
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
        CUDA_RT_CALL(cudaDeviceSynchronize());
    }
    printf("Jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);

    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
    int iter = 0;
    float l2_norm = 1.0;
    double start = omp_get_wtime();
    nvtxRangePush("Jacobi solve");
    while (l2_norm > tol && iter < iter_max) {
        // Launch device kernel on each GPU
        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            // TODO: Part 1- Set current GPU to be "devices[dev_id]"
            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
            CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d[dev_id], 0, sizeof(float)));
            dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
                          (chunk_size[dev_id] + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
            // TODO: Part 1- Call the Jacobi kernel with "dim_grid" blocks in the grid and
            // "dim_block" threads per block. The "dev_id" variable indexes the memory
            // allocated for the current GPU.
            jacobi_kernel<<</*Fill me*/, /*Fill me*/>>>(/*Fill me*/);
            // TODO: Part 1- Copy the GPU-local L2 norm "l2_norm_d" back to the CPU buffer "l2_norm_h"
            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, sizeof(float), /*Fill me*/));
        }
        // Launch async memory copy operations for halo exchange and
        // for copying local-grid L2 norm from each GPU to the host
        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            const int top = dev_id > 0 ? dev_id - 1 : (num_devices - 1);
            const int bottom = (dev_id + 1) % num_devices;
            // TODO: Part 1- Set current GPU
            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
            // TODO: Part 1- Implement halo exchange with the top neighbour "top"
            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float), /*Fill me*/));
            // TODO: Part 1- Implement halo exchange with the bottom neighbour "bottom"
            CUDA_RT_CALL(cudaMemcpyAsync(/*Fill me*/, /*Fill me*/, nx * sizeof(float), /*Fill me*/));
        }
        l2_norm = 0.0;
        // Synchronize devices and compute global L2 norm
        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            // TODO: Part 1- Set current GPU and call cudaDeviceSynchronize()
            CUDA_RT_CALL(cudaSetDevice(/*Fill me*/));
            CUDA_RT_CALL(/*Fill me*/);
            l2_norm += *(l2_norm_h[dev_id]);
        }
        l2_norm = std::sqrt(l2_norm);

        iter++;
        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);

        for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
            std::swap(a_new[dev_id], a[dev_id]);
        }
    }
    nvtxRangePop();
    double stop = omp_get_wtime();

    int offset = nx;
    // Copy computed grid back to host from each GPU
    for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
        CUDA_RT_CALL(
            cudaMemcpy(a_h + offset, a[dev_id] + nx,
                       std::min((nx * ny) - offset, nx * chunk_size[dev_id]) * sizeof(float),
                       cudaMemcpyDeviceToHost));
        offset += std::min(chunk_size[dev_id] * nx, (nx * ny) - offset);
    }
    // Compare against single-GPU execution for correctness
    bool result_correct = true;
    for (int iy = 1; result_correct && (iy < (ny - 1)); ++iy) {
        for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) {
            if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) {
                fprintf(stderr,
                        "ERROR: a[%d * %d + %d] = %f does not match %f "
                        "(reference)\n",
                        iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]);
                result_correct = false;
            }
        }
    }
    if (result_correct) {
        printf("Num GPUs: %d. Using GPU ID: ", num_devices);
        for (int i = 0; i < num_devices; i++) {
            printf("%d, ", devices[i]);
        }
        printf(
            "\n%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, "
            "efficiency: %8.2f \n",
            ny, nx, runtime_serial, num_devices, (stop - start),
            runtime_serial / (stop - start),
            runtime_serial / (num_devices * (stop - start)) * 100);
    }
    // Free device and host allocations in reverse device order
    for (int dev_id = (num_devices - 1); dev_id >= 0; --dev_id) {
        CUDA_RT_CALL(cudaSetDevice(devices[dev_id]));
        CUDA_RT_CALL(cudaFreeHost(l2_norm_h[dev_id]));
        CUDA_RT_CALL(cudaFree(l2_norm_d[dev_id]));
        CUDA_RT_CALL(cudaFree(a_new[dev_id]));
        CUDA_RT_CALL(cudaFree(a[dev_id]));
        if (0 == dev_id) {
            CUDA_RT_CALL(cudaFreeHost(a_h));
            CUDA_RT_CALL(cudaFreeHost(a_ref_h));
        }
    }
    return result_correct ? 0 : 1;
}
double single_gpu(const int nx, const int ny, const int iter_max, float* const a_ref_h) {
    float* a;
    float* a_new;
    float* l2_norm_d;
    float* l2_norm_h;

    int iy_start = 1;
    int iy_end = (ny - 1);

    CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(float)));
    CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(float)));
    CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(float)));
    CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(float)));

    // Set Dirichlet boundary conditions on the left and right borders
    nvtxRangePush("Init boundaries");
    initialize_boundaries<<<ny / 128 + 1, 128>>>(a, a_new, PI, 0, nx, ny, ny);
    CUDA_RT_CALL(cudaGetLastError());
    CUDA_RT_CALL(cudaDeviceSynchronize());
    nvtxRangePop();

    CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(float)));
    CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(float)));
    CUDA_RT_CALL(cudaDeviceSynchronize());

    printf("Single GPU jacobi relaxation: %d iterations on %d x %d mesh\n", iter_max, nx, ny);

    dim3 dim_grid((nx + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (ny + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);
    dim3 dim_block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

    int iter = 0;
    float l2_norm = 1.0;
    double start = omp_get_wtime();
    nvtxRangePush("Jacobi Solve");
    while (l2_norm > tol && iter < iter_max) {
        CUDA_RT_CALL(cudaMemset(l2_norm_d, 0, sizeof(float)));

        // Compute grid points for this iteration
        jacobi_kernel<<<dim_grid, dim_block>>>(a_new, a, l2_norm_d, iy_start, iy_end, nx);
        CUDA_RT_CALL(cudaGetLastError());
        CUDA_RT_CALL(cudaMemcpy(l2_norm_h, l2_norm_d, sizeof(float), cudaMemcpyDeviceToHost));

        // Apply periodic boundary conditions
        CUDA_RT_CALL(cudaMemcpy(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(float),
                                cudaMemcpyDeviceToDevice));
        CUDA_RT_CALL(cudaMemcpy(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(float),
                                cudaMemcpyDeviceToDevice));

        CUDA_RT_CALL(cudaDeviceSynchronize());
        l2_norm = *l2_norm_h;
        l2_norm = std::sqrt(l2_norm);

        iter++;
        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm);

        std::swap(a_new, a);
    }
    nvtxRangePop();
    double stop = omp_get_wtime();

    CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(float), cudaMemcpyDeviceToHost));

    CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
    CUDA_RT_CALL(cudaFree(l2_norm_d));
    CUDA_RT_CALL(cudaFree(a_new));
    CUDA_RT_CALL(cudaFree(a));
    return (stop - start);
}