p2pBandwidthLatencyTest.cu

/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <cstdio>
#include <vector>

#include <helper_cuda.h>
#include <helper_timer.h>

using namespace std;

const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test";

typedef enum {
  P2P_WRITE = 0,
  P2P_READ = 1,
} P2PDataTransfer;

typedef enum {
  CE = 0,
  SM = 1,
} P2PEngine;

P2PEngine p2p_mechanism = CE;  // By default use Copy Engine

// Macro for checking cuda errors following a cuda launch or api call
#define cudaCheckError()                                         \
  {                                                              \
    cudaError_t e = cudaGetLastError();                          \
    if (e != cudaSuccess) {                                      \
      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__,   \
             cudaGetErrorString(e));                             \
      exit(EXIT_FAILURE);                                        \
    }                                                            \
  }
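
///////////////////////////////////////////////////////////////////////////
// Measurement technique: a tiny kernel spins on a host-visible flag so the
// stream stays blocked while the timed copies are queued behind it. The
// host then sets the flag to release the whole batch at once, keeping
// launch/enqueue overhead out of the measured interval. The timeout keeps
// the kernel from hanging the device if the host stalls.
///////////////////////////////////////////////////////////////////////////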
__global__ void delay(volatile int *flag,
                      unsigned long long timeout_clocks = 10000000) {
  // Wait until the application notifies us that it has completed queuing up
  // the experiment, or timeout and exit, allowing the application to make
  // progress
  long long int start_clock, sample_clock;
  start_clock = clock64();

  while (!*flag) {
    sample_clock = clock64();

    if (sample_clock - start_clock > timeout_clocks) {
      break;
    }
  }
}
// This kernel is for demonstration purposes only, not a performant kernel for
// p2p transfers.
__global__ void copyp2p(int4 *__restrict__ dest, int4 const *__restrict__ src,
                        size_t num_elems) {
  size_t globalId = blockIdx.x * blockDim.x + threadIdx.x;
  size_t gridSize = blockDim.x * gridDim.x;

  // Grid-stride loop over int4 elements
#pragma unroll(5)
  for (size_t i = globalId; i < num_elems; i += gridSize) {
    dest[i] = src[i];
  }
}
///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
  printf("Usage:  p2pBandwidthLatencyTest [OPTION]...\n");
  printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");
  printf("\n");

  printf("Options:\n");
  printf("--help\t\tDisplay this help menu\n");
  printf(
      "--p2p_read\tUse P2P reads for data transfers between GPU pairs and "
      "show corresponding results.\n\t\tThe default is the P2P write "
      "operation.\n");
  printf(
      "--sm_copy\tUse SM-initiated p2p transfers instead of the Copy "
      "Engine\n");
  printf(
      "--numElems=<NUM_OF_INT_ELEMS>  Number of integer elements to be used "
      "in the p2p copy.\n");
}
void checkP2Paccess(int numGPUs) {
  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);
    cudaCheckError();

    for (int j = 0; j < numGPUs; j++) {
      int access;
      if (i != j) {
        cudaDeviceCanAccessPeer(&access, i, j);
        cudaCheckError();
        printf("Device=%d %s Access Peer Device=%d\n", i,
               access ? "CAN" : "CANNOT", j);
      }
    }
  }
  printf(
      "\n***NOTE: When a device does not have P2P access to another device, "
      "it falls back to a normal memcpy.\nExpect lower bandwidth (GB/s) and "
      "unstable latency (us) in those cases.\n\n");
}
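
///////////////////////////////////////////////////////////////////////////
// Perform `repeat` copies between two devices, using either the Copy
// Engine (cudaMemcpyPeerAsync) or an SM-initiated kernel copy, depending
// on the --sm_copy flag. The SM path is used only when peer access is
// actually available; otherwise the transfer falls back to the Copy
// Engine. Note: cudaOccupancyMaxPotentialBlockSize returns the minimum
// grid size for full occupancy in its first out-parameter, which this
// sample reuses as the launch grid size.
///////////////////////////////////////////////////////////////////////////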
void performP2PCopy(int *dest, int destDevice, int *src, int srcDevice,
                    int num_elems, int repeat, bool p2paccess,
                    cudaStream_t streamToRun) {
  int blockSize = 0;
  int numBlocks = 0;

  cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p);
  cudaCheckError();

  if (p2p_mechanism == SM && p2paccess) {
    for (int r = 0; r < repeat; r++) {
      // The kernel copies int4s, so it sees num_elems / 4 elements
      copyp2p<<<numBlocks, blockSize, 0, streamToRun>>>(
          (int4 *)dest, (int4 *)src, num_elems / 4);
    }
  } else {
    for (int r = 0; r < repeat; r++) {
      cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice,
                          sizeof(int) * num_elems, streamToRun);
    }
  }
}
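
///////////////////////////////////////////////////////////////////////////
// Unidirectional bandwidth: for every (i, j) pair, queue `repeat` copies
// behind the delay kernel, time them with CUDA events, and report
// bytes / seconds. With the default numElems = 40,000,000 ints, each copy
// moves 160 MB, so one measurement transfers 5 * 160 MB = 0.8 GB (doubled
// for the intra-GPU D2D case, where both the read and the write hit the
// same device's memory).
///////////////////////////////////////////////////////////////////////////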
void outputBandwidthMatrix(int numElems, int numGPUs, bool p2p,
                           P2PDataTransfer p2p_method) {
  int repeat = 5;
  volatile int *flag = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);
  vector<cudaStream_t> stream(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
    cudaMalloc(&buffers[d], numElems * sizeof(int));
    cudaCheckError();
    cudaMemset(buffers[d], 0, numElems * sizeof(int));
    cudaCheckError();
    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
    cudaCheckError();
    cudaMemset(buffersD2D[d], 0, numElems * sizeof(int));
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
  }

  vector<double> bandwidthMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaCheckError();
          cudaDeviceEnablePeerAccess(i, 0);
          cudaCheckError();
          cudaSetDevice(i);
          cudaCheckError();
        }
      }

      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repetitions
      // relatively low.  Higher repetitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      delay<<<1, 1, 0, stream[i]>>>(flag);
      cudaCheckError();
      cudaEventRecord(start[i], stream[i]);
      cudaCheckError();

      if (i == j) {
        // Perform intra-GPU, D2D copies
        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
                       access, stream[i]);
      } else {
        if (p2p_method == P2P_WRITE) {
          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
                         stream[i]);
        } else {
          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
                         stream[i]);
        }
      }

      cudaEventRecord(stop[i], stream[i]);
      cudaCheckError();

      // Release the queued events
      *flag = 1;
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      float time_ms;
      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
      double time_s = time_ms / 1e3;

      double gb = numElems * sizeof(int) * repeat / (double)1e9;
      if (i == j) {
        gb *= 2;  // must count both the read and the write here
      }
      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
      if (p2p && access) {
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
        cudaSetDevice(i);
        cudaCheckError();
      }
    }
  }

  printf("   D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream[d]);
    cudaCheckError();
  }

  cudaFreeHost((void *)flag);
  cudaCheckError();
}
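
///////////////////////////////////////////////////////////////////////////
// Bidirectional bandwidth: two streams issue copies in opposite directions
// at the same time. stream1 is held back on stream0's start event, so a
// single (start[i], stop[i]) event pair on stream0 brackets both
// directions; hence the 2.0x factor in the byte count below.
///////////////////////////////////////////////////////////////////////////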
void outputBidirectionalBandwidthMatrix(int numElems, int numGPUs, bool p2p) {
  int repeat = 5;
  volatile int *flag = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);
  vector<cudaStream_t> stream0(numGPUs);
  vector<cudaStream_t> stream1(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaMalloc(&buffers[d], numElems * sizeof(int));
    cudaMemset(buffers[d], 0, numElems * sizeof(int));
    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
    cudaMemset(buffersD2D[d], 0, numElems * sizeof(int));
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
    cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking);
    cudaCheckError();
    cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking);
    cudaCheckError();
  }

  vector<double> bandwidthMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaSetDevice(i);
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaDeviceEnablePeerAccess(i, 0);
          cudaCheckError();
        }
      }

      cudaSetDevice(i);
      cudaStreamSynchronize(stream0[i]);
      cudaStreamSynchronize(stream1[j]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repetitions
      // relatively low.  Higher repetitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      cudaSetDevice(i);
      // No need to block stream1 since it'll be blocked on stream0's event
      delay<<<1, 1, 0, stream0[i]>>>(flag);
      cudaCheckError();

      // Force stream1 not to start until stream0 does, in order to ensure
      // the events on stream0 fully encompass the time needed for all
      // operations
      cudaEventRecord(start[i], stream0[i]);
      cudaStreamWaitEvent(stream1[j], start[i], 0);

      if (i == j) {
        // For intra-GPU perform 2 memcopies buffersD2D <-> buffers
        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
                       access, stream0[i]);
        performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat,
                       access, stream1[i]);
      } else {
        if (access && p2p_mechanism == SM) {
          cudaSetDevice(j);
        }
        performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
                       stream1[j]);
        if (access && p2p_mechanism == SM) {
          cudaSetDevice(i);
        }
        performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
                       stream0[i]);
      }

      // Notify stream0 that stream1 is complete and record the time of
      // the total transaction
      cudaEventRecord(stop[j], stream1[j]);
      cudaStreamWaitEvent(stream0[i], stop[j], 0);
      cudaEventRecord(stop[i], stream0[i]);

      // Release the queued operations
      *flag = 1;
      cudaStreamSynchronize(stream0[i]);
      cudaStreamSynchronize(stream1[j]);
      cudaCheckError();

      float time_ms;
      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
      double time_s = time_ms / 1e3;

      double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9;
      if (i == j) {
        gb *= 2;  // must count both the read and the write here
      }
      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
      if (p2p && access) {
        cudaSetDevice(i);
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
      }
    }
  }

  printf("   D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream0[d]);
    cudaCheckError();
    cudaStreamDestroy(stream1[d]);
    cudaCheckError();
  }

  cudaFreeHost((void *)flag);
  cudaCheckError();
}
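
///////////////////////////////////////////////////////////////////////////
// Latency: copy a single int4 (numElems = 4 ints) `repeat` times and
// report the per-copy time in microseconds, from both the GPU's
// perspective (CUDA events) and the CPU's (wall-clock time to enqueue the
// work, measured with the SDK stop watch while the stream is blocked).
///////////////////////////////////////////////////////////////////////////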
void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
  int repeat = 100;
  int numElems = 4;  // perform 1-int4 transfer.
  volatile int *flag = NULL;
  StopWatchInterface *stopWatch = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
  vector<cudaStream_t> stream(numGPUs);
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  if (!sdkCreateTimer(&stopWatch)) {
    printf("Failed to create stop watch\n");
    exit(EXIT_FAILURE);
  }
  sdkStartTimer(&stopWatch);

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
    cudaMalloc(&buffers[d], sizeof(int) * numElems);
    cudaMemset(buffers[d], 0, sizeof(int) * numElems);
    cudaMalloc(&buffersD2D[d], sizeof(int) * numElems);
    cudaMemset(buffersD2D[d], 0, sizeof(int) * numElems);
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
  }

  vector<double> gpuLatencyMatrix(numGPUs * numGPUs);
  vector<double> cpuLatencyMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaDeviceEnablePeerAccess(i, 0);
          cudaSetDevice(i);
          cudaCheckError();
        }
      }
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repetitions
      // relatively low.  Higher repetitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      delay<<<1, 1, 0, stream[i]>>>(flag);
      cudaCheckError();
      cudaEventRecord(start[i], stream[i]);

      sdkResetTimer(&stopWatch);
      if (i == j) {
        // Perform intra-GPU, D2D copies
        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
                       access, stream[i]);
      } else {
        if (p2p_method == P2P_WRITE) {
          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
                         stream[i]);
        } else {
          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
                         stream[i]);
        }
      }
      float cpu_time_ms = sdkGetTimerValue(&stopWatch);

      cudaEventRecord(stop[i], stream[i]);
      // Now that the work has been queued up, release the stream
      *flag = 1;
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      float gpu_time_ms;
      cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]);

      gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat;
      cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat;
      if (p2p && access) {
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
        cudaSetDevice(i);
        cudaCheckError();
      }
    }
  }

  printf("   GPU");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  printf("\n   CPU");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream[d]);
    cudaCheckError();
  }

  sdkDeleteTimer(&stopWatch);

  cudaFreeHost((void *)flag);
  cudaCheckError();
}
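
///////////////////////////////////////////////////////////////////////////
// Driver: enumerate devices, print the P2P connectivity matrix, then run
// the unidirectional and bidirectional bandwidth tests and the latency
// tests, each with P2P disabled and enabled.
///////////////////////////////////////////////////////////////////////////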
int main(int argc, char **argv) {
  int numGPUs, numElems = 40000000;
  P2PDataTransfer p2p_method = P2P_WRITE;

  cudaGetDeviceCount(&numGPUs);
  cudaCheckError();

  // process command line args
  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    printHelp();
    return 0;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) {
    p2p_method = P2P_READ;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) {
    p2p_mechanism = SM;
  }

  // number of elements of int to be used in copy.
  if (checkCmdLineFlag(argc, (const char **)argv, "numElems")) {
    numElems = getCmdLineArgumentInt(argc, (const char **)argv, "numElems");
  }

  printf("[%s]\n", sSampleName);

  // output devices
  for (int i = 0; i < numGPUs; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    cudaCheckError();
    printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n",
           i, prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID);
  }

  checkP2Paccess(numGPUs);

  // Check peer-to-peer connectivity
  printf("P2P Connectivity Matrix\n");
  printf("     D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d", j);
  }
  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d\t", i);
    for (int j = 0; j < numGPUs; j++) {
      if (i != j) {
        int access;
        cudaDeviceCanAccessPeer(&access, i, j);
        cudaCheckError();
        printf("%6d", (access) ? 1 : 0);
      } else {
        printf("%6d", 1);
      }
    }
    printf("\n");
  }

  printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
  outputBandwidthMatrix(numElems, numGPUs, false, P2P_WRITE);

  printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n");
  outputBandwidthMatrix(numElems, numGPUs, true, P2P_WRITE);

  if (p2p_method == P2P_READ) {
    printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n");
    outputBandwidthMatrix(numElems, numGPUs, true, p2p_method);
  }

  printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
  outputBidirectionalBandwidthMatrix(numElems, numGPUs, false);

  printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n");
  outputBidirectionalBandwidthMatrix(numElems, numGPUs, true);

  printf("P2P=Disabled Latency Matrix (us)\n");
  outputLatencyMatrix(numGPUs, false, P2P_WRITE);

  printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n");
  outputLatencyMatrix(numGPUs, true, P2P_WRITE);

  if (p2p_method == P2P_READ) {
    printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n");
    outputLatencyMatrix(numGPUs, true, p2p_method);
  }

  printf(
      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n");

  exit(EXIT_SUCCESS);
}