// cuda_matmul.cu — GPU vs CPU matrix multiplication timing demo
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
//#define VERIFY
//uncomment above to print difference between CPU and GPU calculations
  8. __global__ void matmul_kernel(
  9. const float* M1,
  10. const float* M2,
  11. float* M3,
  12. const int m,
  13. const int n,
  14. const int p
  15. )
  16. {
  17. /*
  18. CUDA kernel for matrix multiplication M3 = M1 * M2
  19. This function will be executed by every CUDA thread
  20. The instructions are the same, but each thread will work
  21. on a separate chunk of the data, as specified by the array indices.
  22. Note that the kernel definition is preceded by the __global__
  23. qualifier. Further, the kernel function returns nothing (void)
  24. Thus, we must modify the output matrix M3 within this function.
  25. The changes made to M3 (or M1 and M2) will all be visible outside
  26. the kernel to CPU and GPU memory after the kernel has executed.
  27. */
  28. //Get the x and y indices of output entry for this thread
  29. int i = blockIdx.y * blockDim.y + threadIdx.y;
  30. int j = blockIdx.x * blockDim.x + threadIdx.x;
  31. /*
  32. Wait! what are blockDim, blockIdx and threadIdx??
  33. These are structs provided by CUDA, which tells the thread
  34. how many blocks have been launched, what block number does
  35. the current thread reside in and finally, what is the x and y
  36. index of the current thread within the block.
  37. These variables allow each thread to choose which sub-section
  38. of the A, B and C matrices it should work on and we use them next.
  39. */
  40. if ((i>=m)||(j>=p))
  41. {
  42. return;
  43. //this just means that dont process anything outside the
  44. //bounds of the output matrix size
  45. }
  46. float cout=0.0;
  47. //this is a local variable we have defined within the thread
  48. //so, this variable will reside in register memory as explained earlier
  49. for (int k=0; k<n; k++)
  50. {
  51. cout += M1[i*n + k]*M2[k*p + j];
  52. //loop through elements of one row of M1 and
  53. //one column of M2, multiply corresponding elements
  54. //and add them up. We are just doing standard matrix
  55. //multiplication.
  56. }
  57. M3[i*p+j] = cout;
  58. //here we modify M3
  59. }
  60. int main(int argc, char* argv[])
  61. {
  62. /*
  63. In this demo, we will create matrices of size
  64. A: M x N
  65. B: N x P
  66. C: M x P <-- for GPU
  67. D: M x P <-- for CPU
  68. We will initialize A, B, C, D and perform matrix multiplications:
  69. C = A*B (on GPU)
  70. D = A*B (on CPU)
  71. */
  72. if (argc != 4)
  73. {
  74. printf("Matrix multiplication example for A[MxN] and B[NxP]\nUsage: cu_mm.out M N P\n");
  75. exit(1);
  76. }
  77. int M=atoi(argv[1]); //2049;
  78. int N=atoi(argv[2]); //257;
  79. int P=atoi(argv[3]); //512;
  80. float *A, *B, *C, *D;
  81. /*
  82. Let's use unified memory
  83. cudaMallocManaged allows us to allocate memory
  84. once and use it across both CPU and GPU.
  85. */
  86. cudaMallocManaged(&A, M*N*sizeof(float));//input Mat1
  87. cudaMallocManaged(&B, N*P*sizeof(float));//input Mat2
  88. cudaMallocManaged(&C, M*P*sizeof(float));//output Mat for GPU
  89. cudaMallocManaged(&D, M*P*sizeof(float));//output Mat for CPU
  90. //we will do matmul in both CPU and GPU and compare the execution times
  91. for (int i=0; i<M*N; i++)
  92. {
  93. A[i]=sin((float)i/100);
  94. //init with sine of index, just as an example
  95. }
  96. for (int i=0; i<N*P; i++)
  97. {
  98. B[i]=cos((float)i/100);
  99. //init with sine of index, just as an example
  100. }
  101. //C and D can be left uninitialized
  102. float elapsed_time_gpu=0.0;
  103. double elapsed_time_cpu=0.0;
  104. cudaEvent_t gpu_start, gpu_stop;
  105. struct timespec cpu_start, cpu_stop;
  106. //BEGIN GPU MATMUL
  107. dim3 blocks_per_grid(ceil(M/32),ceil(P/32));
  108. dim3 threads_per_block(32, 32);
  109. /*
  110. We use CUDA events to accurately measure the time taken by matmul op
  111. Refer to page 16 of CUDA C++ Best Practices Guide:
  112. https://docs.nvidia.com/cuda/pdf/CUDA_C_Best_Practices_Guide.pdf
  113. */
  114. cudaEventCreate(&gpu_start);
  115. cudaEventCreate(&gpu_stop);
  116. cudaEventRecord(gpu_start, 0);
  117. matmul_kernel<<<blocks_per_grid, threads_per_block>>>(A, B, C, M, N, P);
  118. cudaEventRecord(gpu_stop, 0);
  119. cudaEventSynchronize(gpu_stop);
  120. //END GPU MATMUL
  121. timespec_get(&cpu_start, TIME_UTC);
  122. //BEGIN CPU MATMUL
  123. for (int i=0; i<M; i++)
  124. {
  125. for (int j=0; j< P; j++)
  126. {
  127. float cout=0.0;
  128. for(int k=0; k<N; k++)
  129. {
  130. cout+=A[i*N+k]*B[k*P+j];
  131. }
  132. D[i*P+j]=cout;
  133. }
  134. }
  135. //END CPU MATMUL
  136. timespec_get(&cpu_stop, TIME_UTC);
  137. //Measure elapsed times
  138. cudaEventElapsedTime(&elapsed_time_gpu, gpu_start, gpu_stop);
  139. elapsed_time_cpu = ((double)(cpu_stop.tv_sec - cpu_start.tv_sec)) * 1000000 + ((double)(cpu_stop.tv_nsec - cpu_start.tv_nsec)) / 1000;
  140. //tv_nsec is in nanoseconds
  141. /*
  142. Define VERIFY above to print diffs for the
  143. first 100 entries
  144. you will get all values very close to zero
  145. */
  146. #ifdef VERIFY
  147. for (int i=0; i<100; i++)
  148. {
  149. float diff=C[i]-D[i];
  150. printf("%f, ", diff);
  151. }
  152. printf("\n");
  153. #endif
  154. //convert microseconds to milliseconds
  155. printf("Elapsed time (CPU)= %f milliseconds\n", elapsed_time_cpu/1000);
  156. printf("Elapsed time (GPU)= %f milliseconds\n", elapsed_time_gpu);
  157. //cudaEventElapsedTime reports time in milliseconds
  158. cudaFree(A);
  159. cudaFree(B);
  160. cudaFree(C);
  161. cudaFree(D);
  162. }