#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <time.h>

//#define VERIFY
//uncomment the line above to print the difference between CPU and GPU results
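/*
    A minimal error-checking helper (our addition, not part of the original
    demo): the CUDA API calls below return a cudaError_t that this demo
    ignores. Wrapping calls, e.g. CUDA_CHECK(cudaMallocManaged(&A, ...)),
    surfaces failures such as out-of-memory immediately instead of silently.
*/
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err__ = (call);                                   \
        if (err__ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err__), __FILE__, __LINE__);   \
            exit(1);                                                  \
        }                                                             \
    } while (0)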
__global__ void matmul_kernel(
    const float* M1,
    const float* M2,
    float* M3,
    const int m,
    const int n,
    const int p
)
{
    /*
    CUDA kernel for the matrix multiplication M3 = M1 * M2.
    This function is executed by every CUDA thread. The instructions are
    the same for all threads, but each thread works on a separate chunk
    of the data, as selected by the array indices below.
    Note that the kernel definition is preceded by the __global__
    qualifier, and that the kernel returns nothing (void): we must write
    the results into the output matrix M3 inside this function. Changes
    made to M3 (or M1 and M2) are visible to both CPU and GPU memory
    after the kernel has finished executing.
    */
    //Get the row (i) and column (j) indices of the output entry for this thread
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    /*
    Wait! What are blockDim, blockIdx and threadIdx?
    These are built-in variables provided by CUDA. They tell each thread
    the dimensions of a thread block (blockDim), which block the current
    thread resides in (blockIdx), and the thread's own x and y position
    within that block (threadIdx). Together they let every thread pick
    the sub-section of the M1, M2 and M3 matrices (A, B and C in main)
    it should work on, as we do above.
    */
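    /*
    A worked example (our numbers, for illustration): with 32x32 thread
    blocks, the thread at threadIdx = (x=3, y=4) inside block
    blockIdx = (x=1, y=2) computes
        i = 2*32 + 4 = 68   (row 68 of the output)
        j = 1*32 + 3 = 35   (column 35 of the output)
    so this thread is responsible for the single entry at row 68,
    column 35 of M3.
    */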
    if ((i >= m) || (j >= p))
    {
        //threads that fall outside the bounds of the output matrix
        //simply do nothing
        return;
    }
    float acc = 0.0f;
    //this is a local variable defined within the thread, so it will
    //reside in register memory, as explained earlier

    for (int k = 0; k < n; k++)
    {
        //loop through the elements of one row of M1 and one column of
        //M2, multiply corresponding elements and add them up.
        //This is just standard matrix multiplication.
        acc += M1[i*n + k] * M2[k*p + j];
    }
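    /*
    Note on the indexing above: the matrices are stored in row-major
    order, so entry (r, c) of a matrix with C columns lives at flat
    index r*C + c. For example (our numbers), with n = 4, entry (2, 1)
    of M1 is at index 2*4 + 1 = 9. That is why row i of M1 is
    M1[i*n + k] and column j of M2 is M2[k*p + j] as k runs from
    0 to n-1.
    */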

    M3[i*p + j] = acc;
    //here we write this thread's result into M3
}

int main(int argc, char* argv[])
{
    /*
    In this demo, we create matrices of size
        A: M x N
        B: N x P
        C: M x P  <-- output for GPU
        D: M x P  <-- output for CPU

    We initialize A and B and perform the matrix multiplications:
        C = A*B (on GPU)
        D = A*B (on CPU)
    */
    if (argc != 4)
    {
        printf("Matrix multiplication example for A[MxN] and B[NxP]\nUsage: cu_mm.out M N P\n");
        exit(1);
    }
    int M = atoi(argv[1]); //e.g., 2049
    int N = atoi(argv[2]); //e.g., 257
    int P = atoi(argv[3]); //e.g., 512
    float *A, *B, *C, *D;
    /*
    Let's use unified memory:
    cudaMallocManaged allows us to allocate memory once and use it
    from both the CPU and the GPU.
    */
    cudaMallocManaged(&A, M*N*sizeof(float)); //input Mat1
    cudaMallocManaged(&B, N*P*sizeof(float)); //input Mat2

    cudaMallocManaged(&C, M*P*sizeof(float)); //output Mat for GPU
    cudaMallocManaged(&D, M*P*sizeof(float)); //output Mat for CPU
    //we will do the matmul on both CPU and GPU and compare execution times
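    /*
    For reference (a sketch, not used by this demo): without unified
    memory, the same buffers would need explicit device allocations and
    copies, e.g.

        float *dA;
        cudaMalloc(&dA, M*N*sizeof(float));
        cudaMemcpy(dA, hostA, M*N*sizeof(float), cudaMemcpyHostToDevice);
        //...launch the kernel on device buffers, then copy results back:
        cudaMemcpy(hostC, dC, M*P*sizeof(float), cudaMemcpyDeviceToHost);
        cudaFree(dA);

    where hostA and hostC are ordinary CPU-side arrays (names are ours).
    Unified memory lets us skip the explicit copies entirely.
    */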
    for (int i = 0; i < M*N; i++)
    {
        A[i] = sin((float)i/100);
        //init with the sine of the index, just as an example
    }
    for (int i = 0; i < N*P; i++)
    {
        B[i] = cos((float)i/100);
        //init with the cosine of the index, just as an example
    }
    //C and D can be left uninitialized
    float elapsed_time_gpu = 0.0;
    double elapsed_time_cpu = 0.0;
    cudaEvent_t gpu_start, gpu_stop;
    struct timespec cpu_start, cpu_stop;
    //BEGIN GPU MATMUL
    //Note: the grid's x dimension must cover the P columns and its y
    //dimension the M rows, matching the i/j computation in the kernel.
    //We round up with integer arithmetic; calling ceil() on M/32 would
    //do nothing, because the integer division has already truncated.
    dim3 blocks_per_grid((P + 31) / 32, (M + 31) / 32);
    dim3 threads_per_block(32, 32);
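    /*
    A quick check of the round-up arithmetic (our numbers): for M = 2049
    rows, (2049 + 31) / 32 = 65 blocks in y, covering 65*32 = 2080 rows;
    the kernel's bounds check discards the 31 extra rows. Plain integer
    division, 2049/32 = 64, would cover only 2048 rows and silently drop
    the last one.
    */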
    /*
    We use CUDA events to accurately measure the time taken by the matmul op.
    Refer to page 16 of the CUDA C++ Best Practices Guide:
    https://docs.nvidia.com/cuda/pdf/CUDA_C_Best_Practices_Guide.pdf
    */
    cudaEventCreate(&gpu_start);
    cudaEventCreate(&gpu_stop);
    cudaEventRecord(gpu_start, 0);
    matmul_kernel<<<blocks_per_grid, threads_per_block>>>(A, B, C, M, N, P);
    cudaEventRecord(gpu_stop, 0);
    cudaEventSynchronize(gpu_stop);
    //cudaEventSynchronize blocks until the kernel and the stop event have
    //completed, so it is also safe for the CPU to read C after this point
    //END GPU MATMUL

    timespec_get(&cpu_start, TIME_UTC);
    //BEGIN CPU MATMUL
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < P; j++)
        {
            float acc = 0.0f;
            for (int k = 0; k < N; k++)
            {
                acc += A[i*N + k] * B[k*P + j];
            }
            D[i*P + j] = acc;
        }
    }
    //END CPU MATMUL
    timespec_get(&cpu_stop, TIME_UTC);

    //Measure elapsed times
    cudaEventElapsedTime(&elapsed_time_gpu, gpu_start, gpu_stop);
    elapsed_time_cpu = ((double)(cpu_stop.tv_sec - cpu_start.tv_sec)) * 1000000 + ((double)(cpu_stop.tv_nsec - cpu_start.tv_nsec)) / 1000;
    //tv_sec is in seconds and tv_nsec in nanoseconds, so this yields microseconds
    /*
    Define VERIFY above to print the differences between the first 100
    entries of C (GPU) and D (CPU); all values should be very close to zero.
    */
    #ifdef VERIFY
    for (int i = 0; i < 100; i++)
    {
        float diff = C[i] - D[i];
        printf("%f, ", diff);
    }
    printf("\n");
    #endif
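    /*
    A stricter check (a sketch, our addition): rather than eyeballing the
    printed diffs, compare every entry against a small tolerance. Exact
    equality is too strict, because float addition is not associative and
    the GPU may reorder or fuse operations:

        for (int i = 0; i < M*P; i++)
        {
            if (fabs(C[i] - D[i]) > 1e-3)
            {
                printf("Mismatch at %d: %f vs %f\n", i, C[i], D[i]);
            }
        }
    */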
    //convert microseconds to milliseconds for printing
    printf("Elapsed time (CPU)= %f milliseconds\n", elapsed_time_cpu/1000);
    printf("Elapsed time (GPU)= %f milliseconds\n", elapsed_time_gpu);
    //cudaEventElapsedTime already reports time in milliseconds

    cudaEventDestroy(gpu_start);
    cudaEventDestroy(gpu_stop);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    cudaFree(D);
    return 0;
}
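/*
Build and run (assuming this file is saved as cu_mm.cu; the output name
matches the usage message above):

    nvcc cu_mm.cu -o cu_mm.out
    ./cu_mm.out 2049 257 512
*/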