// vector_addition_gpu_thread_block.cu
// Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 512
  5. void host_add(int *a, int *b, int *c) {
  6. for(int idx=0;idx<N;idx++)
  7. c[idx] = a[idx] + b[idx];
  8. }
  9. __global__ void device_add(int *a, int *b, int *c) {
  10. int index = threadIdx.x + blockIdx.x * blockDim.x;
  11. c[index] = a[index] + b[index];
  12. }
  13. //basically just fills the array with index.
  14. void fill_array(int *data) {
  15. for(int idx=0;idx<N;idx++)
  16. data[idx] = idx;
  17. }
  18. void print_output(int *a, int *b, int*c) {
  19. for(int idx=0;idx<N;idx++)
  20. printf("\n %d + %d = %d", a[idx] , b[idx], c[idx]);
  21. }
  22. int main(void) {
  23. int *a, *b, *c;
  24. int *d_a, *d_b, *d_c; // device copies of a, b, c
  25. int threads_per_block=0, no_of_blocks=0;
  26. int size = N * sizeof(int);
  27. // Alloc space for host copies of a, b, c and setup input values
  28. a = (int *)malloc(size); fill_array(a);
  29. b = (int *)malloc(size); fill_array(b);
  30. c = (int *)malloc(size);
  31. // Alloc space for device copies of a, b, c
  32. cudaMalloc((void **)&d_a, size);
  33. cudaMalloc((void **)&d_b, size);
  34. cudaMalloc((void **)&d_c, size);
  35. // Copy inputs to device
  36. cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
  37. cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
  38. threads_per_block = 4;
  39. no_of_blocks = N/threads_per_block;
  40. device_add<<<no_of_blocks,threads_per_block>>>(d_a,d_b,d_c);
  41. // Copy result back to host
  42. cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
  43. print_output(a,b,c);
  44. free(a); free(b); free(c);
  45. cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
  46. return 0;
  47. }