Implementation of Matrix multiplication using CUDA

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/*
 * This is a program to implement matrix multiplication using CUDA.
 * Each GPU thread computes one element of the resultant matrix in parallel.
 */

#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#include<time.h>

#define N 1024

/*
 * matmul: adds the product a * b to c, where a, b and c are N x N int
 * matrices stored row-major.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread handles the single
 * result element whose column comes from the x index and whose row comes
 * from the y index. Threads that fall outside the N x N matrix do nothing,
 * so the grid may safely over-cover the matrix.
 *
 * a, b: input matrices (only read).
 * c:    result matrix; the dot product is ADDED to the value already stored
 *       there, so the caller must initialize c (zero for a plain product).
 */
__global__ void matmul(int *a, int *b, int *c)
{
 int row = blockIdx.y * blockDim.y + threadIdx.y;  //Find row number of the resultant element
 int col = blockIdx.x * blockDim.x + threadIdx.x;  //Find column number of the resultant element

 //Guard against threads that lie outside the matrix (grid larger than N x N)
 if(row >= N || col >= N)
 {
  return;
 }

 //Accumulate locally so c is read and written once instead of N times
 int sum = 0;
 for(int k = 0;k<N;k++)
 {
  sum += a[row * N + k] * b[k*N + col];
 }

 c[row * N + col] += sum;
}
/*
 * Abort the program with a diagnostic if a CUDA runtime call failed.
 * status: value returned by the CUDA call; what: short description of the step.
 */
static void checkCuda(cudaError_t status, const char *what)
{
 if(status != cudaSuccess)
 {
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  exit(EXIT_FAILURE);
 }
}

/*
 * Host driver: fills two N x N matrices with ones, multiplies them with the
 * matmul kernel, reports the elapsed CPU time (allocation, transfers and
 * kernel included) and prints the resulting matrix.
 * Returns 0 on success; exits with a failure status on any allocation,
 * transfer or kernel error.
 */
int main()
{
 const int tile = 16;              //threads per block along each dimension
 int *a, *b, *c;
 int *deva = NULL, *devb = NULL, *devc = NULL;
 int i, j;
 size_t size = sizeof(int) * N * N;

 //clock_t keeps the full range of clock(); an unsigned int could truncate it
 clock_t start_time = clock();

 //The grid below uses exact division, so N must be a multiple of the tile
 if(N % tile != 0)
 {
  fprintf(stderr, "N (%d) must be a multiple of %d\n", N, tile);
  return EXIT_FAILURE;
 }

 //allocate memory in host
 a = (int *)malloc(size);
 b = (int *)malloc(size);
 c = (int *)malloc(size);
 if(a == NULL || b == NULL || c == NULL)
 {
  fprintf(stderr, "Host memory allocation failed\n");
  free(a);
  free(b);
  free(c);
  return EXIT_FAILURE;
 }

 //allocate memory in CUDA (device) memory
 checkCuda(cudaMalloc((void **)&deva, size), "cudaMalloc(deva)");
 checkCuda(cudaMalloc((void **)&devb, size), "cudaMalloc(devb)");
 checkCuda(cudaMalloc((void **)&devc, size), "cudaMalloc(devc)");

 //putting some values in memory for computation
 for(i=0;i<N;i++)
 {
  for(j=0;j<N;j++)
  {
   a[i*N + j] = b[i*N +j] = 1;
   c[i*N+j] = 0;   //the kernel accumulates into c, so it must start at zero
  }
 }

 //copy host memory data in CUDA (device) memory
 checkCuda(cudaMemcpy(deva, a, size, cudaMemcpyHostToDevice), "copy a to device");
 checkCuda(cudaMemcpy(devb, b, size, cudaMemcpyHostToDevice), "copy b to device");
 checkCuda(cudaMemcpy(devc, c, size, cudaMemcpyHostToDevice), "copy c to device");

 // launch a kernel for doing matrix multiplication:
 // one thread per result element, tile x tile threads per block
 dim3 griddim(N/tile, N/tile);
 dim3 blockdim(tile, tile);

 matmul<<<griddim, blockdim>>>(deva, devb, devc);
 checkCuda(cudaGetLastError(), "matmul launch");          //bad launch configuration
 checkCuda(cudaDeviceSynchronize(), "matmul execution");  //faults raised while running

 //copy the result back into host memory
 checkCuda(cudaMemcpy(c, devc, size, cudaMemcpyDeviceToHost), "copy c to host");

 //Lets see the execution time (fractional seconds; integer division would show 0)
 double elapsed = (double)(clock() - start_time) / CLOCKS_PER_SEC;
 printf("\nExecution time of %d x %d matrix multiplication: %.3f seconds ", N, N, elapsed);

 //print the result
 for(i=0;i<N;i++)
 {
  printf("\n");
  for(j=0;j<N;j++)
  {
   printf("%d ", c[i*N + j]);
  }
 }

 //release device and host memory
 checkCuda(cudaFree(deva), "cudaFree(deva)");
 checkCuda(cudaFree(devb), "cudaFree(devb)");
 checkCuda(cudaFree(devc), "cudaFree(devc)");
 free(a);
 free(b);
 free(c);

 return 0;
}

No comments: