matMul.cu
#include "stdio.h"
#include "stdlib.h"
#define BLOCKDIM 32
#define BLOCKDIMP BLOCKDIM+1
//naive/slow kernel
__global__ void multiplyMatrices1Naive(float *a, float *b, float *c, int N){
    int xid = blockIdx.x*blockDim.x + threadIdx.x;
    int yid = blockIdx.y*blockDim.y + threadIdx.y;
    if (xid<N && yid<N){
        float result = 0;
        //dot product of row xid of a with column yid of b, all from global memory
        for (int i=0; i<N; i++){
            result += a[xid*N+i]*b[i*N+yid];
        }
        c[xid*N+yid] = result;
    }
    return ;
}
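/*
 * Rough cost note (added, ignoring caches): in the naive kernel each of the
 * N*N threads reads 2*N floats from global memory, about 2*N^3 reads in total.
 * The tiled kernels below stage BLOCKDIM x BLOCKDIM tiles in shared memory,
 * which cuts the global reads to roughly 2*N^3/BLOCKDIM.
 */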
//faster kernel: stages BLOCKDIM x BLOCKDIM tiles of a and b in shared memory,
//so each element loaded from global memory is reused by the whole thread block
__global__ void multiplyMatrices2Shared(float *a, float *b, float *c, int N){
    int xid = blockIdx.x*blockDim.x + threadIdx.x;
    int yid = blockIdx.y*blockDim.y + threadIdx.y;
    int xtid = threadIdx.x;
    int ytid = threadIdx.y;
    int nTiles = gridDim.x;
    __shared__ float aS[BLOCKDIM][BLOCKDIM];
    __shared__ float bS[BLOCKDIM][BLOCKDIM];
    //assumes N is a multiple of BLOCKDIM, so all threads of a block take the same branch
    if (xid>=N || yid>=N) return;
    float result = 0;
    for (int k1=0; k1<nTiles; k1++)
    {
        int aRow = xid;
        int bCol = yid;
        int aCol = k1*blockDim.x + ytid;
        int bRow = k1*blockDim.x + xtid;
        //load tiles into shared memory: one element of a and one of b per thread
        aS[xtid][ytid] = a[ aRow*N + aCol ];
        bS[xtid][ytid] = b[ bRow*N + bCol ];
        __syncthreads();
        for (int k=0; k<blockDim.x; k++){
            result += aS[xtid][k]*bS[k][ytid];
        }
        //wait for all threads to finish with this tile before it is overwritten
        __syncthreads();
    }
    c[xid*N+yid] = result;
    return ;
}
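/*
 * Worked example (added, using the launch configuration assumed in main below,
 * N = 1024 and 32x32 blocks): gridDim.x = 32, so nTiles = 32. Thread
 * (xtid, ytid) of block (bx, by) owns output element
 * c[(bx*32+xtid)*N + by*32+ytid] and accumulates it from 32 tile pairs, each
 * contributing a partial dot product of length 32 computed out of shared memory.
 */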
//faster still: same tiled scheme, but the shared-memory tiles are padded
//by one column (BLOCKDIMP) to avoid bank conflicts
__global__ void multiplyMatrices3Pad(float *a, float *b, float *c, int N){
    int xid = blockIdx.x*blockDim.x + threadIdx.x;
    int yid = blockIdx.y*blockDim.y + threadIdx.y;
    int xtid = threadIdx.x;
    int ytid = threadIdx.y;
    int nTiles = gridDim.x;
    //rows padded to BLOCKDIMP floats so column accesses hit different banks
    __shared__ float aS[BLOCKDIM][BLOCKDIMP];
    __shared__ float bS[BLOCKDIM][BLOCKDIMP];
    if (xid>=N || yid>=N) return;
    float result = 0;
    for (int k1=0; k1<nTiles; k1++)
    {
        int aRow = xid;
        int bCol = yid;
        int aCol = k1*blockDim.x + ytid;
        int bRow = k1*blockDim.x + xtid;
        //load tiles into shared memory
        aS[xtid][ytid] = a[ aRow*N + aCol ];
        bS[xtid][ytid] = b[ bRow*N + bCol ];
        __syncthreads();
        for (int k=0; k<blockDim.x; k++){
            result += aS[xtid][k]*bS[k][ytid];
        }
        __syncthreads();
    }
    c[xid*N+yid] = result;
    return ;
}
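/*
 * Why the padding helps (added note): shared memory is split into 32 four-byte
 * banks, and bank = (index into the float array) mod 32. With an unpadded
 * 32x32 tile, the read aS[xtid][k] in the inner loop sends all 32 threads of a
 * warp (consecutive xtid, same ytid) to the same bank k, a 32-way conflict.
 * Padding each row to BLOCKDIMP = 33 floats shifts successive rows into
 * different banks, so those accesses are conflict-free.
 */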
//same tiled, padded scheme, but the b tile is stored transposed so that
//the inner loop walks along rows of both shared-memory tiles
__global__ void multiplyMatrices4Coalesce(float *a, float *b, float *c, int N){
    int xid = blockIdx.x*blockDim.x + threadIdx.x;
    int yid = blockIdx.y*blockDim.y + threadIdx.y;
    int xtid = threadIdx.x;
    int ytid = threadIdx.y;
    int nTiles = gridDim.x;
    __shared__ float aS[BLOCKDIM][BLOCKDIMP];
    __shared__ float bS[BLOCKDIM][BLOCKDIMP];
    if (xid>=N || yid>=N) return;
    float result = 0;
    for (int k1=0; k1<nTiles; k1++)
    {
        int aRow = xid;
        int bCol = yid;
        int aCol = k1*blockDim.x + ytid;
        int bRow = k1*blockDim.x + xtid;
        //store the b tile transposed in shared memory
        //so the inner loop reads both tiles along their rows
        aS[xtid][ytid] = a[ aRow*N + aCol ];
        bS[ytid][xtid] = b[ bRow*N + bCol ];
        __syncthreads();
        for (int k=0; k<blockDim.x; k++){
            result += aS[xtid][k]*bS[ytid][k];
        }
        __syncthreads();
    }
    c[xid*N+yid] = result;
    return ;
}
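//added helper (not part of the original file): a plain CPU matrix multiply,
//using the same row-major layout as the kernels above, handy for
//spot-checking the GPU result on small N
void multiplyMatricesCPU(float *a, float *b, float *c, int N){
    for (int row=0; row<N; row++){
        for (int col=0; col<N; col++){
            float result = 0;
            for (int i=0; i<N; i++){
                result += a[row*N+i]*b[i*N+col];
            }
            c[row*N+col] = result;
        }
    }
}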
int main(){
    int N = 1024;  //size of each dimension
    int N2 = N*N;  //number of elements in the whole matrix
    float *a_cpu = (float*) malloc(sizeof(float)*N2);
    float *b_cpu = (float*) malloc(sizeof(float)*N2);
    float *c_cpu = (float*) malloc(sizeof(float)*N2);
    //fill them with values; this could also be done on the gpu
    for (int i=0; i<N2; i++)
    {
        a_cpu[i] = 1.0f;
        b_cpu[i] = 1.0f;
    }
    //now we have to allocate memory on the gpu
    float *a;
    float *b;
    float *c;
    cudaMalloc( &a, sizeof(float)*N2);
    cudaMalloc( &b, sizeof(float)*N2);
    cudaMalloc( &c, sizeof(float)*N2);
    //copy the cpu arrays to gpu memory
    cudaMemcpy(a, a_cpu, N2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(b, b_cpu, N2*sizeof(float), cudaMemcpyHostToDevice);
    //block size should be a multiple of 32 and must match BLOCKDIM for the shared-memory kernels
    int blockSize = BLOCKDIM;
    int nBlocks = (N + blockSize - 1) / blockSize;
    dim3 grid (nBlocks, nBlocks);
    dim3 block (blockSize, blockSize);
    printf("blocks %i, threads per block %i\n", nBlocks, blockSize);
    //launch the gpu kernels
    for (int i=0; i<50; i++){
        multiplyMatrices1Naive<<<grid, block>>>(a, b, c, N);
        multiplyMatrices2Shared<<<grid, block>>>(a, b, c, N);
        multiplyMatrices3Pad<<<grid, block>>>(a, b, c, N);
        multiplyMatrices4Coalesce<<<grid, block>>>(a, b, c, N);
    }
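    //added check (not in the original listing): report any launch error and
    //make sure all kernels have finished before copying results back
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess){
        printf("kernel launch failed: %s\n", cudaGetErrorString(err));
    }
    cudaDeviceSynchronize();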
    //now copy the result matrix back to the cpu
    cudaMemcpy(c_cpu, c, N2*sizeof(float), cudaMemcpyDeviceToHost);
    printf("print out first 10 results, should all be %d (each entry is a dot product of %d ones)\n", N, N);
    for (int i=0; i<10; i++){
        printf("result %i: %f\n", i, c_cpu[i]);
    }
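    //cleanup (added; the original listing returned without releasing memory)
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    free(a_cpu);
    free(b_cpu);
    free(c_cpu);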
    return 0;
}