-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmatmulf8.cu
46 lines (40 loc) · 1.62 KB
/
matmulf8.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// Copyright (C) 2024 Chunqing Shan
//
// float8_matmul is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// float8_matmul is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with float8_matmul. If not, see <http://www.gnu.org/licenses/>.
#include <cstdio>
#include <cuda_runtime.h>
#include "matmulf8_kernel.cuh"
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool matmulf8CudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "matmul: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Copies A and B to the device, launches the matmulf8 kernel once, copies the
// result back into C and reports how long the kernel took.
//
// Parameters:
//   A, B  host input buffers; n*m and m*p bytes are read from them.
//   C     host output buffer; n*p bytes are written to it.
//   n, m, p  dimensions forwarded unchanged to the kernel.
//            NOTE(review): presumably A is n x m, B is m x p and C is n x p,
//            with four 8-bit floats packed into each int -- confirm against
//            matmulf8_kernel.cuh.
//
// Returns the elapsed time in milliseconds between the two events recorded
// around the kernel launch (host<->device copies are not included), or a
// negative value if a dimension is negative or any CUDA call fails. On
// failure a diagnostic is printed to stderr and C must not be relied upon.
float matmul(int* A, int* B, int* C, int n, int m, int p) {
    if (n < 0 || m < 0 || p < 0) {
        fprintf(stderr, "matmul: negative dimension (n=%d, m=%d, p=%d)\n", n, m, p);
        return -1.0f;
    }

    // Byte counts are computed in size_t so that n*m (etc.) cannot overflow int.
    const size_t bytesA = (size_t)n * (size_t)m;
    const size_t bytesB = (size_t)m * (size_t)p;
    const size_t bytesC = (size_t)n * (size_t)p;

    // Device buffers are rounded up to whole ints so an allocation is never
    // smaller than the number of bytes copied into or out of it.
    const size_t word = sizeof(int);
    const size_t allocA = (bytesA + word - 1) / word * word;
    const size_t allocB = (bytesB + word - 1) / word * word;
    const size_t allocC = (bytesC + word - 1) / word * word;

    int* d_A = nullptr;
    int* d_B = nullptr;
    int* d_C = nullptr;
    cudaEvent_t start = nullptr;
    cudaEvent_t stop = nullptr;
    float t = -1.0f;  // stays negative unless every step succeeds

    // && short-circuits, so the first failing call stops the sequence and
    // control falls through to the cleanup code below.
    bool ok = matmulf8CudaOk(cudaMalloc(&d_A, allocA), "cudaMalloc(d_A)")
           && matmulf8CudaOk(cudaMalloc(&d_B, allocB), "cudaMalloc(d_B)")
           && matmulf8CudaOk(cudaMalloc(&d_C, allocC), "cudaMalloc(d_C)")
           && matmulf8CudaOk(cudaMemcpy(d_A, A, bytesA, cudaMemcpyHostToDevice), "cudaMemcpy(A)")
           && matmulf8CudaOk(cudaMemcpy(d_B, B, bytesB, cudaMemcpyHostToDevice), "cudaMemcpy(B)")
           && matmulf8CudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")
           && matmulf8CudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")
           && matmulf8CudaOk(cudaEventRecord(start), "cudaEventRecord(start)");

    if (ok) {
        // Each kernel thread produces 4 f8 results (original author's note).
        // The grid uses truncating division, so only floor(n/32) x floor(p/32)
        // blocks of 32x8 threads are launched.
        // NOTE(review): whether n and p must be multiples of 32 depends on the
        // kernel, which is not visible here -- confirm before using other sizes.
        matmulf8<<<dim3(n / 32, p / 32), dim3(32, 8)>>>(d_A, d_B, d_C, n, m, p);

        float elapsedMs = 0.0f;
        ok = matmulf8CudaOk(cudaGetLastError(), "matmulf8 launch")
          && matmulf8CudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)")
          && matmulf8CudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
          && matmulf8CudaOk(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime")
          && matmulf8CudaOk(cudaMemcpy(C, d_C, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy(C)");
        if (ok) {
            t = elapsedMs;
        }
    }

    // Cleanup runs on every path. Events are destroyed only if they were
    // created; cudaFree accepts a null pointer.
    if (start != nullptr) {
        cudaEventDestroy(start);
    }
    if (stop != nullptr) {
        cudaEventDestroy(stop);
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return t;
}