1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#include <bits/stdc++.h>
using namespace std;
// Dense square matrix multiply: C = A * B, all row-major N x N.
//
// Expects a 2D launch whose total extent (gridDim * blockDim) covers at
// least (N, N) in each dimension; out-of-range threads are guarded.
//
// The x dimension maps to the COLUMN index: threadIdx.x varies fastest
// across a warp, so adjacent lanes read consecutive elements of B and
// write consecutive elements of C — coalesced global-memory accesses
// (the original row=tx mapping produced stride-N, uncoalesced traffic).
// Inputs are const __restrict__ so the compiler may route them through
// the read-only data cache.
__global__ void matmul(const float *__restrict__ A,
                       const float *__restrict__ B,
                       float *__restrict__ C, int N) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // fastest-varying
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < N && col < N) {
        float acc = 0.0f;
        for (int k = 0; k < N; ++k)
            acc += A[N * row + k] * B[N * k + col];
        C[N * row + col] = acc;
    }
}
// Abort with file:line context on any failed CUDA call. Kernel launches do
// not return errors directly, so launch/config errors must be fetched with
// cudaGetLastError() and async execution errors surface at the next sync.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            abort();                                                      \
        }                                                                 \
    } while (0)

// Multiplies two random N x N matrices on the GPU, verifies the result
// against a CPU reference, and prints the maximum absolute error.
int main() {
    const int N = 1 << 8;  // 256: side length of the square matrices
    const size_t bytes = (size_t)N * N * sizeof(float);

    // Unified (managed) memory: one pointer usable from host and device.
    float *A, *B, *C;
    CUDA_CHECK(cudaMallocManaged(&A, bytes));
    CUDA_CHECK(cudaMallocManaged(&B, bytes));
    CUDA_CHECK(cudaMallocManaged(&C, bytes));

    // Fill inputs with values in [0, 1]; zero the output.
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            A[N * i + j] = 1.0f * rand() / RAND_MAX;
            B[N * i + j] = 1.0f * rand() / RAND_MAX;
            C[N * i + j] = 0.0f;
        }
    }

    // Derive the grid from N with a ceil-div instead of hard-coding 16x16
    // blocks, so the launch stays correct if N changes.
    dim3 blockSize(16, 16);
    dim3 numBlocks((N + blockSize.x - 1) / blockSize.x,
                   (N + blockSize.y - 1) / blockSize.y);
    matmul<<<numBlocks, blockSize>>>(A, B, C, N);
    CUDA_CHECK(cudaGetLastError());       // catch bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // wait + surface in-kernel faults

    // CPU reference check; report the worst absolute difference rather
    // than expecting bit-exact float equality across host and device.
    float maxError = 0.0f;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            float ref = 0.0f;
            for (int k = 0; k < N; ++k)
                ref += A[N * i + k] * B[N * k + j];
            maxError = fmax(maxError, fabs(C[N * i + j] - ref));
        }
    }
    std::cout << "Max error: " << maxError << std::endl;

    CUDA_CHECK(cudaFree(A));
    CUDA_CHECK(cudaFree(B));
    CUDA_CHECK(cudaFree(C));
    return 0;
}
|
Comments