-
Notifications
You must be signed in to change notification settings - Fork 377
Open
Description
文中提到,改写 bank.cu 第 15-20 行的代码后,矩阵转置就变成了矩阵复制。但按我的理解,改写后的代码仍然可以实现矩阵转置。我测试了两个转置函数的输出结果,发现两者都实现了转置,函数 2 并不是对矩阵进行复制。
这是我的代码:
转置矩阵1,按照bank.cu中的实现
// Tiled transpose of the N x N row-major matrix A into B (B[r][c] = A[c][r]).
//
// Each thread block stages one 16 x 16 tile of A in shared memory and then
// writes that tile to the mirrored position in B. The tile is indexed directly
// by threadIdx, so the kernel expects blockDim = (16, 16); the grid must cover
// ceil(N / 16) blocks in both x and y.
//
// The tile is filled row-wise (S[ty][tx]) and drained column-wise (S[tx][ty]),
// which lets both the global read and the global write use threadIdx.x as the
// fastest-varying index. The column-wise shared-memory read is the strided
// access pattern that can cause shared-memory bank conflicts.
__global__ void transpose_1(const float* A, float* B, int N)
{
    const int TILE = 16;
    __shared__ float S[TILE][TILE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int tileCol = blockIdx.x * TILE;  // first column of this tile in A
    const int tileRow = blockIdx.y * TILE;  // first row of this tile in A

    // Stage the tile: thread (tx, ty) loads A[tileRow + ty][tileCol + tx].
    const int srcRow = tileRow + ty;
    const int srcCol = tileCol + tx;
    if (srcRow < N && srcCol < N)
    {
        S[ty][tx] = A[srcRow * N + srcCol];
    }

    // Barrier sits outside the branch so every thread of the block reaches it.
    __syncthreads();

    // Write the mirrored tile: the roles of tx and ty are swapped relative to
    // the load, so the element stored is the one loaded by thread (ty, tx).
    const int dstRow = tileCol + ty;
    const int dstCol = tileRow + tx;
    if (dstRow < N && dstCol < N)
    {
        B[dstRow * N + dstCol] = S[tx][ty];
    }
}
转置矩阵2,按照文中所说的进行改写
// Variant of transpose_1 in which the second half reuses the same (x, y)
// roles as the load instead of swapping threadIdx.x and threadIdx.y.
//
// Each thread reads A[y * N + x] into S[ty][tx] and writes that same element
// to B[x * N + y], so the result is still B = A^T, not a copy of A. A plain
// copy would require the write to be B[ny * N + nx].
//
// Expects blockDim = (16, 16) because the tile is indexed directly by
// threadIdx; the grid must cover ceil(N / 16) blocks in both x and y.
// Note that neighbouring threadIdx.x values write addresses N floats apart,
// so the global store is strided rather than contiguous.
__global__ void transpose_2(const float* A, float* B, int N)
{
    __shared__ float S[16][16];
    int bx = blockIdx.x * 16;
    int by = blockIdx.y * 16;

    // Stage the tile: thread (tx, ty) loads A[y][x].
    int x = bx + threadIdx.x;
    int y = by + threadIdx.y;
    if (x < N && y < N)
        S[threadIdx.y][threadIdx.x] = A[y * N + x];

    // Barrier sits outside the branch so every thread of the block reaches it.
    __syncthreads();

    // nx == x and ny == y here: each thread reads back its own tile element
    // and stores it at row nx, column ny of B, i.e. the transposed position.
    int nx = bx + threadIdx.x;
    int ny = by + threadIdx.y;
    if (nx < N && ny < N)
        B[nx * N + ny] = S[threadIdx.y][threadIdx.x];
}
主函数
// Prints a diagnostic when a CUDA runtime call fails.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Runs transpose_1 and transpose_2 on the same random N x N matrix and checks
// that BOTH outputs equal the transpose of the input.
//
// Returns 0 when both kernels produce A^T, 1 on a mismatch or a CUDA error.
int main() {
    const int N = 200;

    // Static storage: three N*N float arrays (~480 KB in total) are too large
    // to place comfortably on the stack. Static arrays are zero-initialized.
    static float h_A[N * N];
    static float h_B_1[N * N];
    static float h_B_2[N * N];

    // Generate a random input matrix.
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            h_A[i * N + j] = (float)rand() / RAND_MAX;

    float* d_A = 0;
    float* d_B = 0;
    dim3 block(16, 16);
    dim3 grid((N + 15) / 16, (N + 15) / 16);

    // Allocate device buffers and copy the input matrix to the device.
    bool ok = cuda_ok(cudaMalloc(&d_A, sizeof(h_A)), "cudaMalloc(d_A)")
        && cuda_ok(cudaMalloc(&d_B, sizeof(h_B_1)), "cudaMalloc(d_B)")
        && cuda_ok(cudaMemcpy(d_A, h_A, sizeof(h_A), cudaMemcpyHostToDevice),
                   "cudaMemcpy(A -> device)");

    // Run version 1. d_B is cleared first so the result can only come from
    // this launch.
    if (ok) {
        ok = cuda_ok(cudaMemset(d_B, 0, sizeof(h_B_1)), "cudaMemset(d_B)");
    }
    if (ok) {
        transpose_1<<<grid, block>>>(d_A, d_B, N);
        // The blocking copy also surfaces any error raised while the kernel ran.
        ok = cuda_ok(cudaGetLastError(), "transpose_1 launch")
            && cuda_ok(cudaMemcpy(h_B_1, d_B, sizeof(h_B_1), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(B_1 -> host)");
    }

    // Run version 2. Clearing d_B again is essential: without it, a failed or
    // no-op second launch would leave transpose_1's output in d_B and the
    // comparison below would pass spuriously.
    if (ok) {
        ok = cuda_ok(cudaMemset(d_B, 0, sizeof(h_B_2)), "cudaMemset(d_B)");
    }
    if (ok) {
        transpose_2<<<grid, block>>>(d_A, d_B, N);
        ok = cuda_ok(cudaGetLastError(), "transpose_2 launch")
            && cuda_ok(cudaMemcpy(h_B_2, d_B, sizeof(h_B_2), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(B_2 -> host)");
    }

    //// Output
    //print_matrix(h_A, N, "Input A");
    //print_matrix(h_B_1, N, "B from transpose_1");
    //print_matrix(h_B_2, N, "B from transpose_2");

    // Check both results against the reference transpose of A. The kernels
    // only move values, so exact float equality is the right comparison.
    // The loop conditions stop at the first mismatch.
    bool passed = ok;
    for (int i = 0; passed && i < N; ++i)
        for (int j = 0; passed && j < N; ++j) {
            const float expected = h_A[j * N + i];
            const float b1 = h_B_1[i * N + j];
            const float b2 = h_B_2[i * N + j];
            if (b1 != expected || b2 != expected) {
                passed = false;
                printf("Error at [%d][%d]: expected A^T = %f, B_1 = %f, B_2 = %f\n",
                       i, j, expected, b1, b2);
            }
        }
    if (passed)
        printf("Test passed!\n");

    // cudaFree accepts a null pointer, so this is safe after a failed allocation.
    cudaFree(d_A);
    cudaFree(d_B);
    return passed ? 0 : 1;
}
两个函数的输出结果是完全一样的,都实现了转置
Metadata
Metadata
Assignees
Labels
No labels
