Skip to content

P89 bank.cu的疑问 #46

@Henry0528

Description

@Henry0528

文中提到,改写 bank.cu 第 15-20 行的代码后,矩阵转置就变成了矩阵复制。但按我的理解,改写后的代码仍然可以实现矩阵转置。我测试了两个转置函数的输出结果,发现两者都实现了转置,函数 2 并不是对矩阵进行复制。
这是我的代码:
转置矩阵1,按照bank.cu中的实现

// Transposes the N x N row-major matrix A into B, staging each 16x16 tile
// in shared memory.
//
// The tile edge (16) is hard-coded, so the kernel assumes a 2D launch with
// 16x16 thread blocks and a grid that covers at least N elements per axis.
// Threads that fall outside the matrix skip both the load and the store.
__global__ void transpose_1(const float* A, float* B, int N)
{
    __shared__ float S[16][16];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Top-left corner of this block's tile in A (column, row).
    const int tileCol = blockIdx.x * 16;
    const int tileRow = blockIdx.y * 16;

    // Load phase: thread (tx, ty) fetches A[row][col] into S[ty][tx].
    const int srcCol = tileCol + tx;
    const int srcRow = tileRow + ty;
    if (srcCol < N && srcRow < N)
        S[ty][tx] = A[srcRow * N + srcCol];

    // Every thread must reach the barrier before any tile element is read.
    __syncthreads();

    // Store phase: the roles of tx and ty are swapped, so consecutive tx
    // values write consecutive addresses of B while the tile is read with
    // its indices exchanged (S[tx][ty]).
    const int dstRow = tileCol + ty;
    const int dstCol = tileRow + tx;
    if (dstRow < N && dstCol < N)
        B[dstRow * N + dstCol] = S[tx][ty];
}

转置矩阵2,按照文中所说的进行改写

// Variant of the tiled transpose that reads the shared tile with the same
// indices it was written with (S[threadIdx.y][threadIdx.x]).
//
// Signature restored to match the call site transpose_2<<<grid, block>>>(d_A, d_B, N).
//
// The tile edge (16) is hard-coded, so the kernel assumes a 2D launch with
// 16x16 thread blocks. Threads outside the N x N matrix do nothing.
//
// Note: this is still a transpose, not a copy. The element loaded from
// A[y][x] is stored to B[x][y] because the output index is nx * N + ny with
// nx == x and ny == y. A plain copy would store to B[ny * N + nx] instead.
// The price is that threads with consecutive threadIdx.x write addresses of B
// that are N floats apart.
__global__ void transpose_2(const float* A, float* B, int N)
{
    __shared__ float S[16][16];

    int bx = blockIdx.x * 16;
    int by = blockIdx.y * 16;

    // (x, y) = (column, row) of the element of A handled by this thread.
    int x = bx + threadIdx.x;
    int y = by + threadIdx.y;

    if (x < N && y < N)
        S[threadIdx.y][threadIdx.x] = A[y * N + x];

    // Block-wide barrier between the shared-memory write and read.
    __syncthreads();

    // Same coordinates as the load: nx == x, ny == y.
    int nx = bx + threadIdx.x;
    int ny = by + threadIdx.y;

    // Row index of B is the column index of A, hence B = A^T.
    if (nx < N && ny < N)
        B[nx * N + ny] = S[threadIdx.y][threadIdx.x];
}

主函数

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    printf("CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Checks that B (N x N, row-major) is the transpose of A (N x N, row-major).
// Reports the first mismatching element, labelled with `name`, and returns
// false; returns true when every element satisfies B[j][i] == A[i][j].
// Exact float comparison is intentional: the kernels only move values.
static bool is_transpose_of(const float* A, const float* B, int N, const char* name)
{
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            if (B[j * N + i] != A[i * N + j]) {
                printf("Error: %s[%d][%d] = %f, expected A[%d][%d] = %f\n",
                       name, j, i, B[j * N + i], i, j, A[i * N + j]);
                return false;
            }
    return true;
}

// Runs transpose_1 and transpose_2 on the same random N x N matrix and
// verifies each result against the transpose of the host input.
// Returns 0 when both kernels produce the transpose, 1 otherwise.
int main() {
    const int N = 200;

    // static storage keeps ~480 KB of host buffers off the stack.
    static float h_A[N * N];
    static float h_B_1[N * N];
    static float h_B_2[N * N];

    // Fill the input with random values.
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            h_A[i * N + j] = (float)rand() / RAND_MAX;

    float* d_A = nullptr;
    float* d_B = nullptr;

    bool ok = cuda_ok(cudaMalloc(&d_A, sizeof(h_A)), "cudaMalloc(d_A)")
           && cuda_ok(cudaMalloc(&d_B, sizeof(h_B_1)), "cudaMalloc(d_B)")
           // Copy input matrix to device
           && cuda_ok(cudaMemcpy(d_A, h_A, sizeof(h_A), cudaMemcpyHostToDevice),
                      "cudaMemcpy(d_A <- h_A)");

    dim3 block(16, 16);
    dim3 grid((N + 15) / 16, (N + 15) / 16);

    // Run version 1
    if (ok) {
        ok = cuda_ok(cudaMemset(d_B, 0, sizeof(h_B_1)), "cudaMemset(d_B) before transpose_1");
    }
    if (ok) {
        transpose_1<<<grid, block>>>(d_A, d_B, N);
        // cudaGetLastError catches launch failures; the blocking copy
        // surfaces errors raised while the kernel was executing.
        ok = cuda_ok(cudaGetLastError(), "transpose_1 launch")
          && cuda_ok(cudaMemcpy(h_B_1, d_B, sizeof(h_B_1), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(h_B_1 <- d_B)");
    }

    // Run version 2
    if (ok) {
        // d_B is shared by both runs: clear it so leftover transpose_1
        // output cannot make transpose_2 appear correct.
        ok = cuda_ok(cudaMemset(d_B, 0, sizeof(h_B_2)), "cudaMemset(d_B) before transpose_2");
    }
    if (ok) {
        transpose_2<<<grid, block>>>(d_A, d_B, N);
        ok = cuda_ok(cudaGetLastError(), "transpose_2 launch")
          && cuda_ok(cudaMemcpy(h_B_2, d_B, sizeof(h_B_2), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(h_B_2 <- d_B)");
    }

    // Check results: each output is compared with the host-side transpose
    // of A, which also implies the two outputs agree with each other.
    bool passed = ok;
    if (ok) {
        const bool first_ok = is_transpose_of(h_A, h_B_1, N, "B_1");
        const bool second_ok = is_transpose_of(h_A, h_B_2, N, "B_2");
        passed = first_ok && second_ok;
    }
    if (passed)
        printf("Test passed!\n");

    // cudaFree accepts a null pointer, so this is safe after a failed cudaMalloc.
    cudaFree(d_A);
    cudaFree(d_B);
    return passed ? 0 : 1;
}

两个函数的输出结果是完全一样的,都实现了转置

Image

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions