CUDA编程——使用shared memory优化矩阵乘法

发布时间:2024年01月14日

在上一篇文章:第一个CUDA程序,矩阵相乘代码,我们设计了一种并行的矩阵乘法程序,计算结果和使用CPU计算的一样,但耗时大大减少。然而,这只是最基本的一种方法,事实上我们完全可以让程序变得更快!

仔细看看,会发现我们使用的是global memory,而shared memory的访问速度要远快于global memory,所以我们将使用shared memory优化矩阵乘法,让程序更快!

// Standard and CUDA headers. (Preprocessor directives are line-based; the
// original had several fused onto one line, which does not preprocess.)
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <device_functions.h>  // NOTE(review): deprecated in recent CUDA toolkits; kept for source compatibility

// Edge length of one shared-memory tile; each block runs
// BLOCK_SIZE x BLOCK_SIZE threads, one per output element of the tile.
#define BLOCK_SIZE 8
// Matrix shapes: A is M x N, B is N x K, C = A * B is M x K (row-major).
#define M 6
#define N 8
#define K 6

// Unified (managed) memory: reachable from both host and device without
// explicit cudaMemcpy calls.
__managed__ float a[M * N];      // input matrix A (M x N)
__managed__ float b[N * K];      // input matrix B (N x K)
__managed__ float c_gpu[M * K];  // result computed on the GPU
__managed__ float c_cpu[M * K];  // reference result computed on the CPU


// Tiled matrix multiply on the GPU: C = A * B with A m x n, B n x k,
// C m x k, all row-major. Expects BLOCK_SIZE x BLOCK_SIZE threads per
// block and a grid of ceil(k/BLOCK_SIZE) x ceil(m/BLOCK_SIZE) blocks
// (x covers columns of C, y covers rows).
//
// Fixes vs. the original: uses the m/n/k parameters instead of the global
// M/N/K macros; rounds the tile count up so a trailing partial tile is not
// dropped when n is not a multiple of BLOCK_SIZE; writes each output
// element once, after the accumulation loop, instead of once per tile.
__global__ void gpu_matrix(float* a, float* b, float* c, const int m, const int n, const int k)
{
    __shared__ float sub_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float sub_b[BLOCK_SIZE][BLOCK_SIZE];

    int x = threadIdx.x + blockIdx.x * blockDim.x;  // column of C this thread owns
    int y = threadIdx.y + blockIdx.y * blockDim.y;  // row of C this thread owns
    float temp = 0.0f;                              // running dot product

    // Walk the shared dimension tile by tile; ceil-divide so the final
    // partial tile is processed (n / BLOCK_SIZE would truncate it away).
    int steps = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int step = 0; step < steps; step++)
    {
        // Stage one tile of A into shared memory; loads that fall outside
        // the matrix are zero-padded so they contribute nothing to the sum.
        int a_col = step * BLOCK_SIZE + threadIdx.x;
        sub_a[threadIdx.y][threadIdx.x] =
            (a_col < n && y < m) ? a[y * n + a_col] : 0.0f;

        // Stage one tile of B, likewise zero-padded.
        int b_row = step * BLOCK_SIZE + threadIdx.y;
        sub_b[threadIdx.y][threadIdx.x] =
            (b_row < n && x < k) ? b[b_row * k + x] : 0.0f;

        __syncthreads();  // both tiles fully written before any thread reads

        for (int i = 0; i < BLOCK_SIZE; i++)
            temp += sub_a[threadIdx.y][i] * sub_b[i][threadIdx.x];

        __syncthreads();  // everyone done reading before the next overwrite
    }

    // Single global store per thread (the original stored every iteration).
    if (x < k && y < m)
        c[y * k + x] = temp;
}
// CPU reference implementation: C = A * B with A m x n, B n x k,
// C m x k, all row-major. Classic triple loop.
//
// Fix vs. the original: iterates over the m/n/k parameters instead of the
// global M/N/K macros, so the function is correct for any dimensions the
// caller passes rather than only the hard-coded ones.
void cpu_matrix(float* a, float* b, float* c, const int m, const int n, const int k)
{
    for (int y = 0; y < m; y++)
    {
        for (int x = 0; x < k; x++)
        {
            float temp = 0.0f;  // dot product of row y of A and column x of B
            for (int step = 0; step < n; step++)
                temp += a[y * n + step] * b[step * k + x];
            c[y * k + x] = temp;
        }
    }
}
// Driver: fill A and B with a[y][x] = b[y][x] = x + y, multiply on the GPU
// and on the CPU, and print both inputs and both results for comparison.
int main()
{
    int x, y;
    float item1;

    // Initialize A (M x N).
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < N; x++)
        {
            item1 = x + y;
            a[y * N + x] = item1;
        }
    }

    // Initialize B (N x K).
    for (y = 0; y < N; y++)
    {
        for (x = 0; x < K; x++)
        {
            item1 = x + y;
            b[y * K + x] = item1;
        }
    }

    printf("-----------------两个矩阵-#--------------\n");

    for (y = 0; y < M; y++)
    {
        for (x = 0; x < N; x++)
        {
            printf("%f ", a[y * N + x]);
        }
        printf("\n");
    }

    for (y = 0; y < N; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f  ", b[y * K + x]);
        }
        printf("\n");
    }

    // One block per BLOCK_SIZE x BLOCK_SIZE output tile. dim3's first
    // component is x, which must cover the K columns of C, and the second
    // is y, covering the M rows. The original passed (rows, cols) — it only
    // worked because both happened to equal 1 for these sizes.
    unsigned int grid_rows = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    gpu_matrix<<<dimGrid, dimBlock>>>(a, b, c_gpu, M, N, K);

    // Kernel launches do not return errors directly: launch-configuration
    // errors surface via cudaGetLastError, execution errors at the next sync.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    err = cudaDeviceSynchronize();  // wait for the GPU before reading c_gpu
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    cpu_matrix(a, b, c_cpu, M, N, K);

    // Print the CPU reference result.
    printf("---------------cpu计算结果---------#------------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f  ", c_cpu[y * K + x]);
        }
        printf("\n");
    }

    // Print the GPU result; it should match the CPU output above.
    printf("------------------gpu计算结果------#------------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f  ", c_gpu[y * K + x]);
        }
        printf("\n");
    }

    return 0;
}
-----------------两个矩阵-#--------------0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.0000001.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.0000002.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.0000003.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.0000004.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.0000005.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.0000000.000000  1.000000  2.000000  3.000000  4.000000  5.0000001.000000  2.000000  3.000000  4.000000  5.000000  6.0000002.000000  3.000000  4.000000  5.000000  6.000000  7.0000003.000000  4.000000  5.000000  6.000000  7.000000  8.0000004.000000  5.000000  6.000000  7.000000  8.000000  9.0000005.000000  6.000000  7.000000  8.000000  9.000000  10.0000006.000000  7.000000  8.000000  9.000000  10.000000  11.0000007.000000  8.000000  9.000000  10.000000  11.000000  12.000000---------------cpu计算结果---------#------------------140.000000  168.000000  196.000000  224.000000  252.000000  280.000000168.000000  204.000000  240.000000  276.000000  312.000000  348.000000196.000000  240.000000  284.000000  328.000000  372.000000  416.000000224.000000  276.000000  328.000000  380.000000  432.000000  484.000000252.000000  312.000000  372.000000  432.000000  492.000000  552.000000280.000000  348.000000  416.000000  484.000000  552.000000  620.000000------------------gpu计算结果------#------------------140.000000  168.000000  196.000000  224.000000  252.000000  280.000000168.000000  204.000000  240.000000  276.000000  312.000000  348.000000196.000000  240.000000  284.000000  328.000000  372.000000  416.000000224.000000  276.000000  328.000000  380.000000  432.000000  484.000000252.000000  312.000000  372.000000  432.000000  492.000000  552.000000280.000000  348.000000  416.000000  484.000000  552.000000  620.000000

到了这里,我们能够使得矩阵乘法变得相当快(与仅使用CPU计算相比),这在实际应用中非常重要,尤其是数据计算量非常大的情况。

也许到了这里,这两个程序你并没有完全了解,但,不要担心,先把这些代码运行一下,体会使用GPU计算的魅力,为以后的学习打下基础。

文章来源:https://blog.csdn.net/m0_57569438/article/details/135585462
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。