CUDA矩阵BLAS效率

923723914

浏览: 636835 次

最近访客更多访客>>

rattersnake

u012363178

jiefengwen

wanghaojava

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (1376)

社区版块

存档分类

以前曾听说cublas的效率不是很高,今天写了个小程序对cublas的矩阵乘法速度进行了一个测试,发现结果并非如此.至少就矩阵乘法来说，cublas的效率很不错,相对CPU有非常高的加速比.
测试程序是在sdk例子simpleCUBLAS的基础上修改而成,测试内容是分别用cublas和CPU函数计算两个N阶矩阵A和B的乘积,然后对结果进行校验，并计算各自的flops.(每个矩阵元素加乘看作2flops)
测试所使用的显卡为GTX295,CPU是i7 920,均为默认频率.
显然,矩阵乘法的速度受问题尺度影响很大,因此具体测试了N=256-4096的情况.结果在N=256时,cublas的速度就已经达到了CPU函数的300余倍,而在N=4096时,cublas的速度超过了CPU的2000倍,此时的实测浮点速度达到了约360Gflops.
当然，这里面CPU矩阵乘法的算法过于简单，并没有做什么优化.但是即使经过充分优化，CPU的速度也只能提高1个数量级左右，仍然和cublas差距巨大.

测试具体结果:
N=256,GPU=0.000180s(186.121Gflops),CPU=0.061939s(0.542Gflops)
N=512,GPU=0.000934s(287.515Gflops),CPU=0.498737s(0.538Gflops)
N=1024,GPU=0.006252s(343.471Gflops),CPU=8.553767s(0.251Gflops)
N=2048,GPU=0.048440s(354.666Gflops),CPU=85.726814s(0.200Gflops)
N=3072,GPU=0.161870s(358.201Gflops),CPU=292.890472s(0.198Gflops)
N=4096,GPU=0.383253s(358.612Gflops),CPU=799.222351s(0.172Gflops)

具体的测试程序代码:

#include "cutil_inline.h"
#include "cublas.h"

// Order of the square matrices multiplied by this benchmark (all matrices are N x N).
#define N 1024

// Naive CPU reference: C = A * B for N x N matrices stored column-major,
// i.e. element (r, c) lives at index c*N + r. Intentionally unoptimized;
// it only serves as the baseline and correctness check for the GPU result.
void simple_sgemm(const float *A, const float *B, float *C) {
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            // Column `col` of B and of C start at offset col*N.
            const float *bColumn = B + col * N;
            float acc = 0;
            for (int k = 0; k < N; ++k) {
                acc += A[k * N + row] * bColumn[k];
            }
            C[col * N + row] = acc;
        }
    }
}

// Prints a diagnostic and terminates the benchmark if a CUBLAS call failed.
static void check_cublas(cublasStatus status, const char *what) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "CUBLAS error %u in %s\n", (unsigned int)status, what);
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: multiplies two random N x N matrices once with
// cublasSgemm and once with the CPU reference simple_sgemm, prints the
// elapsed time and Gflops of each (2 flops per multiply-add), then checks
// the relative L2 error of the GPU result against the CPU result.
// Returns 0 on completion; exits with EXIT_FAILURE if an allocation or a
// CUBLAS/CUDA call fails.
int main() {
    const size_t bytes = (size_t)N * N * sizeof(float);
    float *h_A = (float*)malloc(bytes);
    float *h_B = (float*)malloc(bytes);
    float *h_C = (float*)malloc(bytes);
    float *h_C_ref = (float*)malloc(bytes);
    if (!h_A || !h_B || !h_C || !h_C_ref) {
        fprintf(stderr, "Host allocation of %d x %d matrices failed\n", N, N);
        free(h_A);
        free(h_B);
        free(h_C);
        free(h_C_ref);
        return EXIT_FAILURE;
    }

    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    unsigned int timer1 = 0;
    cutCreateTimer(&timer1);
    cutStartTimer(timer1);
    printf("simpleCUBLAS test running..\n");
    check_cublas(cublasInit(), "cublasInit");

    for (int i = 0; i < N*N; i++) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_A), "cublasAlloc(d_A)");
    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_B), "cublasAlloc(d_B)");
    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_C), "cublasAlloc(d_C)");
    check_cublas(cublasSetVector(N*N, sizeof(float), h_A, 1, d_A, 1), "cublasSetVector(h_A)");
    check_cublas(cublasSetVector(N*N, sizeof(float), h_B, 1, d_B, 1), "cublasSetVector(h_B)");

    // Drain pending GPU work so the timed interval covers only the SGEMM.
    cudaThreadSynchronize();
    float t0 = cutGetTimerValue(timer1);
    cublasSgemm('n', 'n', N, N, N, 1.0f, d_A, N, d_B, N, 0.0f, d_C, N);
    // Synchronize so the interval ends only when the GPU work has completed.
    cudaError_t sync_err = cudaThreadSynchronize();
    float gpu_t = (cutGetTimerValue(timer1) - t0)/1000.0f;
    // cublasSgemm returns void in this API; its status is read separately.
    check_cublas(cublasGetError(), "cublasSgemm");
    if (sync_err != cudaSuccess) {
        fprintf(stderr, "GPU execution of cublasSgemm failed\n");
        exit(EXIT_FAILURE);
    }
    check_cublas(cublasGetVector(N*N, sizeof(float), d_C, 1, h_C, 1), "cublasGetVector(d_C)");

    t0 = cutGetTimerValue(timer1);
    simple_sgemm(h_A, h_B, h_C_ref);
    float cpu_t = (cutGetTimerValue(timer1) - t0)/1000.0f;

    printf("N=%4d, GPU=%.6fs(%.3fGflops), CPU=%.6fs(%.3fGflops)\n", 
        N, gpu_t, 1e-9*N*N*N*2/gpu_t, cpu_t, 1e-9*N*N*N*2/cpu_t);

    // Accumulate the norms in double: summing N*N squared floats in single
    // precision loses most of the small contributions once the sum is large.
    double error_norm = 0, ref_norm = 0;
    for (int i = 0; i < N*N; i++) {
        float diff = h_C_ref[i] - h_C[i];
        error_norm += (double)diff*diff;
        ref_norm += (double)h_C_ref[i]*h_C_ref[i];
    }
    printf("Test %s\n", (sqrt(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");

    // Release device and host resources.
    cublasFree(d_A);
    cublasFree(d_B);
    cublasFree(d_C);
    cublasShutdown();
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    return 0;
}

评价cublas的效率不是和cpu比,该是看是否是gpu上的最高...呵呵.

我想，cublas的以上测试成绩在gpu程序中也算得上高的了。从峰值性能来说，GTX295的单GPU峰值浮点性能有两种说法,894GFlops和596GFlops，分别对应于1.242*240*3和1.242*240*2，个人感觉后者更现实一些。按此计算，cublas 358Gflops的实测值达到了峰值浮点性能的60%，就GPU而言，这应该是个不错的成绩。
当然，比cublas效率更高的程序也并非不存在，但一般人自己写应该是困难的。今天测了一下NVIDIA CUDA Programming Guide 2.2.1 (p23-p25)中使用shared memory的矩阵乘法程序，只能达到cublas测试成绩的1/10-1/2左右。具体结果如下：

N=256,dt=0.001697s(19.769Gflops)
N=512,dt=0.003801s(70.615Gflops)
N=1024,dt=0.015581s(137.830Gflops)
N=2048,dt=0.101421s(169.392Gflops)
N=3072,dt=0.334208s(173.491Gflops)
N=4096,dt=0.895309s(153.510Gflops)

网上见过一些文章说cublas效率比自己写的程序还慢，但无法进行验证，如果有人能提供相关测试程序再好不过了。

附今天的测试程序:

#include "cutil_inline.h"

// Row-major matrix descriptor: element (row, col) is elements[row * stride + col].
// The same struct is used for host matrices and for device copies passed by
// value to the kernel; only the memory `elements` points to differs.
typedef struct {
	int width;	// number of columns
	int height;	// number of rows
	int stride;	// distance, in elements, between the starts of consecutive rows
	float* elements;	// pointer to element (0, 0)
} Matrix;

// Edge length of the square tile computed by one thread block; also the
// dimensions of the shared-memory tiles and of the thread block itself.
#define BLOCK_SIZE	16
// Order of the square test matrices (width, height and stride are all set to N by init()).
#define N			3072

// Reads element (row, col) of the row-major matrix A.
__device__ float GetElement(const Matrix A, int row, int col) {
	const int offset = row * A.stride + col;
	return A.elements[offset];
}

// Writes `value` to element (row, col) of the row-major matrix A.
// A is passed by value, but the write goes through A.elements and therefore
// lands in the storage the caller's matrix points to.
__device__ void SetElement(Matrix A, int row, int col, float value) {
	float *target = A.elements + (row * A.stride + col);
	*target = value;
}

// Returns a BLOCK_SIZE x BLOCK_SIZE view into A whose upper-left corner is
// the tile at tile-row `row`, tile-column `col`. The view shares A's storage
// and keeps A's stride, so no data is copied.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) {
	const int firstElement = A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col;
	Matrix tile;
	tile.elements = A.elements + firstElement;
	tile.stride = A.stride;
	tile.height = BLOCK_SIZE;
	tile.width = BLOCK_SIZE;
	return tile;
}

// Tiled matrix multiplication C = A * B using shared memory.
//
// Launch layout: blockDim = (BLOCK_SIZE, BLOCK_SIZE); block (x, y) produces
// the tile of C at tile-row y, tile-column x, and each thread produces one
// element of that tile.
// Shared memory: two static BLOCK_SIZE x BLOCK_SIZE float tiles.
// Preconditions: there are no bounds checks and the tile count is computed
// with integer division, so the matrix dimensions must be multiples of
// BLOCK_SIZE.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	const int tileRow = blockIdx.y;
	const int tileCol = blockIdx.x;
	const int ty = threadIdx.y;
	const int tx = threadIdx.x;
	const int tileCount = A.width / BLOCK_SIZE;

	float acc = 0;
	for (int t = 0; t < tileCount; ++t) {
		// Each thread stages one element of the current A tile and B tile.
		As[ty][tx] = GetElement(GetSubMatrix(A, tileRow, t), ty, tx);
		Bs[ty][tx] = GetElement(GetSubMatrix(B, t, tileCol), ty, tx);
		// Both tiles must be fully loaded before any thread reads them.
		__syncthreads();
		for (int e = 0; e < BLOCK_SIZE; ++e) {
			acc += As[ty][e] * Bs[e][tx];
		}
		// All reads must finish before the next iteration overwrites the tiles.
		__syncthreads();
	}

	SetElement(GetSubMatrix(C, tileRow, tileCol), ty, tx, acc);
}

// Prints a diagnostic and terminates if a CUDA runtime call failed.
static void matMulCheck(cudaError_t err, const char *what) {
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

// Host wrapper for MatMulKernel: computes C = A * B on the GPU.
//
// A, B and C are host matrices whose rows are stored contiguously
// (the device copies use stride == width). Each call allocates device
// buffers, uploads A and B, launches the kernel, downloads C and frees the
// buffers again. The grid is sized with integer division, so B.width and
// A.height must be multiples of BLOCK_SIZE.
// Terminates the process with a message if any CUDA call fails.
void MatMul(const Matrix A, const Matrix B, Matrix C) {
	// Widen before multiplying so large matrices cannot overflow int.
	Matrix d_A;
	d_A.width = d_A.stride = A.width; d_A.height = A.height;
	size_t size = (size_t)A.width * A.height * sizeof(float);
	matMulCheck(cudaMalloc((void**)&d_A.elements, size), "cudaMalloc(A)");
	matMulCheck(cudaMemcpy(d_A.elements, A.elements, size,
		cudaMemcpyHostToDevice), "cudaMemcpy(A, host to device)");

	Matrix d_B;
	d_B.width = d_B.stride = B.width; d_B.height = B.height;
	size = (size_t)B.width * B.height * sizeof(float);
	matMulCheck(cudaMalloc((void**)&d_B.elements, size), "cudaMalloc(B)");
	matMulCheck(cudaMemcpy(d_B.elements, B.elements, size,
		cudaMemcpyHostToDevice), "cudaMemcpy(B, host to device)");

	Matrix d_C;
	d_C.width = d_C.stride = C.width; d_C.height = C.height;
	size = (size_t)C.width * C.height * sizeof(float);
	matMulCheck(cudaMalloc((void**)&d_C.elements, size), "cudaMalloc(C)");

	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
	dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
	MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
	// A kernel launch returns no status; a bad configuration is reported here.
	matMulCheck(cudaGetLastError(), "MatMulKernel launch");

	// The blocking copy waits for the kernel and reports execution errors.
	matMulCheck(cudaMemcpy(C.elements, d_C.elements, size,
		cudaMemcpyDeviceToHost), "cudaMemcpy(C, device to host)");

	matMulCheck(cudaFree(d_A.elements), "cudaFree(A)");
	matMulCheck(cudaFree(d_B.elements), "cudaFree(B)");
	matMulCheck(cudaFree(d_C.elements), "cudaFree(C)");
}

// Initializes *A as an N x N host matrix (stride N) and fills it with
// elements[i] = i * 0.1f. Terminates the process if the allocation fails.
// The caller owns A->elements and releases it with free().
void init(Matrix *A) {
	A->width = A->height = A->stride = N;
	// Every element is overwritten below, so zero-filled memory is not needed.
	A->elements = (float*) malloc((size_t)N * N * sizeof(float));
	if (A->elements == NULL) {
		fprintf(stderr, "init: cannot allocate a %d x %d float matrix\n", N, N);
		exit(EXIT_FAILURE);
	}
	for (int i = 0; i < N*N; i++) A->elements[i] = i*0.1f;
}

// Naive CPU reference: C = A * B for N x N matrices stored row-major
// (element (r, c) at index r*N + c), the same layout MatMulKernel uses.
//
// The previous column-major indexing computed B * A when its buffers are
// read row-major; validation only passed because init() fills A and B with
// identical data. For identical A and B this version produces bit-identical
// results (same products, same summation order).
void simple_sgemm(const float *A, const float *B, float *C) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            float s = 0;
            for (int k = 0; k < N; k++) s += A[row*N+k]*B[k*N+col];
            C[row*N+col] = s;
        }
    }
}

// Benchmark driver for the shared-memory kernel: runs MatMul 100 times,
// prints the average time per call and the resulting Gflops, then checks the
// GPU result element by element against the CPU reference.
//
// Note: the timed region is the whole MatMul call, which also allocates and
// frees the device buffers and copies the matrices in both directions, so
// the reported figure is end-to-end rather than kernel-only.
int main() {
	Matrix A, B, C, C_ref;
	init(&A);
	init(&B);
	init(&C);
	init(&C_ref);

	unsigned int t = 0;
	cutCreateTimer(&t);
	cutStartTimer(t);

	const int runs = 100;
	double t0 = cutGetTimerValue(t);
	for (int ii = 0; ii < runs; ii++) MatMul(A, B, C);
	double dt = (cutGetTimerValue(t) - t0)/1000.0/runs;
	printf("N=%d, dt=%fs(%.3fGflops)\n", N, dt, (1e-9*N*N*N*2)/dt);

	simple_sgemm(A.elements, B.elements, C_ref.elements);
	for (int i = 0; i < N*N; i++) {
		float err = fabs(C_ref.elements[i] - C.elements[i]);
		// Fail only if both the absolute and the relative tolerance are exceeded.
		if (err > 1e-5 && err > C.elements[i]*1e-5) printf("ERR!\n"), exit(-1);
	}
	printf("GOOD!\n");

	// Release the host buffers allocated by init().
	free(A.elements);
	free(B.elements);
	free(C.elements);
	free(C_ref.elements);
	return 0;
}

用百度搜了下，找到一篇论文也是讨论cublas的矩阵乘法效率的。这篇文章中使用的是8800GT的显卡，矩阵乘法部分测试了两个尺寸为(2560x1536)和(1536x4096)的矩阵相乘，得出的结论是cublas和自己编程的实测性能分别为530GFlops和444GFlops。不过这个数值似乎有些偏大，根据文章中给出的程序用时303ms和435ms,按照我的理解应该是106Gflops和74Gflops，正好和文章给出的数值差5倍和6倍，也许文章是把1次矩阵元素相乘计作10flops和12flops了，我觉得还是计作2flops比较合理。
这篇论文的地址是:www.ecice06.com/qikan/manage/wenzhang/091001.pdf。
再补充一个双精度的测试结果，只需把测试程序中的float和cublasSgemm改成double和cublasDgemm即可。双精度cublas矩阵乘法的性能基本上是单精度的1/3-1/5。

N=256,GPU=0.000603s(55.677Gflops)
N=512,GPU=0.004517s(59.428Gflops)
N=1024,GPU=0.030556s(70.280Gflops)
N=2048,GPU=0.239756s(71.656Gflops)
N=3072,GPU=0.802976s(72.209Gflops)
N=4096,GPU=1.901021s(72.297Gflops)

近日查阅了一些资料，证实了矩阵乘法效率不佳的只是cublas1.x，而cublas2.x已是顶级性能。此外，有报道称经充分优化后目前4核的CPU矩阵乘法也能达到几十GFlops的实测性能，如属实，CUDA的加速比就不是那么强大了。

GTX295单个GPU双精度的理论峰值性能是1.242*30*2=74.52Gflops，cublas在4096x4096时的实测值72.297Gflops已经很接近峰值了，绝对效率达到了97%。

单精度的实测值离峰值还有一些距离,据说主要原因是使用shared memory时算术指令的速度会下降。此外，应该还有一个原因就是目前cublas中的代码是针对g80优化的，并没有考虑g200的dual-issue特性，如果能充分利用这一点也许效率还会有提升。

就矩阵乘法而言，目前CPU和GPU的差距并不大。CPU上的高性能矩阵相乘非常复杂，每个CPU厂商都有相应的数学库，例如Intel MKL的BLAS，其峰值速度接近于最好的GPU结果。
目前显卡的性能提升明显慢于CPU，再考虑到将矩阵传送到显卡的时间。就矩阵相乘而言，我更看好多核CPU+大容量内存。
Nvidia自己做些例子用来吹牛就算了，包括一些研究人员在性能对比测试上，往往采用一些未经优化的代码，以突出GPU的优势，相当不严谨。

的确,就双精度矩阵乘法而言,目前CPU和GPU的差距不大,四核xeon利用imkl(Intel MKL)最高可以达到70GFlops左右,和GTX295的一个GPU实测性能相当.

不过单精度差距还是比较大的,目前CUDA的最好实测成绩是510GFlops/GPU左右(GTX285),而一块GTX295的两个GPU加起来可以达到800GFlops以上,比四核xeon的理论峰值还高一个数量级.

但目前GPU双精度差主要是构架原因,如果CPU的性能还是保持过去发展速度的话，等fermi出来后CPU和GPU的双精度差距可能也会大幅拉大.

关于研究人员的对比数据,的确存在CPU代码不够优化的例子.但是其中至少有部分并不是研究人员不想优化,而是他们尽量对CPU和GPU代码进行了优化，最后只能达到这样的水平(CPU和GPU都并非最优化,但CPU代码更差)。我想也许是CPU厂商在CPU优化方面宣传力度不够，很多科研人员不了解如何优化吧。(以矩阵乘法为例,简单使用sse+openmp很难接近imkl的成绩,有数量级的差距)

blas在一开始设计时就考虑到了这个问题，只需修改第1和第2个参数即可满足不同的row-major/column-major存储方式。用'T','T'参数应该就可满足你的要求。修改后的代码如下：

#include "cutil_inline.h"
#include "cublas.h"

// Order of the square test matrices; also fixes the size of the file-scope host arrays.
#define N 512

// CPU reference for the 'T','T' cuBLAS call: forms the product of the
// row-major matrices A and B and stores it transposed, i.e.
// C[j][i] = sum over k of A[i][k] * B[k][j].
void simple_sgemm(const float A[N][N], const float B[N][N], float C[N][N]) {
    for (int col = 0; col < N; ++col) {
        float *outRow = C[col];
        for (int row = 0; row < N; ++row) {
            const float *aRow = A[row];
            float dot = 0;
            for (int k = 0; k < N; ++k) {
                dot += aRow[k] * B[k][col];
            }
            outRow[row] = dot;
        }
    }
}

// Host matrices as 2-D arrays (C row-major layout): inputs h_A and h_B, the
// GPU result h_C and the CPU reference h_C_ref. Declared at file scope, so
// the four N x N arrays have static storage rather than living in main's frame.
float h_A[N][N], h_B[N][N], h_C[N][N], h_C_ref[N][N];

// Prints a diagnostic and terminates the benchmark if a CUBLAS call failed.
static void check_cublas(cublasStatus status, const char *what) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "CUBLAS error %u in %s\n", (unsigned int)status, what);
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver for row-major host data using cublasSgemm('T','T'):
// uploads the row-major arrays unchanged, lets cuBLAS transpose both
// operands, and compares against simple_sgemm, which stores its result
// transposed to match the column-major output of cuBLAS.
// Returns 0 on completion; exits with EXIT_FAILURE if a CUBLAS/CUDA call fails.
int main() {
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    unsigned int timer1 = 0;
    cutCreateTimer(&timer1);
    cutStartTimer(timer1);
    printf("simpleCUBLAS test running..\n");
    check_cublas(cublasInit(), "cublasInit");

    for (int j = 0; j < N; j++) {
        for (int i = 0; i < N; i++) {
            h_A[j][i] = rand()/(float)RAND_MAX;
            h_B[j][i] = rand()/(float)RAND_MAX;
        }
    }

    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_A), "cublasAlloc(d_A)");
    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_B), "cublasAlloc(d_B)");
    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_C), "cublasAlloc(d_C)");
    check_cublas(cublasSetVector(N*N, sizeof(float), h_A, 1, d_A, 1), "cublasSetVector(h_A)");
    check_cublas(cublasSetVector(N*N, sizeof(float), h_B, 1, d_B, 1), "cublasSetVector(h_B)");

    // Drain pending GPU work so the timed interval covers only the SGEMM.
    cudaThreadSynchronize();
    float t0 = cutGetTimerValue(timer1);
    cublasSgemm('T', 'T', N, N, N, 1.0f, d_A, N, d_B, N, 0.0f, d_C, N);
    // Synchronize so the interval ends only when the GPU work has completed.
    cudaError_t sync_err = cudaThreadSynchronize();
    float gpu_t = (cutGetTimerValue(timer1) - t0)/1000.0f;
    // cublasSgemm returns void in this API; its status is read separately.
    check_cublas(cublasGetError(), "cublasSgemm");
    if (sync_err != cudaSuccess) {
        fprintf(stderr, "GPU execution of cublasSgemm failed\n");
        exit(EXIT_FAILURE);
    }
    check_cublas(cublasGetVector(N*N, sizeof(float), d_C, 1, h_C, 1), "cublasGetVector(d_C)");

    t0 = cutGetTimerValue(timer1);
    simple_sgemm(h_A, h_B, h_C_ref);
    float cpu_t = (cutGetTimerValue(timer1) - t0)/1000.0f;

    printf("N=%4d, GPU=%.6fs(%.3fGflops), CPU=%.6fs(%.3fGflops)\n", 
        N, gpu_t, 1e-9*N*N*N*2/gpu_t, cpu_t, 1e-9*N*N*N*2/cpu_t);

    // Accumulate the norms in double: summing N*N squared floats in single
    // precision loses most of the small contributions once the sum is large.
    double error_norm = 0, ref_norm = 0;
    for (int j = 0; j < N; j++) {
        for (int i = 0; i < N; i++) {
            float diff = h_C_ref[j][i] - h_C[j][i];
            error_norm += (double)diff*diff;
            ref_norm += (double)h_C_ref[j][i]*h_C_ref[j][i];
        }
    }
    printf("Test %s\n", (sqrt(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");

    // Release device resources.
    cublasFree(d_A);
    cublasFree(d_B);
    cublasFree(d_C);
    cublasShutdown();
    return 0;
}

抱歉，我上面的方法可能还不大好，因为其中的simple_sgemm中实际上还是隐含着进行了一次转置。
实际上，行优先(row-major)和列优先(column-major)两种存储方式之间正好相差一次矩阵转置。一个更简单的办法是利用以下矩阵转置公式，在调用cublas时用'n','n'参数同时交换A,B的顺序，这样得到的结果就不需要转置了.

C=A*B->C'=B'*A'(这里'代表转置)

代码如下：

#include "cutil_inline.h"
#include "cublas.h"

// Order of the square test matrices; also fixes the size of the file-scope host arrays.
#define N 512

// CPU reference: plain row-major product C = A * B, i.e.
// C[i][j] = sum over k of A[i][k] * B[k][j]. Intentionally unoptimized.
void simple_sgemm(const float A[N][N], const float B[N][N], float C[N][N]) {
    for (int row = 0; row < N; ++row) {
        const float *aRow = A[row];
        float *outRow = C[row];
        for (int col = 0; col < N; ++col) {
            float dot = 0;
            for (int k = 0; k < N; ++k) {
                dot += aRow[k] * B[k][col];
            }
            outRow[col] = dot;
        }
    }
}

// Host matrices as 2-D arrays (C row-major layout): inputs h_A and h_B, the
// GPU result h_C and the CPU reference h_C_ref. Declared at file scope, so
// the four N x N arrays have static storage rather than living in main's frame.
float h_A[N][N], h_B[N][N], h_C[N][N], h_C_ref[N][N];

// Prints a diagnostic and terminates the benchmark if a CUBLAS call failed.
static void check_cublas(cublasStatus status, const char *what) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "CUBLAS error %u in %s\n", (unsigned int)status, what);
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver for row-major host data without any transposition:
// a row-major matrix read as column-major is its transpose, and
// (A*B)' = B' * A', so calling cublasSgemm('n','n') with the operands
// swapped (B first, then A) leaves the row-major product A*B in h_C.
// Returns 0 on completion; exits with EXIT_FAILURE if a CUBLAS/CUDA call fails.
int main() {
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    unsigned int timer1 = 0;
    cutCreateTimer(&timer1);
    cutStartTimer(timer1);
    printf("simpleCUBLAS test running..\n");
    check_cublas(cublasInit(), "cublasInit");

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            h_A[i][j] = rand()/(float)RAND_MAX;
            h_B[i][j] = rand()/(float)RAND_MAX;
        }
    }

    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_A), "cublasAlloc(d_A)");
    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_B), "cublasAlloc(d_B)");
    check_cublas(cublasAlloc(N*N, sizeof(float), (void**)&d_C), "cublasAlloc(d_C)");
    check_cublas(cublasSetVector(N*N, sizeof(float), h_A, 1, d_A, 1), "cublasSetVector(h_A)");
    check_cublas(cublasSetVector(N*N, sizeof(float), h_B, 1, d_B, 1), "cublasSetVector(h_B)");

    // Drain pending GPU work so the timed interval covers only the SGEMM.
    cudaThreadSynchronize();
    float t0 = cutGetTimerValue(timer1);
    // Operands are deliberately swapped: d_B first, then d_A (see header comment).
    cublasSgemm('n', 'n', N, N, N, 1.0f, d_B, N, d_A, N, 0.0f, d_C, N);
    // Synchronize so the interval ends only when the GPU work has completed.
    cudaError_t sync_err = cudaThreadSynchronize();
    float gpu_t = (cutGetTimerValue(timer1) - t0)/1000.0f;
    // cublasSgemm returns void in this API; its status is read separately.
    check_cublas(cublasGetError(), "cublasSgemm");
    if (sync_err != cudaSuccess) {
        fprintf(stderr, "GPU execution of cublasSgemm failed\n");
        exit(EXIT_FAILURE);
    }
    check_cublas(cublasGetVector(N*N, sizeof(float), d_C, 1, h_C, 1), "cublasGetVector(d_C)");

    t0 = cutGetTimerValue(timer1);
    simple_sgemm(h_A, h_B, h_C_ref);
    float cpu_t = (cutGetTimerValue(timer1) - t0)/1000.0f;

    printf("N=%4d, GPU=%.6fs(%.3fGflops), CPU=%.6fs(%.3fGflops)\n", 
        N, gpu_t, 1e-9*N*N*N*2/gpu_t, cpu_t, 1e-9*N*N*N*2/cpu_t);

    // Accumulate the norms in double: summing N*N squared floats in single
    // precision loses most of the small contributions once the sum is large.
    double error_norm = 0, ref_norm = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float diff = h_C_ref[i][j] - h_C[i][j];
            error_norm += (double)diff*diff;
            ref_norm += (double)h_C_ref[i][j]*h_C_ref[i][j];
        }
    }
    printf("Test %s\n", (sqrt(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");

    // Release device resources.
    cublasFree(d_A);
    cublasFree(d_B);
    cublasFree(d_C);
    cublasShutdown();
    return 0;
}

http://bbs.csdn.net/topics/330105294

分享到：