Stan Tomov wrote: The use is as in testing_dsytrd.cpp
I could not find this file in the testing directory.
However, I ran some tests on the dsytrd subroutine using the following code:
- Code: Select all
* @the testing_dgetrf source code has been used as a template.
*
**/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
// includes, project
#include "flops.h"
#include "magma.h"
#include "testings.h"
// Flops formula
#define PRECISION_d
#define FLOPS(n) ( FMULS_SYTRD(n) + FADDS_SYTRD(n) )
/* ////////////////////////////////////////////////////////////////////////////
-- Testing dsytrd
*/
int main( int argc, char** argv)
{
TESTING_CUDA_INIT();
TimeStruct start, end;
double flops, gpu_perf, cpu_perf, error;
double *h_A, *h_R;
double *h_DA, *h_EA, *h_DR, *h_ER;
double *h_TAUA, *h_TAUR;
double *h_WORKA, *h_WORKR;
double etime_cpu, etime_gpu;
cublasStatus status;
double *d_work;
int LWORKA, LWORKR;
char uplo='L';
/* Matrix size */
magma_int_t N = 0, n2, lda;
magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};
magma_int_t i, info, min_mn, nb;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
if (argc != 1){
for(i = 1; i<argc; i++){
if (strcmp("-N", argv[i])==0)
N = atoi(argv[++i]);
}
if (N>0)
printf(" testing_dsytrd -N %d\n\n", N);
else
{
printf("\nUsage: \n");
printf(" testing_dsytrd -N %d\n\n", 1024);
exit(1);
}
}
else {
printf("\nUsage: \n");
printf(" testing_dsytrd_gpu -N %d\n\n", 1024);
N = size[9];
}
n2 = N * N;
min_mn = N;
nb = magma_get_dsytrd_nb(N);
/* Allocate host memory for the matrix */
TESTING_MALLOC( h_A, double, n2 );
TESTING_HOSTALLOC( h_R, double, n2 );
/* Allocate memory for the tridiagonal factorization result */
/* Diagonal of the tridiagonal matrices */
TESTING_MALLOC(h_DA, double, N);
TESTING_MALLOC(h_DR, double, N);
/* Off-diagonal */
TESTING_MALLOC(h_EA, double, N-1);
TESTING_MALLOC(h_ER, double, N-1);
/* scalar factors */
TESTING_MALLOC(h_TAUA, double, N-1);
TESTING_MALLOC(h_TAUR, double, N-1);
/* workspaces */
LWORKA=N*nb;
TESTING_MALLOC(h_WORKA, double, LWORKA);
LWORKR=N*nb;
TESTING_MALLOC(h_WORKR, double, LWORKR);
printf("\n\n");
printf(" M N CPU GFlop/s GPU GFlop/s ||PA-LU||/(||A||*N)\n");
printf("============================================================\n");
for(i=0; i<10; i++){
if (argc == 1){
N = size[i];
}
min_mn= N;
lda = N;
n2 = lda*N;
flops = FLOPS( (double)N ) / 1000000;
/* Allocate memory on the GPU */
status = cublasAlloc(N*lda+2*N*nb, sizeof(double), (void**)&d_work);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (magma_dsytrd)\n");
return 0;
}
/* Initialize the matrices */
dlarnv_( &ione, ISEED, &n2, h_A );
dlacpy_( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
/* =====================================================================
Performs operation using LAPACK
=================================================================== */
start = get_current_time();
dsytrd_(&uplo, &N, h_A, &lda, h_DA, h_EA, h_TAUA, h_WORKA, &LWORKA, &info);
end = get_current_time();
if (info < 0)
printf("Argument %d of dsytrd had an illegal value.\n", -info);
etime_cpu = GetTimerValue(start, end);
cpu_perf = flops / etime_cpu;
/* ====================================================================
Performs operation using MAGMA
=================================================================== */
start = get_current_time();
magma_dsytrd(uplo, N, h_R, lda, h_DR, h_ER, h_TAUR, h_WORKR, &LWORKR, d_work, &info);
end = get_current_time();
if (info < 0)
printf("Argument %d of dsytrd had an illegal value.\n", -info);
etime_gpu = GetTimerValue(start, end);
gpu_perf = flops / etime_gpu;
cublasFree(d_work);
/* =====================================================================
Check the factorization (TODO)
=================================================================== */
error = 0.0;
printf("%5d %5d %6.2f %6.2f %6.2f %6.2f %e\n",
N, N, cpu_perf, etime_cpu, gpu_perf, etime_gpu, error);
if (argc != 1)
break;
}
/* Memory clean up */
TESTING_FREE( h_A );
TESTING_HOSTFREE( h_R );
TESTING_FREE( h_DA );
TESTING_FREE( h_DR );
TESTING_FREE( h_EA );
TESTING_FREE( h_ER );
TESTING_FREE( h_TAUA );
TESTING_FREE( h_TAUR );
TESTING_FREE( h_WORKA );
TESTING_FREE( h_WORKR );
/* Shutdown */
TESTING_CUDA_FINALIZE();
}
The measured performance of the above code (GFlops and Elapsed Time in ms) on a Tesla M2050 and a Xeon X5650@2.67GHz follows
(the multithreaded MKL 10.2 library has been used throughout)
- Code: Select all
MKL_NUM_THREADS=1
M N CPU GFlop/s CPU etime GPU GFlop/s GPU etime
==============================================================
1024 1024 2.55 563.33 7.27 197.51
2048 2048 6.89 1664.27 12.80 895.77
3072 3072 5.91 6546.29 15.97 2423.07
4032 4032 5.77 15163.73 17.20 5084.59
5184 5184 6.16 30185.97 19.04 9761.71
6016 6016 6.32 45922.65 19.64 14787.11
MKL_NUM_THREADS=2
M N CPU GFlop/s CPU etime GPU GFlop/s GPU etime
============================================================
1024 1024 7.18 199.84 10.58 135.64
2048 2048 14.06 815.36 13.51 848.70
3072 3072 13.35 2897.68 16.22 2384.71
4032 4032 12.32 7100.76 17.05 5130.04
5184 5184 12.17 15275.06 18.87 9847.06
6016 6016 12.29 23641.25 19.61 14808.18
8064 8064 11.04 63346.21 19.69 35525.21
MKL_NUM_THREADS=4
M N CPU GFlop/s CPU etime GPU GFlop/s GPU etime
============================================================
1024 1024 12.23 117.38 13.30 107.96
2048 2048 25.61 447.73 13.08 876.94
3072 3072 23.21 1666.62 16.54 2339.50
4032 4032 19.07 4586.34 17.15 5100.60
5184 5184 16.84 11038.70 19.11 9725.92
6016 6016 16.11 18023.51 19.70 14741.81
7040 7040 15.52 29980.80 20.14 23107.40
MKL_NUM_THREADS=6
M N CPU GFlop/s CPU etime GPU GFlop/s GPU etime
============================================================
1024 1024 13.50 106.30 18.53 77.47
2048 2048 30.59 374.86 13.54 847.26
3072 3072 28.96 1335.93 16.27 2377.54
4032 4032 22.31 3920.85 16.98 5148.98
5184 5184 19.73 9418.17 18.76 9904.61
6016 6016 19.07 15231.36 19.58 14831.17
7040 7040 18.26 25488.31 20.08 23171.77
As you can see, I was not able to reproduce the speed-up reported in the paper "Accelerating the reduction to upper Hessenberg,
tridiagonal, and bidiagonal forms through hybrid GPU-based computing" (in this paper an asymptotic 80 GFlops for the GPU/Magma is shown in fig. 6 with the GPU constantly outperforming the CPU/MKL) .
Clearly I am doing something wrong. Any suggestions?
edit: Actually, the 80 GFlops figure refers to the single-precision tridiagonalization routine, so it is possible that the double-precision version has much lower performance. Still, it seems rather odd to me that the multi-threaded CPU dsytrd routine can match (and sometimes outperform) the GPU performance of magma_dsytrd.