19 #include <cuda_runtime_api.h>
27 #include "magma_lapack.h"
// FLOPS(m, n): operation-count formula built from FMULS_GEQRF and FADDS_GEQRF.
// Later in this file it is evaluated as FLOPS((double)M, (double)N) / 1000000
// and stored in `flops`.
// NOTE(review): FMULS_GEQRF / FADDS_GEQRF are defined elsewhere; presumably they
// are the multiplication and addition counts of an m-by-n QR factorization --
// confirm against their definitions.
32 #if defined(PRECISION_z) || defined(PRECISION_c)
// Variant selected for the complex precisions (z, c): the multiplication count
// is weighted by 6 and the addition count by 2 (presumably the real-arithmetic
// cost of one complex multiply / one complex add -- TODO confirm).
33 #define FLOPS(m, n) ( 6.*FMULS_GEQRF(m, n) + 2.*FADDS_GEQRF(m, n) )
// Unweighted variant: plain sum of the two counts (presumably the alternative
// branch used for the remaining precisions -- verify the conditional structure).
35 #define FLOPS(m, n) ( FMULS_GEQRF(m, n) + FADDS_GEQRF(m, n) )
85 volatile cuDoubleComplex **p;
118 long int t = (
long int) tp->
tid;
123 cuDoubleComplex *WORK;
127 while (mp->
sync0 == 0) {
131 for (i = 0; i < mp->
np_gpu; i++)
134 while (mp->
p[i] == NULL) {
142 if (i == (mp->
np_gpu - 1)) {
147 WORK = (cuDoubleComplex*)malloc(
sizeof(cuDoubleComplex)*M*N);
150 &M,&N,&K,mp->
a+i*mp->
nb*mp->
lda+i*mp->
nb,&(mp->
lda),mp->
t+i*mp->
nb*mp->
nb,&K,
158 while (mp->
sync2 == 0) {
174 if (qr_params->
nb == -1)
177 if (qr_params->
ob == -1)
178 qr_params->
ob = qr_params->
nb;
180 if (qr_params->
fb == -1)
181 qr_params->
fb = qr_params->
nb;
183 if (qr_params->
ob * qr_params->
nthreads >= n){
184 fprintf(stderr,
"\n\nNumber of threads times block size not less than width of matrix.\n\n");
190 if ( (n-(qr_params->
nthreads * qr_params->
ob)) % qr_params->
nb != 0)
197 qr_params->
t = (cuDoubleComplex*)malloc(
sizeof(cuDoubleComplex)*
201 if ((qr_params->
n-(qr_params->
nthreads*qr_params->
ob)) > qr_params->
m) {
202 qr_params->
np_gpu = m/qr_params->
nb;
203 if (m%qr_params->
nb != 0)
207 fprintf(stderr,
"qr_params->np_gpu=%d\n",qr_params->
np_gpu);
209 qr_params->
p = (
volatile cuDoubleComplex **) malloc (
sizeof(cuDoubleComplex*)*
212 for (i = 0; i < qr_params->
np_gpu; i++)
213 qr_params->
p[i] = NULL;
215 qr_params->
sync0 = 1;
217 qr_params->
w = (cuDoubleComplex *)malloc(
sizeof(cuDoubleComplex)*
223 qr_params->
sync2 = 0;
242 cuDoubleComplex *h_A, *h_R, *h_work, *tau;
243 double gpu_perf, cpu_perf, flops;
251 magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};
273 for(i = 1; i<argc; i++){
274 if (strcmp(
"-N", argv[i])==0)
276 else if (strcmp(
"-M", argv[i])==0)
278 else if (strcmp(
"-F", argv[i])==0)
279 mp->
fb = atoi(argv[++i]);
280 else if (strcmp(
"-O", argv[i])==0)
281 mp->
ob = atoi(argv[++i]);
282 else if (strcmp(
"-B", argv[i])==0)
283 mp->
nb = atoi(argv[++i]);
284 else if (strcmp(
"-b", argv[i])==0)
285 mp->
ib = atoi(argv[++i]);
286 else if (strcmp(
"-A", argv[i])==0)
287 accuracyflag = atoi(argv[++i]);
288 else if (strcmp(
"-P", argv[i])==0)
289 nthreads = atoi(argv[++i]);
290 else if (strcmp(
"-Q", argv[i])==0)
291 nquarkthreads = atoi(argv[++i]);
292 else if (strcmp(
"-nc", argv[i])==0)
293 nc = atoi(argv[++i]);
294 else if (strcmp(
"-ncps", argv[i])==0)
295 ncps = atoi(argv[++i]);
298 if ((M>0 && N>0) || (M==0 && N==0))
300 printf(
" testing_zgeqrf-v2 -M %d -N %d\n\n", M, N);
308 printf(
"\nUsage: \n");
309 printf(
" Make sure you set the number of BLAS threads to 1, e.g.,\n");
310 printf(
" > setenv MKL_NUM_THREADS 1\n");
311 printf(
" > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024);
317 printf(
"\nUsage: \n");
318 printf(
" Make sure you set the number of BLAS threads to 1, e.g.,\n");
319 printf(
" > setenv MKL_NUM_THREADS 1\n");
320 printf(
" Set number of cores per socket and number of cores.\n");
321 printf(
" > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024);
322 printf(
" Alternatively, set:\n");
323 printf(
" Q: Number of threads for panel factorization.\n");
324 printf(
" P: Number of threads for trailing matrix update (CPU).\n");
325 printf(
" B: Block size.\n");
326 printf(
" b: Inner block size.\n");
327 printf(
" O: Block size for trailing matrix update (CPU).\n");
328 printf(
" > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112);
333 if ((nc > 0) && (ncps > 0)) {
335 #if (defined(PRECISION_d))
338 #if (defined(PRECISION_c))
341 #if (defined(PRECISION_z))
345 auto_tune(
'q', precision, nc, ncps, M, N,
346 &(mp->
nb), &(mp->
ob), &(mp->
ib), &nthreads, &nquarkthreads);
348 fprintf(stderr,
"%d %d %d %d %d\n",mp->
nb,mp->
ob,mp->
ib,nquarkthreads,nthreads);
357 context->
params = (
void *)(mp);
361 for (i = 0; i < nthreads; i++)
376 printf(
" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n");
377 printf(
"==========================================================\n");
380 M = N = min_mn = size[i];
384 flops =
FLOPS( (
double)M, (
double)N ) / 1000000;
401 magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info);
410 if (accuracyflag == 1)
414 printf(
"Argument %d of zgeqrf had an illegal value.\n", -info);
416 cpu_perf = 4.*M*N*min_mn/(3.*1000000*
GetTimerValue(start,end));
421 double work[1], matnorm = 1.;
425 if (accuracyflag == 1){
430 if (accuracyflag == 1){
431 printf(
"%5d %5d %6.2f %6.2f %e\n",
432 M, N, cpu_perf, gpu_perf,
435 printf(
"%5d %5d %6.2f \n",