MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
testing_zgeqrf-v2.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.2.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  May 2012
7 
8  @precisions normal z -> s d c
9 
10 */
11 
12 /* Includes, system */
13 #include <stdlib.h>
14 #include <stdio.h>
15 #include <string.h>
16 #include <math.h>
17 
18 #include <cuda.h>
19 #include <cuda_runtime_api.h>
20 #include <cublas.h>
21 
22 #include <quark.h>
23 
24 /* Includes, project */
25 #include "flops.h"
26 #include "magma.h"
27 #include "magma_lapack.h"
28 #include "testings.h"
29 
30 // Flops formula
31 #define PRECISION_z
32 #if defined(PRECISION_z) || defined(PRECISION_c)
33 #define FLOPS(m, n) ( 6.*FMULS_GEQRF(m, n) + 2.*FADDS_GEQRF(m, n) )
34 #else
35 #define FLOPS(m, n) ( FMULS_GEQRF(m, n) + FADDS_GEQRF(m, n) )
36 #endif
37 
38 #include <pthread.h>
39 
40 #define PRECISION_z
41 
42 
43 /* ------------------------------------------------------------
44  * MAGMA QR params
45  * --------------------------------------------------------- */
46 typedef struct {
47 
48  /* Whether or not to restore upper part of matrix */
50 
51  /* Number of MAGMA threads */
53 
54  /* Block size for left side of matrix */
56 
57  /* Block size for right side of matrix */
59 
60  /* Block size for final factorization */
62 
63  /* Block size for multi-core factorization */
65 
66  /* Number of panels for left side of matrix */
68 
69  /* Number of rows */
71 
72  /* Number of columns */
74 
75  /* Leading dimension */
77 
78  /* Matrix to be factorized */
79  cuDoubleComplex *a;
80 
81  /* Storage for every T */
82  cuDoubleComplex *t;
83 
84  /* Flags to wake up MAGMA threads */
85  volatile cuDoubleComplex **p;
86 
87  /* Synchronization flag */
88  volatile magma_int_t sync0;
89 
90  /* One synchronization flag for each MAGMA thread */
91  volatile magma_int_t *sync1;
92 
93  /* Synchronization flag */
94  volatile magma_int_t sync2;
95 
96  /* Work space */
97  cuDoubleComplex *w;
98 
100 
101 //magma_qr_params MG;
102 
103 typedef struct {
105  void *params;
106 } t_params;
107 
108 
109 /* Update thread */
110 extern "C" void *cpu_thread(void *a)
111 {
112  magma_int_t i;
113  t_params *tp = (t_params*)a;
114 
116 
117  //long magma_int_t t = (long int) a;
118  long int t = (long int) tp->tid;
119 
120  magma_int_t M;
121  magma_int_t N;
122  magma_int_t K;
123  cuDoubleComplex *WORK;
124 
125 loop:
126 
127  while (mp->sync0 == 0) {
128  sched_yield();
129  }
130 
131  for (i = 0; i < mp->np_gpu; i++)
132  {
133 
134  while (mp->p[i] == NULL) {
135  sched_yield();
136  }
137 
138  M=mp->m-i*mp->nb;
139  N=mp->ob;
140  K=mp->nb;
141  if (mp->m >= (mp->n-(mp->nthreads*mp->ob))) {
142  if (i == (mp->np_gpu - 1)) {
143  K = mp->n-mp->nthreads*mp->ob-(mp->np_gpu-1)*mp->nb;
144  }
145  }
146 
147  WORK = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex)*M*N);
148 
150  &M,&N,&K,mp->a+i*mp->nb*mp->lda+i*mp->nb,&(mp->lda),mp->t+i*mp->nb*mp->nb,&K,
151  mp->a+mp->m*mp->n-(mp->nthreads-t)*mp->ob*mp->lda+i*mp->nb,&(mp->lda),WORK,&N);
152 
153  free(WORK);
154  }
155 
156  mp->sync1[t] = 1;
157 
158  while (mp->sync2 == 0) {
159  sched_yield();
160  }
161 
162 goto loop;
163 
164  return (void*)NULL;
165 }
166 
168  magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t nthreads)
169 {
170  magma_int_t i;
171 
172  qr_params->nthreads = nthreads;
173 
174  if (qr_params->nb == -1)
175  qr_params->nb = 128;
176 
177  if (qr_params->ob == -1)
178  qr_params->ob = qr_params->nb;
179 
180  if (qr_params->fb == -1)
181  qr_params->fb = qr_params->nb;
182 
183  if (qr_params->ob * qr_params->nthreads >= n){
184  fprintf(stderr,"\n\nNumber of threads times block size not less than width of matrix.\n\n");
185  exit(1);
186  }
187 
188  qr_params->np_gpu = (n-(qr_params->nthreads * qr_params->ob)) / qr_params->nb;
189 
190  if ( (n-(qr_params->nthreads * qr_params->ob)) % qr_params->nb != 0)
191  qr_params->np_gpu++;
192 
193  qr_params->m = m;
194  qr_params->n = n;
195  qr_params->lda = m;
196  qr_params->a = a;
197  qr_params->t = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex)*
198  qr_params->n*
199  qr_params->nb);
200 
201  if ((qr_params->n-(qr_params->nthreads*qr_params->ob)) > qr_params->m) {
202  qr_params->np_gpu = m/qr_params->nb;
203  if (m%qr_params->nb != 0)
204  qr_params->np_gpu++;
205  }
206 
207  fprintf(stderr,"qr_params->np_gpu=%d\n",qr_params->np_gpu);
208 
209  qr_params->p = (volatile cuDoubleComplex **) malloc (sizeof(cuDoubleComplex*)*
210  qr_params->np_gpu);
211 
212  for (i = 0; i < qr_params->np_gpu; i++)
213  qr_params->p[i] = NULL;
214 
215  qr_params->sync0 = 1;
216 
217  qr_params->w = (cuDoubleComplex *)malloc(sizeof(cuDoubleComplex)*
218  qr_params->np_gpu*qr_params->nb*
219  qr_params->nb);
220 
221  qr_params->flag = 0;
222 
223  qr_params->sync2 = 0;
224 
225 }
226 
227 int TRACE;
228 
229 
230 /* ////////////////////////////////////////////////////////////////////////////
231  -- Testing zgeqrf
232 */
233 int main( magma_int_t argc, char** argv)
234 {
235  magma_int_t nquarkthreads=2;
236  magma_int_t nthreads=2;
237  magma_int_t num_gpus = 1;
238  TRACE = 0;
239 
240  //magma_qr_params mp;
241 
242  cuDoubleComplex *h_A, *h_R, *h_work, *tau;
243  double gpu_perf, cpu_perf, flops;
244 
245  magma_timestr_t start, end;
246 
247  magma_qr_params *mp = (magma_qr_params*)malloc(sizeof(magma_qr_params));
248 
249  /* Matrix size */
250  magma_int_t M=0, N=0, n2;
251  magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};
252 
253  cublasStatus status;
254  magma_int_t i, j, info;
255  magma_int_t ione = 1;
256  magma_int_t ISEED[4] = {0,0,0,1};
257 
258  mp->nb=-1;
259  mp->ob=-1;
260  mp->fb=-1;
261  mp->ib=32;
262 
263  magma_int_t loop = argc;
264  magma_int_t accuracyflag = 1;
265 
266  char precision;
267 
268  magma_int_t nc = -1;
269  magma_int_t ncps = -1;
270 
271  if (argc != 1)
272  {
273  for(i = 1; i<argc; i++){
274  if (strcmp("-N", argv[i])==0)
275  N = atoi(argv[++i]);
276  else if (strcmp("-M", argv[i])==0)
277  M = atoi(argv[++i]);
278  else if (strcmp("-F", argv[i])==0)
279  mp->fb = atoi(argv[++i]);
280  else if (strcmp("-O", argv[i])==0)
281  mp->ob = atoi(argv[++i]);
282  else if (strcmp("-B", argv[i])==0)
283  mp->nb = atoi(argv[++i]);
284  else if (strcmp("-b", argv[i])==0)
285  mp->ib = atoi(argv[++i]);
286  else if (strcmp("-A", argv[i])==0)
287  accuracyflag = atoi(argv[++i]);
288  else if (strcmp("-P", argv[i])==0)
289  nthreads = atoi(argv[++i]);
290  else if (strcmp("-Q", argv[i])==0)
291  nquarkthreads = atoi(argv[++i]);
292  else if (strcmp("-nc", argv[i])==0)
293  nc = atoi(argv[++i]);
294  else if (strcmp("-ncps", argv[i])==0)
295  ncps = atoi(argv[++i]);
296  }
297 
298  if ((M>0 && N>0) || (M==0 && N==0))
299  {
300  printf(" testing_zgeqrf-v2 -M %d -N %d\n\n", M, N);
301  if (M==0 && N==0) {
302  M = N = size[9];
303  loop = 1;
304  }
305  }
306  else
307  {
308  printf("\nUsage: \n");
309  printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n");
310  printf(" > setenv MKL_NUM_THREADS 1\n");
311  printf(" > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024);
312  exit(1);
313  }
314  }
315  else
316  {
317  printf("\nUsage: \n");
318  printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n");
319  printf(" > setenv MKL_NUM_THREADS 1\n");
320  printf(" Set number of cores per socket and number of cores.\n");
321  printf(" > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024);
322  printf(" Alternatively, set:\n");
323  printf(" Q: Number of threads for panel factorization.\n");
324  printf(" P: Number of threads for trailing matrix update (CPU).\n");
325  printf(" B: Block size.\n");
326  printf(" b: Inner block size.\n");
327  printf(" O: Block size for trailing matrix update (CPU).\n");
328  printf(" > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112);
329  M = N = size[9];
330  }
331 
332  /* Auto tune based on number of cores and number of cores per socket if provided */
333  if ((nc > 0) && (ncps > 0)) {
334  precision = 's';
335  #if (defined(PRECISION_d))
336  precision = 'd';
337  #endif
338  #if (defined(PRECISION_c))
339  precision = 'c';
340  #endif
341  #if (defined(PRECISION_z))
342  precision = 'z';
343  #endif
344 
345  auto_tune('q', precision, nc, ncps, M, N,
346  &(mp->nb), &(mp->ob), &(mp->ib), &nthreads, &nquarkthreads);
347 
348 fprintf(stderr,"%d %d %d %d %d\n",mp->nb,mp->ob,mp->ib,nquarkthreads,nthreads);
349 
350  }
351 
352  /* Initialize MAGMA hardware context, seeting how many CPU cores
353  and how many GPUs to be used in the consequent computations */
354  mp->sync0 = 0;
356  context = magma_init((void*)(mp),cpu_thread, nthreads, nquarkthreads, num_gpus, argc, argv);
357  context->params = (void *)(mp);
358 
359  mp->sync1 = (volatile magma_int_t *) malloc (sizeof(int)*nthreads);
360 
361  for (i = 0; i < nthreads; i++)
362  mp->sync1[i] = 0;
363 
364  n2 = M * N;
365  magma_int_t min_mn = min(M, N);
366  magma_int_t nb = magma_get_zgeqrf_nb(min_mn);
367  magma_int_t lwork = N*nb;
368 
369  /* Allocate host memory for the matrix */
370  TESTING_MALLOC ( h_A , cuDoubleComplex, n2 );
371  TESTING_MALLOC ( tau , cuDoubleComplex, min_mn);
372  TESTING_HOSTALLOC( h_R , cuDoubleComplex, n2 );
373  TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork );
374 
375  printf("\n\n");
376  printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n");
377  printf("==========================================================\n");
378  for(i=0; i<10; i++){
379  if (loop==1){
380  M = N = min_mn = size[i];
381  n2 = M*N;
382  }
383 
384  flops = FLOPS( (double)M, (double)N ) / 1000000;
385 
386  /* Initialize the matrix */
387  lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
388  lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &M, h_R, &M );
389 
390  //magma_zgeqrf(M, N, h_R, M, tau, h_work, lwork, &info);
391 
392  for(j=0; j<n2; j++)
393  h_R[j] = h_A[j];
394 
395  /* ====================================================================
396  Performs operation using MAGMA
397  =================================================================== */
398  magma_qr_init(mp, M, N, h_R, nthreads);
399 
400  start = get_current_time();
401  magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info);
402  end = get_current_time();
403 
404  gpu_perf = flops / GetTimerValue(start, end);
405 
406  /* =====================================================================
407  Performs operation using LAPACK
408  =================================================================== */
409  start = get_current_time();
410  if (accuracyflag == 1)
411  lapackf77_zgeqrf(&M, &N, h_A, &M, tau, h_work, &lwork, &info);
412  end = get_current_time();
413  if (info < 0)
414  printf("Argument %d of zgeqrf had an illegal value.\n", -info);
415 
416  cpu_perf = 4.*M*N*min_mn/(3.*1000000*GetTimerValue(start,end));
417 
418  /* =====================================================================
419  Check the result compared to LAPACK
420  =================================================================== */
421  double work[1], matnorm = 1.;
422  cuDoubleComplex mone = MAGMA_Z_NEG_ONE;
423  magma_int_t one = 1;
424 
425  if (accuracyflag == 1){
426  matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work);
427  blasf77_zaxpy(&n2, &mone, h_A, &one, h_R, &one);
428  }
429 
430  if (accuracyflag == 1){
431  printf("%5d %5d %6.2f %6.2f %e\n",
432  M, N, cpu_perf, gpu_perf,
433  lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm);
434  } else {
435  printf("%5d %5d %6.2f \n",
436  M, N, gpu_perf);
437  }
438 
439  if (loop != 1)
440  break;
441  }
442 
443  /* Memory clean up */
444  TESTING_FREE ( h_A );
445  TESTING_FREE ( tau );
446  TESTING_HOSTFREE(h_work);
447  TESTING_HOSTFREE( h_R );
448 
449  /* Shut down the MAGMA context */
450  magma_finalize(context);
451 }