MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zhetrd_bhe2trc_v5.cpp File Reference
#include "common_magma.h"
#include "magma_bulge.h"
#include "magma_zbulge.h"
#include <cblas.h>
Include dependency graph for zhetrd_bhe2trc_v5.cpp:

Go to the source code of this file.

Data Structures

class  bulge_data
 
class  bulge_id_data
 
class  applyQ_data
 
class  applyQ_id_data
 

Macros

#define PRECISION_z
 
#define V(m)   &(V[(m)])
 
#define TAU(m)   &(TAU[(m)])
 
#define T(m)   &(T[(m)])
 
#define E(m, n)   &(E[(m) + lde*(n)])
 
#define V(m)   &(V[(m)])
 
#define TAU(m)   &(TAU[(m)])
 
#define T(m)   &(T[(m)])
 

Functions

void magma_zstedc_withZ (char JOBZ, magma_int_t n, double *D, double *E, magmaDoubleComplex *Z, magma_int_t LDZ)
 
void magma_zstedx_withZ (magma_int_t n, magma_int_t ne, double *D, double *E, magmaDoubleComplex *Z, magma_int_t LDZ)
 
static void * parallel_section (void *arg)
 
static void tile_bulge_parallel (magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t n, magma_int_t nb, magma_int_t nb_tiles, magma_int_t band, magma_int_t grsiz, magma_int_t Vblksiz, volatile magma_int_t *prog)
 
static void tile_bulge_computeT_parallel (magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
 
static void * applyQ_parallel_section (void *arg)
 
static void tile_bulge_applyQ (magma_int_t wantz, char side, magma_int_t n_cpu, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t lde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt)
 
magma_int_t magma_zhetrd_bhe2trc_v5 (magma_int_t threads, magma_int_t wantz, char uplo, magma_int_t ne, magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, double *D, double *E, magmaDoubleComplex *dT1, magma_int_t ldt1)
 

Macro Definition Documentation

#define E (   m,
 
)    &(E[(m) + lde*(n)])

Definition at line 1281 of file zhetrd_bhe2trc_v5.cpp.

#define PRECISION_z

Definition at line 21 of file zhetrd_bhe2trc_v5.cpp.

#define T (   m)    &(T[(m)])

Definition at line 1284 of file zhetrd_bhe2trc_v5.cpp.

#define T (   m)    &(T[(m)])

Definition at line 1284 of file zhetrd_bhe2trc_v5.cpp.

#define TAU (   m)    &(TAU[(m)])

Definition at line 1283 of file zhetrd_bhe2trc_v5.cpp.

#define TAU (   m)    &(TAU[(m)])

Definition at line 1283 of file zhetrd_bhe2trc_v5.cpp.

#define V (   m)    &(V[(m)])

Definition at line 1282 of file zhetrd_bhe2trc_v5.cpp.

#define V (   m)    &(V[(m)])

Definition at line 1282 of file zhetrd_bhe2trc_v5.cpp.

Function Documentation

static void * applyQ_parallel_section ( void *  arg)
static

Definition at line 1139 of file zhetrd_bhe2trc_v5.cpp.

References barrier(), dE, E, magma_ceildiv(), magma_device_sync(), magma_setlapack_numthreads(), magma_wtime(), magma_zbulge_applyQ_v2(), magma_zsetmatrix, min, sys_corenbr, T, TAU, tile_bulge_applyQ(), and V.

1140 {
1141 
1142  magma_int_t my_core_id = ((applyQ_id_data*)arg) -> id;
1143  applyQ_data* data = ((applyQ_id_data*)arg) -> data;
1144 
1145  magma_int_t allcores_num = data -> threads_num;
1146  magma_int_t n = data -> n;
1147  magma_int_t ne = data -> ne;
1148  magma_int_t n_gpu = data -> n_gpu;
1149  magma_int_t nb = data -> nb;
1150  magma_int_t Vblksiz = data -> Vblksiz;
1151  magma_int_t wantz = data -> wantz;
1152  magmaDoubleComplex *E = data -> E;
1153  magma_int_t lde = data -> lde;
1154  magmaDoubleComplex *V = data -> V;
1155  magma_int_t ldv = data -> ldv;
1156  magmaDoubleComplex *TAU = data -> TAU;
1157  magmaDoubleComplex *T = data -> T;
1158  magma_int_t ldt = data -> ldt;
1159  magmaDoubleComplex *dE = data -> dE;
1160  magma_int_t ldde = data -> ldde;
1161  pthread_barrier_t* barrier = &(data -> barrier);
1162 
1163  magma_int_t info;
1164 
1165  real_Double_t timeQcpu=0.0, timeQgpu=0.0;
1166 
1167  magma_int_t n_cpu = ne - n_gpu;
1168 
1169  if(wantz<=0)
1170  return 0;
1171  // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
1172  // it need that all threads setting it to 1.
1174 
1175 #if defined(MAGMA_SETAFFINITY)
1176  cpu_set_t set;
1177  CPU_ZERO( &set );
1178  CPU_SET( my_core_id, &set );
1179  sched_setaffinity( 0, sizeof(set), &set) ;
1180 #endif
1181 
1182 
1183  /*################################################
1184  * WANTZ == 2
1185  *################################################*/
1186  if((wantz==2))
1187  {
1188 
1189  if(my_core_id == 0)
1190  {
1191  //=============================================
1192  // on GPU on thread 0:
1193  // - apply V2*Z(:,1:N_GPU)
1194  //=============================================
1195  timeQgpu = magma_wtime();
1196  magma_zbulge_applyQ_v2('R', n, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
1198  timeQgpu = magma_wtime()-timeQgpu;
1199  printf(" Finish Q2_GPU GGG timing= %lf \n" ,timeQgpu);
1200  /* I am one of the remaining cores*/
1201  }else{
1202  //=============================================
1203  // on CPU on threads 1:allcores_num-1:
1204  // - apply V2*Z(:,N_GPU+1:N)
1205  //=============================================
1206  if(my_core_id == 1)timeQcpu = magma_wtime();
1207 
1208  magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
1209  magmaDoubleComplex* E_loc = E + n_gpu+ n_loc * (my_core_id-1);
1210  n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));
1211 
1212  tile_bulge_applyQ(wantz, 'R', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
1213  pthread_barrier_wait(barrier);
1214  if(my_core_id == 1){
1215  timeQcpu = magma_wtime()-timeQcpu;
1216  printf(" Finish Q2_CPU CCC timing= %lf \n" ,timeQcpu);
1217  }
1218 
1219  } // END if my_core_id
1220 
1221  }// END of WANTZ==2
1222 
1223 
1224  /*################################################
1225  * WANTZ == 3 || WANTZ == 4
1226  *################################################*/
1227  if((wantz==3) || (wantz==4))
1228  {
1229  /* I am the last core in the new indexing and the original core=0 */
1230  if(my_core_id==0)
1231  {
1232  //=============================================
1233  // on GPU on thread 0:
1234  // - apply V2*Z(:,1:N_GPU)
1235  //=============================================
1236  timeQgpu = magma_wtime();
1237  magma_zsetmatrix(n, n_gpu, E, lde, dE, ldde);
1238  magma_zbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
1240  timeQgpu = magma_wtime()-timeQgpu;
1241  printf(" Finish Q2_GPU GGG timing= %lf \n" ,timeQgpu);
1242  /* I am one of the remaining cores*/
1243  }else{
1244  //=============================================
1245  // on CPU on threads 1:allcores_num-1:
1246  // - apply V2*Z(:,N_GPU+1:N)
1247  //=============================================
1248  if(my_core_id == 1)
1249  timeQcpu = magma_wtime();
1250 
1251  magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
1252  magmaDoubleComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
1253  n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));
1254 
1255  tile_bulge_applyQ(wantz, 'L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
1256  pthread_barrier_wait(barrier);
1257  if(my_core_id == 1){
1258  timeQcpu = magma_wtime()-timeQcpu;
1259  printf(" Finish Q2_CPU CCC timing= %lf \n" ,timeQcpu);
1260  }
1261 
1262  } // END if my_core_id
1263 
1264  }// END of WANTZ==3 / 4
1265 
1266 #if defined(MAGMA_SETAFFINITY)
1267  // unbind threads
1269  sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN);
1270  CPU_ZERO( &set );
1271  for(magma_int_t i=0; i<sys_corenbr; i++)
1272  CPU_SET( i, &set );
1273  sched_setaffinity( 0, sizeof(set), &set) ;
1274 #endif
1275 
1276  return 0;
1277 }
#define min(a, b)
Definition: common_magma.h:86
void magma_device_sync()
magma_int_t ldv
#define T(m)
Definition: zgeqrf_mc.cpp:14
#define E(m, n)
int magma_int_t
Definition: magmablas.h:12
static void tile_bulge_applyQ(magma_int_t wantz, char side, magma_int_t n_cpu, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t lde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt)
static void barrier(int my_core_id, int cores_num)
#define dE(m, n)
void magma_setlapack_numthreads(magma_int_t num_threads)
#define TAU(m)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
magma_int_t magma_zbulge_applyQ_v2(char side, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex *dE, magma_int_t ldde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t *info)
double magma_wtime(void)
Definition: timer.cpp:110
double real_Double_t
Definition: magma_types.h:27
#define magma_zsetmatrix(m, n, hA_src, lda, dB_dst, lddb)
Definition: magmablas_z.h:702
static volatile int sys_corenbr
Definition: quarkos.c:68
#define V(m)

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_zhetrd_bhe2trc_v5 ( magma_int_t  threads,
magma_int_t  wantz,
char  uplo,
magma_int_t  ne,
magma_int_t  n,
magma_int_t  nb,
magmaDoubleComplex *  A,
magma_int_t  lda,
double *  D,
double *  E,
magmaDoubleComplex *  dT1,
magma_int_t  ldt1 
)

Definition at line 215 of file zhetrd_bhe2trc_v5.cpp.

References applyQ_parallel_section(), cblas_zcopy(), lapack_testing::f, magma_bulge_get_blkcnt(), magma_ceildiv(), magma_device_sync(), MAGMA_ERR_DEVICE_ALLOC, magma_free, magma_free_cpu(), magma_malloc_cpu(), magma_setlapack_numthreads(), MAGMA_SUCCESS, magma_wtime(), MAGMA_Z_ONE, MAGMA_Z_REAL, MAGMA_Z_ZERO, magma_zbulge_applyQ_v2(), magma_zgemm(), magma_zgetmatrix, magma_zmalloc(), magma_zmalloc_cpu(), magma_zsetmatrix, magma_zstedc_withZ(), magma_zstedx_withZ(), magma_zunmqr_gpu_2stages(), MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNoVec, min, parallel_section(), pthread_attr_init(), pthread_attr_setscope(), pthread_create(), pthread_join(), PTHREAD_SCOPE_SYSTEM, pthread_setconcurrency(), T, TAU, V, and Z.

220 {
222  magmaDoubleComplex *da, *Z;
223  if((wantz>0)/*&&(wantz!=4)*/){
224  if (MAGMA_SUCCESS != magma_zmalloc( &da, n*lda )) {
225  return MAGMA_ERR_DEVICE_ALLOC;
226  }
227 
228  }
229  magmaDoubleComplex *dQ1 = da;
230  magma_int_t lddq1=lda;
232  char uplo_[2] = {uplo, 0};
233  double timelpk=0.0, timeaplQ2=0.0;
234  double timeblg=0.0, timeeigen=0.0, timegemm =0.0;
235 
236  magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
237  magmaDoubleComplex c_one = MAGMA_Z_ONE;
238 
239  magma_int_t mklth = threads;
240  magma_int_t computeQ1 = 0;
241 
242  magma_int_t band=6, INgrsiz=1;
243 
244 #if (defined(PRECISION_s) || defined(PRECISION_d))
245  magma_int_t Vblksiz = min(nb,48);
246 #else
247  magma_int_t Vblksiz = min(nb,32);
248 #endif
249 
250  magma_int_t ldt = Vblksiz;
251  magma_int_t ldv = nb + Vblksiz - 1;
252  magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
253 
254  magma_int_t nbtiles = magma_ceildiv(n, nb);
255 
256  /*************************************************
257  * INITIALIZING MATRICES
258  * ***********************************************/
259  /* A1 is equal to A2 */
260  magma_int_t lda2 = nb+1+(nb-1);
261  if(lda2>n){
262  printf("error matrix too small N=%d NB=%d\n",n,nb);
263  return -14;
264  }
265  if((n<=0)||(nb<=0)){
266  printf("error N or NB <0 N=%d NB=%d\n",n,nb);
267  return -14;
268  }
269 
270  timelpk = magma_wtime();
271  /* copy the input matrix into a matrix A2 with band storage */
272  magmaDoubleComplex *A2;
273  magma_zmalloc_cpu(&A2, n*lda2);
274 
275  memset(A2 , 0, n*lda2*sizeof(magmaDoubleComplex));
276 
277  for (magma_int_t j = 0; j < n-nb; j++)
278  {
279  cblas_zcopy(nb+1, &A[j*(lda+1)], 1, &A2[j*lda2], 1);
280  memset(&A[j*(lda+1)], 0, (nb+1)*sizeof(magmaDoubleComplex));
281  A[nb + j*(lda+1)] = c_one;
282  }
283  for (magma_int_t j = 0; j < nb; j++)
284  {
285  cblas_zcopy(nb-j, &A[(j+n-nb)*(lda+1)], 1, &A2[(j+n-nb)*lda2], 1);
286  memset(&A[(j+n-nb)*(lda+1)], 0, (nb-j)*sizeof(magmaDoubleComplex));
287  }
288 
289  if(wantz>0)
290  magma_zsetmatrix( n, n, A, lda, dQ1, lddq1 );
291 
292  timelpk = magma_wtime() - timelpk;
293  printf(" Finish CONVERT timing= %lf \n" ,timelpk);
294 
295  magmaDoubleComplex *T;
296  magmaDoubleComplex *TAU;
297  magmaDoubleComplex *V;
298 
299  magma_zmalloc_cpu(&T, blkcnt*ldt*Vblksiz);
300  magma_zmalloc_cpu(&TAU, blkcnt*Vblksiz );
301  magma_zmalloc_cpu(&V, blkcnt*ldv*Vblksiz);
302 
303  memset(T, 0, blkcnt*ldt*Vblksiz*sizeof(magmaDoubleComplex));
304  memset(TAU, 0, blkcnt*Vblksiz*sizeof(magmaDoubleComplex));
305  memset(V, 0, blkcnt*ldv*Vblksiz*sizeof(magmaDoubleComplex));
306 
307  magma_int_t* prog;
308  magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t));
309  memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t));
310 
311  bulge_id_data* arg;
312  magma_malloc_cpu((void**) &arg, threads*sizeof(bulge_id_data));
313 
314  pthread_t* thread_id;
315  magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
316 
317  pthread_attr_t thread_attr;
318 
320 
321  if (wantz==2 || wantz==3)
322  computeQ1 = 1;
323 
324  bulge_data data_bulge(threads, n, nb, nbtiles, band, INgrsiz, Vblksiz, wantz,
325  A2, lda2, V, ldv, TAU, T, ldt, computeQ1, dQ1, lddq1, dT1, prog);
326 
327  // Set one thread per core
328  pthread_attr_init(&thread_attr);
330  pthread_setconcurrency(threads);
331 
332  //timing
333  timeblg = magma_wtime();
334 
335  // Launch threads
336  for (magma_int_t thread = 1; thread < threads; thread++)
337  {
338  arg[thread] = bulge_id_data(thread, &data_bulge);
339  pthread_create(&thread_id[thread], &thread_attr, parallel_section, &arg[thread]);
340  }
341  arg[0] = bulge_id_data(0, &data_bulge);
342  parallel_section(&arg[0]);
343 
344  // Wait for completion
345  for (magma_int_t thread = 1; thread < threads; thread++)
346  {
347  void *exitcodep;
348  pthread_join(thread_id[thread], &exitcodep);
349  }
350 
351  // timing
352  timeblg = magma_wtime()-timeblg;
353 
354 
355  magma_free_cpu(thread_id);
356  magma_free_cpu(arg);
357  magma_free_cpu(prog);
358 
359  printf(" Finish BULGE+T timing= %lf \n" ,timeblg);
360 
361  /*================================================
362  * store resulting diag and lower diag D and E
363  * note that D and E are always real
364  *================================================*/
365 
366  /* Make diagonal and superdiagonal elements real,
367  * storing them in D and E
368  */
369  /* In complex case, the off diagonal element are
370  * not necessary real. we have to make off-diagonal
371  * elements real and copy them to E.
372  * When using HouseHolder elimination,
373  * the ZLARFG give us a real as output so, all the
374  * diagonal/off-diagonal element except the last one are already
375  * real and thus we need only to take the abs of the last
376  * one.
377  * */
378 
379 #if defined(PRECISION_z) || defined(PRECISION_c)
380  if(uplo==MagmaLower){
381  for (magma_int_t i=0; i < n-1 ; i++)
382  {
383  D[i] = MAGMA_Z_REAL(A2[i*lda2 ]);
384  E[i] = MAGMA_Z_REAL(A2[i*lda2+1]);
385  }
386  D[n-1] = MAGMA_Z_REAL(A2[(n-1)*lda2]);
387  } else { /* MagmaUpper not tested yet */
388  for (magma_int_t i=0; i<n-1; i++)
389  {
390  D[i] = MAGMA_Z_REAL(A2[i*lda2+nb]);
391  E[i] = MAGMA_Z_REAL(A2[i*lda2+nb-1]);
392  }
393  D[n-1] = MAGMA_Z_REAL(A2[(n-1)*lda2+nb]);
394  } /* end MagmaUpper */
395 #else
396  if( uplo == MagmaLower ){
397  for (magma_int_t i=0; i < n-1; i++) {
398  D[i] = A2[i*lda2]; // diag
399  E[i] = A2[i*lda2+1]; //lower diag
400  }
401  D[n-1] = A2[(n-1)*lda2];
402  } else {
403  for (magma_int_t i=0; i < n-1; i++) {
404  D[i] = A2[i*lda2+nb]; // diag
405  E[i] = A2[i*lda2+nb-1]; //lower diag
406  }
407  D[n-1] = A2[(n-1)*lda2+nb];
408  }
409 #endif
410 /*
411  if (wantz != 4)
412  magma_free( dT1 ); //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
413 */
414  magma_int_t ldz=n;
415  magmaDoubleComplex *dZ;
416  magma_int_t info;
417 
418  // in case for tridiagonal testing
419  if (wantz < 0){
420  goto fin;
421  }
422 
424 
425  timeeigen = magma_wtime();
426  if(wantz==0){
427  timelpk = magma_wtime();
428 
429  magma_zstedc_withZ(MagmaNoVec, n, D, E, NULL, 1);
430  //magma_zstedc_withZ(MagmaNoVec, n, D, E, Z, ldz);
431 
432  timelpk = magma_wtime()-timelpk;
433  printf(" Finish WANTZ %d eigensolver 'N' timing= %lf threads %d \n" ,wantz, timelpk, mklth);
434 
435  }
436  else {
437 
438  timelpk = magma_wtime();
439 
440  magma_zmalloc_cpu (&Z, n*n);
441 
442  magma_zstedx_withZ(n, ne, D, E, Z, ldz);
443 
445 
446  timelpk = magma_wtime()-timelpk;
447 
448  if(wantz==2){
450  // WANTZ 2
452 
453  double f= 1.;
454  magma_int_t n_gpu = n;
455 
456  if(threads>40){
457  f = 0.5;
458  n_gpu = (magma_int_t)(f*n)/64*64;
459  }
460  else if(threads>10){
461 #if (defined(PRECISION_s) || defined(PRECISION_d))
462  f = 0.68;
463 #else
464  f = 0.72;
465 #endif
466  n_gpu = (magma_int_t)(f*n)/64*64;
467  }
468  else if(threads>5){
469 #if (defined(PRECISION_s) || defined(PRECISION_d))
470  f = 0.84;
471 #else
472  f = 0.86;
473 #endif
474  n_gpu = (magma_int_t)(f*n)/64*64;
475  }
476 
477  /****************************************************
478  * apply V2 from Right to Q1. da = da*(I-V2*T2*V2')
479  * **************************************************/
480  timeaplQ2 = magma_wtime();
481  /*============================
482  * use GPU+CPU's
483  *==========================*/
484  if(n_gpu < n)
485  {
486 
487  // define the size of Q to be done on CPU's and the size on GPU's
488  // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)
489 
490  magma_zgetmatrix( n-n_gpu, n, &da[n_gpu], lda, &A[n_gpu], lda);
491 
492  printf("---> calling GPU + CPU to apply V2 to Q1 with N %d N_GPU %d N_CPU %d\n",n, n_gpu, n-n_gpu);
493 
494  applyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, wantz, A, lda, V, ldv, TAU, T, ldt, da, lda);
495 
496  applyQ_id_data* arg;
497  magma_malloc_cpu((void**) &arg, threads*sizeof(applyQ_id_data));
498 
499  pthread_t* thread_id;
500  magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
501 
502  //pthread_attr_t thread_attr;
503 
504  // ===============================
505  // relaunch thread to apply Q
506  // ===============================
507  // Set one thread per core
508  pthread_attr_init(&thread_attr);
510  pthread_setconcurrency(threads);
511 
512  // Launch threads
513  for (magma_int_t thread = 1; thread < threads; thread++)
514  {
515  arg[thread] = applyQ_id_data(thread, &data_applyQ);
516  pthread_create(&thread_id[thread], &thread_attr, applyQ_parallel_section, &arg[thread]);
517  }
518  arg[0] = applyQ_id_data(0, &data_applyQ);
519  applyQ_parallel_section(&arg[0]);
520 
521  // Wait for completion
522  for (magma_int_t thread = 1; thread < threads; thread++)
523  {
524  void *exitcodep;
525  pthread_join(thread_id[thread], &exitcodep);
526  }
527 
528  magma_free_cpu(thread_id);
529  magma_free_cpu(arg);
530 
531  magma_zsetmatrix( n-n_gpu, n, &A[n_gpu], lda, &da[n_gpu], lda);
532 
533  /*============================
534  * use only GPU
535  *==========================*/
536  }else{
537  magma_zbulge_applyQ_v2('R', n, n, nb, Vblksiz, dQ1, n, V, ldv, T, ldt, &info);
538  }
539  timeaplQ2 = magma_wtime()-timeaplQ2;
540 
541  /****************************************************
542  * compute the GEMM of Q*Z
543  * **************************************************/
544 
545  if(MAGMA_SUCCESS != magma_zmalloc( &dZ, n*ne )) {
546  printf ("!!!! magma_alloc failed for: dZ\n" );
547  exit(-1);
548  }
549  timegemm = magma_wtime();
550  // copy the eigenvectors to GPU
551  magma_zsetmatrix( n, ne, Z, ldz, dZ, n );
552  //make a gemm of (Q1 * Q2) * Z = da * dZ --> dZ2
553  magmaDoubleComplex *dZ2;
554  if(MAGMA_SUCCESS != magma_zmalloc( &dZ2, n*ne )) {
555  printf ("!!!! magma_alloc failed for: dZ2\n" );
556  exit(-1);
557  }
558  magma_zgemm( MagmaNoTrans, MagmaNoTrans, n, ne, n, c_one, da, n, dZ, n, c_zero, dZ2, n);
559  magma_zgetmatrix( n, ne, dZ2, n, A, lda );
560  magma_free(dZ2);
561  timegemm = magma_wtime()-timegemm;
562  }
563  else if(wantz==3 || wantz==4){
564 
566  // WANTZ 3 and 4
568 
569  double f= 1.;
570  magma_int_t n_gpu = ne;
571 
572  if(threads>40){
573  f = 0.5;
574  n_gpu = (magma_int_t)(f*ne)/64*64;
575  }
576  else if(threads>10){
577 #if (defined(PRECISION_s) || defined(PRECISION_d))
578  f = 0.68;
579 #else
580  f = 0.72;
581 #endif
582  n_gpu = (magma_int_t)(f*ne)/64*64;
583  }
584  else if(threads>5){
585 #if (defined(PRECISION_s) || defined(PRECISION_d))
586  f = 0.82;
587 #else
588  f = 0.86;
589 #endif
590  n_gpu = (magma_int_t)(f*ne)/64*64;
591  }
592  else if(threads>2){
593 #if (defined(PRECISION_s) || defined(PRECISION_d))
594  f = 0.96;
595 #else
596  f = 0.96;
597 #endif
598  n_gpu = (magma_int_t)(f*ne)/64*64;
599  }
600 
601  /****************************************************
602  * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
603  * **************************************************/
604  if(MAGMA_SUCCESS != magma_zmalloc( &dZ, n*ne )) {
605  printf ("!!!! magma_alloc failed for: dZ\n" );
606  exit(-1);
607  }
608  timeaplQ2 = magma_wtime();
609 
610  /*============================
611  * use GPU+CPU's
612  *==========================*/
613  if(n_gpu < ne)
614  {
615 
616  // define the size of Q to be done on CPU's and the size on GPU's
617  // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)
618 
619  printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu);
620 
621  applyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, wantz, Z, ldz, V, ldv, TAU, T, ldt, dZ, n);
622 
623  applyQ_id_data* arg;
624  magma_malloc_cpu((void**) &arg, threads*sizeof(applyQ_id_data));
625 
626  pthread_t* thread_id;
627  magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
628 
629  pthread_attr_t thread_attr;
630 
631  // ===============================
632  // relaunch thread to apply Q
633  // ===============================
634  // Set one thread per core
635  pthread_attr_init(&thread_attr);
637  pthread_setconcurrency(threads);
638 
639  // Launch threads
640  for (magma_int_t thread = 1; thread < threads; thread++)
641  {
642  arg[thread] = applyQ_id_data(thread, &data_applyQ);
643  pthread_create(&thread_id[thread], &thread_attr, applyQ_parallel_section, &arg[thread]);
644  }
645  arg[0] = applyQ_id_data(0, &data_applyQ);
646  applyQ_parallel_section(&arg[0]);
647 
648  // Wait for completion
649  for (magma_int_t thread = 1; thread < threads; thread++)
650  {
651  void *exitcodep;
652  pthread_join(thread_id[thread], &exitcodep);
653  }
654 
655  magma_free_cpu(thread_id);
656  magma_free_cpu(arg);
657 
658  magma_zsetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, n);
659 
660  /*============================
661  * use only GPU
662  *==========================*/
663  }else{
664  magma_zsetmatrix(n, ne, Z, ldz, dZ, n);
665  magma_zbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, n, V, ldv, T, ldt, &info);
667  }
668  timeaplQ2 = magma_wtime()-timeaplQ2;
669 
670  if (wantz==3){
671  /****************************************************
672  * compute the GEMM of Q1 * (Q2*Z)
673  * **************************************************/
674  printf("calling zgemm\n");
675  timegemm = magma_wtime();
676  //make a gemm of Q1 * (Q2 * Z) = Q1 * ((I-V2T2V2')*Z) = da * dZ --> dZ2
677  magmaDoubleComplex *dZ2;
678  if(MAGMA_SUCCESS != magma_zmalloc( &dZ2, n*ne )) {
679  printf ("!!!! magma_alloc failed for: dZ2\n" );
680  exit(-1);
681  }
682  magma_zgemm( MagmaNoTrans, MagmaNoTrans, n, ne, n, c_one, dQ1, lddq1, dZ, n, c_zero, dZ2, n);
683  magma_zgetmatrix( n, ne, dZ2, n, A, lda );
684  magma_free(dZ2);
685  timegemm = magma_wtime()-timegemm;
686  }else{
687  /****************************************************
688  * apply Q1 to (Q2*Z)
689  * **************************************************/
690  printf("calling zunmqr_gpu_2stages\n");
691  timegemm = magma_wtime();
692  magma_zunmqr_gpu_2stages(MagmaLeft, MagmaNoTrans, n-nb, ne, n-nb, dQ1+nb, lddq1,
693  dZ+nb, n, dT1, nb, &info);
694  magma_free(dT1);
695  magma_zgetmatrix( n, ne, dZ, n, A, lda );
696  timegemm = magma_wtime()-timegemm;
697  }
698  }
700 
701  magma_free_cpu(Z);
702  magma_free(dZ);
703  magma_free(dQ1);
704  }
705  timeeigen = magma_wtime()-timeeigen;
706 
707 
708 fin:
709 
710  magma_free_cpu(A2);
711  magma_free_cpu(TAU);
712  magma_free_cpu(V);
713  magma_free_cpu(T);
714  //magma_free_pinned(V);
715  //magma_free_pinned(T);
716 
717  printf("============================================================================\n");
718  printf(" Finish WANTZ %d computing Q2 timing= %lf \n" ,wantz, timeaplQ2);
719 
720  printf(" Finish WANTZ %d gemm Q1 / apply Q1 timing= %lf \n" ,wantz, timegemm);
721  printf(" Finish WANTZ %d eigensolver 'I' timing= %lf threads %d N %d NE %d\n" ,wantz, timelpk, mklth, n, ne);
722 
723  printf(" Finish WANTZ %d full Eigenvectros timing= %lf \n",wantz, timeeigen);
724  printf("============================================================================\n");
725 
726 
727 
728  return MAGMA_SUCCESS;
729 }
#define min(a, b)
Definition: common_magma.h:86
void magma_device_sync()
void magma_zstedc_withZ(char JOBZ, magma_int_t N, double *D, double *E, magmaDoubleComplex *Z, magma_int_t LDZ)
Definition: zbulge_aux.cpp:20
#define MagmaLeft
Definition: magma.h:68
MAGMA_DLLPORT int MAGMA_CDECL pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start)(void *), void *arg)
magma_int_t ldv
static magma_err_t magma_zmalloc(magmaDoubleComplex_ptr *ptrPtr, size_t n)
Definition: magma.h:80
#define MAGMA_ERR_DEVICE_ALLOC
Definition: magma_types.h:276
#define T(m)
Definition: zgeqrf_mc.cpp:14
#define E(m, n)
static void * parallel_section(void *arg)
#define magma_free(ptr)
Definition: magma.h:57
MAGMA_DLLPORT int MAGMA_CDECL pthread_join(pthread_t thread, void **value_ptr)
#define magma_zgetmatrix(m, n, dA_src, ldda, hB_dst, ldb)
Definition: magmablas_z.h:705
int magma_int_t
Definition: magmablas.h:12
magma_err_t magma_malloc_cpu(void **ptrPtr, size_t bytes)
magma_int_t magma_zunmqr_gpu_2stages(char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dC, magma_int_t lddc, magmaDoubleComplex *dT, magma_int_t nb, magma_int_t *info)
void magma_zgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dB, magma_int_t lddb, magmaDoubleComplex beta, magmaDoubleComplex_ptr dC, magma_int_t lddc)
#define PTHREAD_SCOPE_SYSTEM
#define Z(ix, iy)
Definition: dstedx.cpp:14
MAGMA_DLLPORT int MAGMA_CDECL pthread_attr_setscope(pthread_attr_t *attr, int scope)
#define MagmaNoVec
Definition: magma_types.h:334
#define MagmaLower
Definition: magma.h:62
void magma_setlapack_numthreads(magma_int_t num_threads)
#define TAU(m)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
magma_int_t magma_bulge_get_blkcnt(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
magma_int_t magma_zbulge_applyQ_v2(char side, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex *dE, magma_int_t ldde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t *info)
#define MAGMA_Z_ZERO
Definition: magma.h:131
static void * applyQ_parallel_section(void *arg)
void magma_zstedx_withZ(magma_int_t N, magma_int_t NE, double *D, double *E, magmaDoubleComplex *Z, magma_int_t LDZ)
Definition: zbulge_aux.cpp:67
double magma_wtime(void)
Definition: timer.cpp:110
#define A(i, j)
Definition: cprint.cpp:16
#define MAGMA_SUCCESS
Definition: magma.h:106
static magma_err_t magma_zmalloc_cpu(magmaDoubleComplex **ptrPtr, size_t n)
Definition: magma.h:86
void cblas_zcopy(const int N, const void *X, const int incX, void *Y, const int incY)
MAGMA_DLLPORT int MAGMA_CDECL pthread_setconcurrency(int level)
#define MAGMA_Z_ONE
Definition: magma.h:132
#define MagmaNoTrans
Definition: magma.h:57
#define magma_zsetmatrix(m, n, hA_src, lda, dB_dst, lddb)
Definition: magmablas_z.h:702
magma_err_t magma_free_cpu(void *ptr)
#define V(m)
#define MAGMA_Z_REAL(a)
Definition: magma.h:124
MAGMA_DLLPORT int MAGMA_CDECL pthread_attr_init(pthread_attr_t *attr)
int pthread_attr_t

Here is the call graph for this function:

Here is the caller graph for this function:

void magma_zstedc_withZ ( char  JOBZ,
magma_int_t  n,
double *  D,
double *  E,
magmaDoubleComplex *  Z,
magma_int_t  LDZ 
)

Definition at line 20 of file zbulge_aux.cpp.

References lapackf77_zstedc, and magma_free_cpu().

20  {
21  magmaDoubleComplex *WORK;
22  double *RWORK;
23  magma_int_t *IWORK;
24  magma_int_t LWORK, LIWORK, LRWORK;
25  magma_int_t INFO;
26 
27  if(JOBZ=='V'){
28  LWORK = N*N;
29  LRWORK = 1 + 3*N + 3*N*((magma_int_t)log2(N)+1) + 4*N*N+ 256*N;
30  LIWORK = 6 + 6*N + 6*N*((magma_int_t)log2(N)+1) + 256*N;
31  }else if(JOBZ=='I'){
32  LWORK = N;
33  LRWORK = 2*N*N+4*N+1+256*N;
34  LIWORK = 256*N;
35  }else if(JOBZ=='N'){
36  LWORK = N;
37  LRWORK = 256*N+1;
38  LIWORK = 256*N;
39  }else{
40  printf("ERROR JOBZ %c\n",JOBZ);
41  exit(-1);
42  }
43 
44  RWORK = (double*) malloc( LRWORK*sizeof( double) );
45  WORK = (magmaDoubleComplex*) malloc( LWORK*sizeof( magmaDoubleComplex) );
46  IWORK = (magma_int_t*) malloc( LIWORK*sizeof( magma_int_t) );
47 
48  lapackf77_zstedc(&JOBZ, &N, D, E, Z, &LDZ, WORK, &LWORK, RWORK, &LRWORK, IWORK, &LIWORK, &INFO);
49 
50  if(INFO!=0){
51  printf("=================================================\n");
52  printf("ZSTEDC ERROR OCCURED. HERE IS INFO %d \n ", (int) INFO);
53  printf("=================================================\n");
54  //assert(INFO==0);
55  }
56 
57 
58  magma_free_cpu( IWORK );
59  magma_free_cpu( WORK );
60  magma_free_cpu( RWORK );
61 }
int magma_int_t
Definition: magmablas.h:12
#define Z(ix, iy)
Definition: dstedx.cpp:14
#define lapackf77_zstedc
Definition: magma_zlapack.h:95
#define E(m, n)
magma_err_t magma_free_cpu(void *ptr)

Here is the call graph for this function:

Here is the caller graph for this function:

void magma_zstedx_withZ ( magma_int_t  n,
magma_int_t  ne,
double *  D,
double *  E,
magmaDoubleComplex *  Z,
magma_int_t  LDZ 
)

Definition at line 67 of file zbulge_aux.cpp.

References dwork, get_current_time(), GetTimerValue(), magma_dmalloc(), magma_free, magma_free_cpu(), MAGMA_SUCCESS, and magma_zstedx().

67  {
68  double *RWORK;
69  double *dwork;
70  magma_int_t *IWORK;
71  magma_int_t LWORK, LIWORK, LRWORK;
72  magma_int_t INFO;
73 
74  LWORK = N;
75  LRWORK = 2*N*N+4*N+1+256*N;
76  LIWORK = 256*N;
77 
78  RWORK = (double*) malloc( LRWORK*sizeof( double) );
79  IWORK = (magma_int_t*) malloc( LIWORK*sizeof( magma_int_t) );
80 
81  if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*N*(N/2 + 1) )) {
82  printf("=================================================\n");
83  printf("ZSTEDC ERROR OCCURED IN CUDAMALLOC\n");
84  printf("=================================================\n");
85  return;
86  }
87  printf("using magma_zstedx\n");
88 
89 #ifdef ENABLE_TIMER
90  magma_timestr_t start, end;
91  start = get_current_time();
92 #endif
93 
94  char job = 'I';
95 
96  if(NE==N)
97  job = 'A';
98 
99  magma_zstedx(job, N, 0.,0., 1, NE, D, E, Z, LDZ, RWORK, LRWORK, IWORK, LIWORK, dwork, &INFO);
100 
101  if(INFO!=0){
102  printf("=================================================\n");
103  printf("ZSTEDC ERROR OCCURED. HERE IS INFO %d \n ", (int) INFO);
104  printf("=================================================\n");
105  //assert(INFO==0);
106  }
107 
108 #ifdef ENABLE_TIMER
109  end = get_current_time();
110  printf("time zstevx = %6.2f\n", GetTimerValue(start,end)/1000.);
111 #endif
112 
113  magma_free( dwork );
114  magma_free_cpu( IWORK );
115  magma_free_cpu( RWORK );
116 }
static magma_err_t magma_dmalloc(magmaDouble_ptr *ptrPtr, size_t n)
Definition: magma.h:78
#define magma_free(ptr)
Definition: magma.h:57
int magma_int_t
Definition: magmablas.h:12
#define dwork(dev, i, j)
#define Z(ix, iy)
Definition: dstedx.cpp:14
#define MAGMA_SUCCESS
Definition: magma.h:106
#define E(m, n)
magma_int_t magma_zstedx(char range, magma_int_t n, double vl, double vu, magma_int_t il, magma_int_t iu, double *D, double *E, magmaDoubleComplex *Z, magma_int_t ldz, double *rwork, magma_int_t ldrwork, magma_int_t *iwork, magma_int_t liwork, double *dwork, magma_int_t *info)
Definition: zstedx.cpp:16
double GetTimerValue(magma_timestr_t start, magma_timestr_t end)
Definition: timer.cpp:94
magma_err_t magma_free_cpu(void *ptr)
magma_timestr_t get_current_time(void)
Definition: timer.cpp:76

Here is the call graph for this function:

Here is the caller graph for this function:

static void * parallel_section ( void *  arg)
static

Definition at line 733 of file zhetrd_bhe2trc_v5.cpp.

References A, barrier(), magma_device_sync(), magma_setlapack_numthreads(), magma_wtime(), magma_zungqr_2stage_gpu(), sys_corenbr, T, TAU, tile_bulge_computeT_parallel(), tile_bulge_parallel(), and V.

734 {
735  magma_int_t my_core_id = ((bulge_id_data*)arg) -> id;
736  bulge_data* data = ((bulge_id_data*)arg) -> data;
737 
738  magma_int_t allcores_num = data -> threads_num;
739  magma_int_t n = data -> n;
740  magma_int_t nb = data -> nb;
741  magma_int_t nbtiles = data -> nbtiles;
742  magma_int_t band = data -> band;
743  magma_int_t grsiz = data -> grsiz;
744  magma_int_t Vblksiz = data -> Vblksiz;
745  magma_int_t wantz = data -> wantz;
746  magmaDoubleComplex *A = data -> A;
747  magma_int_t lda = data -> lda;
748  magmaDoubleComplex *V = data -> V;
749  magma_int_t ldv = data -> ldv;
750  magmaDoubleComplex *TAU = data -> TAU;
751  magmaDoubleComplex *T = data -> T;
752  magma_int_t ldt = data -> ldt;
753  magma_int_t computeQ1 = data -> computeQ1;
754  magmaDoubleComplex *dQ1 = data -> dQ1;
755  magma_int_t lddq1 = data -> lddq1;
756  magmaDoubleComplex *dT1 = data -> dT1;
757  volatile magma_int_t* prog = data -> prog;
758 
759  pthread_barrier_t* barrier = &(data -> barrier);
760 
762 
763  magma_int_t info;
764 
765  double timeB=0.0, timeT=0.0, timeaplQ1=0.0;
766 
767  // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
768  // it need that all threads setting it to 1.
770 
771 #if defined(MAGMA_SETAFFINITY)
772  // bind threads
773  cpu_set_t set;
774  // bind threads
775  CPU_ZERO( &set );
776  CPU_SET( my_core_id, &set );
777  sched_setaffinity( 0, sizeof(set), &set) ;
778 #endif
779 
780  if((wantz>0))
781  {
782  /* compute the Q1 overlapped with the bulge chasing+T.
783  * if all_cores_num=1 it call Q1 on GPU and then bulgechasing.
784  * otherwise the first thread run Q1 on GPU and
785  * the other threads run the bulgechasing.
786  * */
787 
788  if(allcores_num==1)
789  {
790  //=========================
791  // compute Q1
792  //=========================
793  if(computeQ1==1){
795  timeaplQ1 = magma_wtime();
796 
797  magma_zungqr_2stage_gpu(n, n, n, dQ1, lddq1, NULL, dT1, nb, &info);
799 
800  timeaplQ1 = magma_wtime()-timeaplQ1;
801  printf(" Finish applyQ1 timing= %lf \n" ,timeaplQ1);
802  }
803 
804  //=========================
805  // bulge chasing
806  //=========================
807  timeB = magma_wtime();
808 
809  tile_bulge_parallel(0, 1, A, lda, V, ldv, TAU, n, nb, nbtiles, band, grsiz, Vblksiz, prog);
810 
811  timeB = magma_wtime()-timeB;
812  printf(" Finish BULGE timing= %lf \n" ,timeB);
813 
814 
815  //=========================
816  // compute the T's to be used when applying Q2
817  //=========================
818  timeT = magma_wtime();
819  tile_bulge_computeT_parallel(0, 1, V, ldv, TAU, T, ldt, n, nb, Vblksiz);
820 
821  timeT = magma_wtime()-timeT;
822  printf(" Finish T's timing= %lf \n" ,timeT);
823 
824  }else{ // allcore_num > 1
825 
826  magma_int_t id = -1;
827  magma_int_t tot = -1;
828 
829  if(computeQ1==1){
830  id = my_core_id-1;
831  tot = allcores_num-1;
832  }
833  else {
834  id = my_core_id;
835  tot = allcores_num;
836  }
837 
838  if(computeQ1 == 1 && my_core_id == 0)
839  {
840  //=============================================
841  // compute Q1 on last newcoreid
842  //=============================================
844  timeaplQ1 = magma_wtime();
845 
846  magma_zungqr_2stage_gpu(n, n, n, dQ1, lddq1, NULL, dT1, nb, &info);
848 
849  timeaplQ1 = magma_wtime()-timeaplQ1;
850  printf(" Finish applyQ1 timing= %lf \n" ,timeaplQ1);
851 
852  }else{
853  //=========================
854  // bulge chasing
855  //=========================
856  if(id == 0)timeB = magma_wtime();
857 
858  tile_bulge_parallel(id, tot, A, lda, V, ldv, TAU, n, nb, nbtiles, band, grsiz, Vblksiz, prog);
859  pthread_barrier_wait(barrier);
860 
861  if(id == 0){
862  timeB = magma_wtime()-timeB;
863  printf(" Finish BULGE timing= %lf \n" ,timeB);
864  }
865 
866  //=========================
867  // compute the T's to be used when applying Q2
868  //=========================
869  if(id == 0)timeT = magma_wtime();
870 
871  tile_bulge_computeT_parallel(id, tot, V, ldv, TAU, T, ldt, n, nb, Vblksiz);
872  pthread_barrier_wait(barrier);
873 
874  if (id == 0){
875  timeT = magma_wtime()-timeT;
876  printf(" Finish T's timing= %lf \n" ,timeT);
877  }
878  }
879 
880  } // allcore == 1
881 
882  }else{ // WANTZ = 0
883 
884  //=========================
885  // bulge chasing
886  //=========================
887  if(my_core_id == 0)
888  timeB = magma_wtime();
889 
890  tile_bulge_parallel(my_core_id, allcores_num, A, lda, V, ldv, TAU, n, nb, nbtiles, band, grsiz, Vblksiz, prog);
891 
892  pthread_barrier_wait(barrier);
893 
894  if(my_core_id == 0){
895  timeB = magma_wtime()-timeB;
896  printf(" Finish BULGE timing= %lf \n" ,timeB);
897  }
898  } // WANTZ > 0
899 
900 #if defined(MAGMA_SETAFFINITY)
901  // unbind threads
902  sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN);
903  CPU_ZERO( &set );
904  for(magma_int_t i=0; i<sys_corenbr; i++)
905  CPU_SET( i, &set );
906  sched_setaffinity( 0, sizeof(set), &set) ;
907 #endif
908 
909  return 0;
910 }
void magma_device_sync()
magma_int_t ldv
#define T(m)
Definition: zgeqrf_mc.cpp:14
int magma_int_t
Definition: magmablas.h:12
static void tile_bulge_parallel(magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t n, magma_int_t nb, magma_int_t nb_tiles, magma_int_t band, magma_int_t grsiz, magma_int_t Vblksiz, volatile magma_int_t *prog)
static void barrier(int my_core_id, int cores_num)
static void tile_bulge_computeT_parallel(magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
magma_int_t magma_zungqr_2stage_gpu(magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magma_int_t nb, magma_int_t *info)
void magma_setlapack_numthreads(magma_int_t num_threads)
#define TAU(m)
double magma_wtime(void)
Definition: timer.cpp:110
#define A(i, j)
Definition: cprint.cpp:16
static volatile int sys_corenbr
Definition: quarkos.c:68
#define V(m)

Here is the call graph for this function:

Here is the caller graph for this function:

static void tile_bulge_applyQ ( magma_int_t  wantz,
char  side,
magma_int_t  n_cpu,
magma_int_t  n,
magma_int_t  nb,
magma_int_t  Vblksiz,
magmaDoubleComplex *  E,
magma_int_t  lde,
magmaDoubleComplex *  V,
magma_int_t  ldv,
magmaDoubleComplex *  TAU,
magmaDoubleComplex *  T,
magma_int_t  ldt 
)
static

Definition at line 1285 of file zhetrd_bhe2trc_v5.cpp.

References E, lapackf77_zlarfb, magma_bulge_findVTpos(), magma_ceildiv(), magma_free_cpu(), magma_zmalloc_cpu(), max, min, T, and V.

1287 {
1288  //%===========================
1289  //% local variables
1290  //%===========================
1291  magma_int_t firstcolj;
1292  magma_int_t bg, rownbm;
1293  magma_int_t st,ed,fst,vlen,vnb,colj;
1294  magma_int_t vpos,tpos;
1295  magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
1296  magma_int_t nbgr, colst, coled;
1297 
1298  if(n<=0)
1299  return ;
1300  if(n_loc<=0)
1301  return ;
1302 
1303  //info = 0;
1304  magma_int_t INFO=0;
1305 
1306  magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz);
1307 
1308  /* use colpercore = N/cores_num; :if i want to split E into
1309  * cores_num chunk so I will have large chunk for each core.
1310  * However I prefer to split E into chunk of small size where
1311  * I guarantee that blas3 occur and the size of chunk remain into
1312  * cache, I think it is better. than I will have different chunk of
1313  * small sizeper coreand i will do each chunk till the end and
1314  * then move tothe second one for data locality
1315  *
1316  * version v1: for each chunck it apply all the V's then move to
1317  * the other chunck. the locality here inside each
1318  * chunck meaning that thread t apply V_k then move
1319  * to V_k+1 which overlap with V_k meaning that the
1320  * E_k+1 overlap with E_k. so here is the
1321  * locality however thread t had to read V_k+1 and
1322  * T_k+1 at each apply. note that all thread if they
1323  * run at same speed they might reading the same V_k
1324  * and T_k at the same time.
1325  * */
1326 
1327  magma_int_t nb_loc = 128; //$$$$$$$$
1328 
1329  magma_int_t lwork = 2*nb_loc*max(Vblksiz,64);
1330  magmaDoubleComplex *work, *work2;
1331 
1332  magma_zmalloc_cpu(&work, lwork);
1333  magma_zmalloc_cpu(&work2, lwork);
1334 
1335  magma_int_t nbchunk = magma_ceildiv(n_loc, nb_loc);
1336 
1337  /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1
1338  * each q_i consist of applying V to a block of row E(row_i,:) and applies are overlapped meaning
1339  * that q_i+1 overlap a portion of the E(row_i, :).
1340  * IN parallel E is splitten in vertical block over the threads */
1341  /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n
1342  * each q_i consist of applying V to a block of col E(:, col_i,:) and the applies are overlapped meaning
1343  * that q_i+1 overlap a portion of the E(:, col_i).
1344  * IN parallel E is splitten in horizontal block over the threads */
1345 
1346  printf(" APPLY Q2 N %d N_loc %d nbchunk %d NB %d Vblksiz %d SIDE %c WANTZ %d \n", n, n_loc, nbchunk, nb, Vblksiz, side, wantz);
1347  for (magma_int_t i = 0; i<nbchunk; i++)
1348  {
1349  magma_int_t ib_loc = min(nb_loc, (n_loc - i*nb_loc));
1350 
1351  if(side=='L'){
1352  for (bg = nbGblk; bg>0; bg--)
1353  {
1354  firstcolj = (bg-1)*Vblksiz + 1;
1355  rownbm = magma_ceildiv((n-(firstcolj+1)),nb);
1356  if(bg==nbGblk) rownbm = magma_ceildiv((n-(firstcolj)),nb); // last blk has size=1 used for complex to handle A(N,N-1)
1357  for (magma_int_t j = rownbm; j>0; j--)
1358  {
1359  vlen = 0;
1360  vnb = 0;
1361  colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop
1362  fst = (rownbm -j)*nb+colj +1;
1363  for (magma_int_t k=0; k<Vblksiz; k++)
1364  {
1365  colj = (bg-1)*Vblksiz + k;
1366  st = (rownbm -j)*nb+colj +1;
1367  ed = min(st+nb-1,n-1);
1368  if(st>ed)
1369  break;
1370  if((st==ed)&&(colj!=n-2))
1371  break;
1372  vlen=ed-fst+1;
1373  vnb=k+1;
1374  }
1375  colst = (bg-1)*Vblksiz;
1376  magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos);
1377 
1378  if((vlen>0)&&(vnb>0)){
1379  lapackf77_zlarfb( "L", "N", "F", "C", &vlen, &ib_loc, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(fst,i*nb_loc), &lde, work, &ib_loc);
1380  }
1381  if(INFO!=0)
1382  printf("ERROR ZUNMQR INFO %d \n",INFO);
1383  }
1384  }
1385  }else if (side=='R'){
1386  rownbm = magma_ceildiv((n-1),nb);
1387  for (magma_int_t k = 1; k<=rownbm; k++)
1388  {
1389  ncolinvolvd = min(n-1, k*nb);
1390  avai_blksiz=min(Vblksiz,ncolinvolvd);
1391  nbgr = magma_ceildiv(ncolinvolvd,avai_blksiz);
1392  for (magma_int_t j = 1; j<=nbgr; j++)
1393  {
1394  vlen = 0;
1395  vnb = 0;
1396  cur_blksiz = min(ncolinvolvd-(j-1)*avai_blksiz, avai_blksiz);
1397  colst = (j-1)*avai_blksiz;
1398  coled = colst + cur_blksiz -1;
1399  fst = (rownbm -k)*nb+colst +1;
1400  for (colj=colst; colj<=coled; colj++)
1401  {
1402  st = (rownbm -k)*nb+colj +1;
1403  ed = min(st+nb-1,n-1);
1404  if(st>ed)
1405  break;
1406  if((st==ed)&&(colj!=n-2))
1407  break;
1408  vlen=ed-fst+1;
1409  vnb=vnb+1;
1410  }
1411  magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos);
1412  if((vlen>0)&&(vnb>0)){
1413  lapackf77_zlarfb( "R", "N", "F", "C", &ib_loc, &vlen, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(i*nb_loc,fst), &lde, work, &ib_loc);
1414  }
1415  }
1416  }
1417  }else{
1418  printf("ERROR SIDE %d \n",side);
1419  }
1420  } // END loop over the chunks
1421 
1422  magma_free_cpu(work);
1423  magma_free_cpu(work2);
1424 
1425 }
#define min(a, b)
Definition: common_magma.h:86
magma_int_t ldv
#define T(m)
Definition: zgeqrf_mc.cpp:14
#define E(m, n)
#define lapackf77_zlarfb
Definition: magma_zlapack.h:78
int magma_int_t
Definition: magmablas.h:12
void magma_bulge_findVTpos(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magma_int_t sweep, magma_int_t st, magma_int_t ldv, magma_int_t ldt, magma_int_t *Vpos, magma_int_t *Tpos)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
static magma_err_t magma_zmalloc_cpu(magmaDoubleComplex **ptrPtr, size_t n)
Definition: magma.h:86
#define max(a, b)
Definition: common_magma.h:82
magma_err_t magma_free_cpu(void *ptr)
#define V(m)

Here is the call graph for this function:

Here is the caller graph for this function:

static void tile_bulge_computeT_parallel ( magma_int_t  my_core_id,
magma_int_t  cores_num,
magmaDoubleComplex *  V,
magma_int_t  ldv,
magmaDoubleComplex *  TAU,
magmaDoubleComplex *  T,
magma_int_t  ldt,
magma_int_t  n,
magma_int_t  nb,
magma_int_t  Vblksiz 
)
static

Definition at line 1075 of file zhetrd_bhe2trc_v5.cpp.

References lapackf77_zlarft, magma_bulge_findVTAUTpos(), magma_bulge_get_blkcnt(), magma_ceildiv(), min, T, TAU, and V.

1077 {
1078  //%===========================
1079  //% local variables
1080  //%===========================
1081  magma_int_t firstcolj;
1082  magma_int_t rownbm;
1083  magma_int_t st,ed,fst,vlen,vnb,colj;
1084  magma_int_t blkid,vpos,taupos,tpos;
1085  magma_int_t blkpercore, myid;
1086 
1087  if(n<=0)
1088  return ;
1089  magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
1090 
1091  blkpercore = blkcnt/cores_num;
1092 
1093  magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz);
1094 
1095  if(my_core_id==0) printf(" COMPUTE T parallel threads %d with N %d NB %d Vblksiz %d \n",cores_num,n,nb,Vblksiz);
1096 
1097  for (magma_int_t bg = nbGblk; bg>0; bg--)
1098  {
1099  firstcolj = (bg-1)*Vblksiz + 1;
1100  rownbm = magma_ceildiv(n-(firstcolj+1), nb);
1101  if(bg==nbGblk)
1102  rownbm = magma_ceildiv(n-firstcolj ,nb); // last blk has size=1 used for complex to handle A(N,N-1)
1103 
1104  for (magma_int_t m = rownbm; m>0; m--)
1105  {
1106  vlen = 0;
1107  vnb = 0;
1108  colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop
1109  fst = (rownbm -m)*nb+colj +1;
1110  for (magma_int_t k=0; k<Vblksiz; k++)
1111  {
1112  colj = (bg-1)*Vblksiz + k;
1113  st = (rownbm -m)*nb+colj +1;
1114  ed = min(st+nb-1,n-1);
1115  if(st>ed)
1116  break;
1117  if((st==ed)&&(colj!=n-2))
1118  break;
1119 
1120  vlen=ed-fst+1;
1121  vnb=k+1;
1122  }
1123  colj = (bg-1)*Vblksiz;
1124  magma_bulge_findVTAUTpos(n, nb, Vblksiz, colj, fst, ldv, ldt, &vpos, &taupos, &tpos, &blkid);
1125  myid = blkid/blkpercore;
1126  if(my_core_id==(myid%cores_num)){
1127  if((vlen>0)&&(vnb>0))
1128  lapackf77_zlarft( "F", "C", &vlen, &vnb, V(vpos), &ldv, TAU(taupos), T(tpos), &ldt);
1129  }
1130  }
1131  }
1132 }
#define min(a, b)
Definition: common_magma.h:86
magma_int_t ldv
#define lapackf77_zlarft
Definition: magma_zlapack.h:80
#define T(m)
Definition: zgeqrf_mc.cpp:14
int magma_int_t
Definition: magmablas.h:12
#define TAU(m)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
magma_int_t magma_bulge_get_blkcnt(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
void magma_bulge_findVTAUTpos(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magma_int_t sweep, magma_int_t st, magma_int_t ldv, magma_int_t ldt, magma_int_t *Vpos, magma_int_t *TAUpos, magma_int_t *Tpos, magma_int_t *blkid)
#define V(m)

Here is the call graph for this function:

Here is the caller graph for this function:

static void tile_bulge_parallel ( magma_int_t  my_core_id,
magma_int_t  cores_num,
magmaDoubleComplex *  A,
magma_int_t  lda,
magmaDoubleComplex *  V,
magma_int_t  ldv,
magmaDoubleComplex *  TAU,
magma_int_t  n,
magma_int_t  nb,
magma_int_t  nb_tiles,
magma_int_t  band,
magma_int_t  grsiz,
magma_int_t  Vblksiz,
volatile magma_int_t prog 
)
static

Definition at line 914 of file zhetrd_bhe2trc_v5.cpp.

References magma_ceildiv(), magma_free_cpu(), magma_zmalloc_cpu(), magma_ztrdtype1cbHLsym_withQ_v2(), magma_ztrdtype2cbHLsym_withQ_v2(), magma_ztrdtype3cbHLsym_withQ_v2(), and min.

917 {
918  magma_int_t sweepid, myid, shift, stt, st, ed, stind, edind;
919  magma_int_t blklastind, colpt;
920  magma_int_t stepercol;
921  magma_int_t i,j,m,k;
922  magma_int_t thgrsiz, thgrnb, thgrid, thed;
923  magma_int_t coreid;
924  magma_int_t colblktile,maxrequiredcores,colpercore,mycoresnb;
925  magma_int_t fin;
926  magmaDoubleComplex *work;
927 
928  if(n<=0)
929  return ;
930 
931  //printf("=================> my core id %d of %d \n",my_core_id, cores_num);
932 
933  if((band!=0) && (band!=6) && (band!=62) && (band!=63)){
934  if(my_core_id==0)printf(" ===============================================================================\n");
935  if(my_core_id==0)printf(" ATTENTION ========> BAND is required to be 0 6 62 63, program will exit\n");
936  if(my_core_id==0)printf(" ===============================================================================\n");
937  return;
938  }
939  /* As I store V in the V vector there are overlap between
940  * tasks so shift is now 4 where group need to be always
941  * multiple of 2, because as example if grs=1 task 2 from
942  * sweep 2 can run with task 6 sweep 1., but task 2 sweep 2
943  * will overwrite the V of tasks 5 sweep 1 which are used by
944  * task 6, so keep in mind that group need to be multiple of 2,
945  * and thus tasks 2 sweep 2 will never run with task 6 sweep 1.
946  * However, when storing V in A, shift could be back to 3.
947  * */
948 
949  magma_zmalloc_cpu(&work, n);
950 
951  mycoresnb = cores_num;
952 
953  shift = 5;
954  if(grsiz==1)
955  colblktile=1;
956  else
957  colblktile=grsiz/2;
958 
959  maxrequiredcores = nbtiles/colblktile;
960  if(maxrequiredcores<1)maxrequiredcores=1;
961  colpercore = colblktile*nb;
962  if(mycoresnb > maxrequiredcores)
963  {
964  if(my_core_id==0)printf("==================================================================================\n");
965  if(my_core_id==0)printf(" WARNING only %3d threads are required to run this test optimizing cache reuse\n",maxrequiredcores);
966  if(my_core_id==0)printf("==================================================================================\n");
967  mycoresnb = maxrequiredcores;
968  }
969  thgrsiz = n;
970 
971  if(my_core_id==0) printf(" Static bulgechasing version v9_9col threads %4d N %5d NB %5d grs %4d thgrsiz %4d BAND %4d\n",cores_num, n, nb, grsiz,thgrsiz,band);
972 
973  stepercol = magma_ceildiv(shift, grsiz);
974 
975  thgrnb = magma_ceildiv(n-1, thgrsiz);
976 
977  for (thgrid = 1; thgrid<=thgrnb; thgrid++){
978  stt = (thgrid-1)*thgrsiz+1;
979  thed = min( (stt + thgrsiz -1), (n-1));
980  for (i = stt; i <= n-1; i++){
981  ed=min(i,thed);
982  if(stt>ed)break;
983  for (m = 1; m <=stepercol; m++){
984  st=stt;
985  for (sweepid = st; sweepid <=ed; sweepid++){
986 
987  for (k = 1; k <=grsiz; k++){
988  myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k;
989  if(myid%2 ==0){
990  colpt = (myid/2)*nb+1+sweepid-1;
991  stind = colpt-nb+1;
992  edind = min(colpt,n);
993  blklastind = colpt;
994  if(stind>=edind){
995  printf("ERROR---------> st>=ed %d %d \n\n",stind, edind);
996  exit(-10);
997  }
998  }else{
999  colpt = ((myid+1)/2)*nb + 1 +sweepid -1 ;
1000  stind = colpt-nb+1;
1001  edind = min(colpt,n);
1002  if( (stind>=edind-1) && (edind==n) )
1003  blklastind=n;
1004  else
1005  blklastind=0;
1006  if(stind>edind){
1007  printf("ERROR---------> st>=ed %d %d \n\n",stind, edind);
1008  exit(-10);
1009  }
1010  }
1011 
1012  coreid = (stind/colpercore)%mycoresnb;
1013 
1014  if(my_core_id==coreid)
1015  {
1016 
1017  fin=0;
1018  while(fin==0)
1019  {
1020  if(myid==1)
1021  {
1022  if( (prog[myid+shift-1]== (sweepid-1)) )
1023  {
1024  magma_ztrdtype1cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);
1025 
1026  fin=1;
1027  prog[myid]= sweepid;
1028  if(blklastind >= (n-1))
1029  {
1030  for (j = 1; j <= shift; j++)
1031  prog[myid+j]=sweepid;
1032  }
1033  } // END progress condition
1034 
1035  }else{
1036  if( (prog[myid-1]==sweepid) && (prog[myid+shift-1]== (sweepid-1)) )
1037  {
1038  if(myid%2 == 0)
1039  magma_ztrdtype2cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);
1040  else
1041  magma_ztrdtype3cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);
1042 
1043  fin=1;
1044  prog[myid]= sweepid;
1045  if(blklastind >= (n-1))
1046  {
1047  for (j = 1; j <= shift+mycoresnb; j++)
1048  prog[myid+j]=sweepid;
1049  }
1050  } // END progress condition
1051  } // END if myid==1
1052  } // END while loop
1053 
1054  } // END if my_core_id==coreid
1055 
1056  if(blklastind >= (n-1))
1057  {
1058  stt=stt+1;
1059  break;
1060  }
1061  } // END for k=1:grsiz
1062  } // END for sweepid=st:ed
1063  } // END for m=1:stepercol
1064  } // END for i=1:n-1
1065  } // END for thgrid=1:thgrnb
1066 
1067  magma_free_cpu(work);
1068 
1069 } // END FUNCTION
#define min(a, b)
Definition: common_magma.h:86
magma_int_t ldv
void magma_ztrdtype3cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz, magmaDoubleComplex *work)
int magma_int_t
Definition: magmablas.h:12
void magma_ztrdtype1cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz, magmaDoubleComplex *work)
#define TAU(m)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
#define A(i, j)
Definition: cprint.cpp:16
static magma_err_t magma_zmalloc_cpu(magmaDoubleComplex **ptrPtr, size_t n)
Definition: magma.h:86
void magma_ztrdtype2cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz, magmaDoubleComplex *work)
magma_err_t magma_free_cpu(void *ptr)
#define V(m)

Here is the call graph for this function:

Here is the caller graph for this function: