MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zhetrd_hb2st.cpp File Reference
#include "common_magma.h"
#include "magma_bulge.h"
#include "magma_zbulge.h"
#include <cblas.h>
Include dependency graph for zhetrd_hb2st.cpp:

Go to the source code of this file.

Data Structures

class  magma_zbulge_data
 
class  magma_zbulge_id_data
 

Macros

#define PRECISION_z
 
#define V(m)   &(V[(m)])
 
#define TAU(m)   &(TAU[(m)])
 
#define T(m)   &(T[(m)])
 

Functions

static void * magma_zhetrd_hb2st_parallel_section (void *arg)
 
static void magma_ztile_bulge_parallel (magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t n, magma_int_t nb, magma_int_t nbtiles, magma_int_t grsiz, magma_int_t Vblksiz, volatile magma_int_t *prog)
 
static void magma_ztile_bulge_computeT_parallel (magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
 
magma_int_t magma_zhetrd_hb2st (magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magmaDoubleComplex *A, magma_int_t lda, double *D, double *E, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t compT, magmaDoubleComplex *T, magma_int_t ldt)
 

Macro Definition Documentation

#define PRECISION_z

Definition at line 24 of file zhetrd_hb2st.cpp.

#define T (   m)    &(T[(m)])

Definition at line 645 of file zhetrd_hb2st.cpp.

#define TAU (   m)    &(TAU[(m)])

Definition at line 644 of file zhetrd_hb2st.cpp.

#define V (   m)    &(V[(m)])

Definition at line 643 of file zhetrd_hb2st.cpp.

Function Documentation

magma_int_t magma_zhetrd_hb2st ( magma_int_t  threads,
char  uplo,
magma_int_t  n,
magma_int_t  nb,
magma_int_t  Vblksiz,
magmaDoubleComplex *  A,
magma_int_t  lda,
double *  D,
double *  E,
magmaDoubleComplex *  V,
magma_int_t  ldv,
magmaDoubleComplex *  TAU,
magma_int_t  compT,
magmaDoubleComplex *  T,
magma_int_t  ldt 
)

Definition at line 112 of file zhetrd_hb2st.cpp.

References magma_bulge_get_blkcnt(), magma_ceildiv(), magma_free_cpu(), magma_malloc_cpu(), magma_setlapack_numthreads(), MAGMA_SUCCESS, magma_wtime(), MAGMA_Z_REAL, magma_zhetrd_hb2st_parallel_section(), MagmaLower, pthread_attr_init(), pthread_attr_setscope(), pthread_create(), pthread_join(), PTHREAD_SCOPE_SYSTEM, and pthread_setconcurrency().

115 {
116 /* -- MAGMA (version 1.4.0) --
117  Univ. of Tennessee, Knoxville
118  Univ. of California, Berkeley
119  Univ. of Colorado, Denver
120  August 2013
121 
122  Purpose
123  =======
124 
125 
126  Arguments
127  =========
128  THREADS (input) INTEGER
129  Specifies the number of pthreads used.
130  THREADS > 0
131 
132  UPLO (input) CHARACTER*1
133  = 'U': Upper triangles of A is stored;
134  = 'L': Lower triangles of A is stored.
135 
136  N (input) INTEGER
137  The order of the matrix A. N >= 0.
138 
139  NB (input) INTEGER
140  The order of the band matrix A. N >= NB >= 0.
141 
142  VBLKSIZ (input) INTEGER
143  The size of the block of householder vectors applied at once.
144 
145  A (input/workspace) COMPLEX_16 array, dimension (LDA, N)
146  On entry the band matrix stored in the following way:
147 
148  LDA (input) INTEGER
149  The leading dimension of the array A. LDA >= 2*NB.
150 
151  D (output) DOUBLE array, dimension (N)
152  The diagonal elements of the tridiagonal matrix T:
153  D(i) = A(i,i).
154 
155  E (output) DOUBLE array, dimension (N-1)
156  The off-diagonal elements of the tridiagonal matrix T:
157  E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.
158 
159  V (output) COMPLEX_16 array, dimension (BLKCNT, LDV, VBLKSIZ)
160  On exit it contains the blocks of householder reflectors
161  BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT.
162 
163  LDV (input) INTEGER
164  The leading dimension of V.
165  LDV > NB + VBLKSIZ + 1
166 
167  TAU (output) COMPLEX_16 dimension(BLKCNT, VBLKSIZ)
168  ???
169 
170  COMPT (input) INTEGER
171  if COMPT = 0 T is not computed
172  if COMPT = 1 T is computed
173 
174  T (output) COMPLEX_16 dimension(LDT *)
175  if COMPT = 1 on exit contains the matrices T needed for Q2
176  if COMPT = 0 T is not referenced
177 
178  LDT (input) INTEGER
179  The leading dimension of T.
180  LDT > Vblksiz
181 
182  INFO (output) INTEGER ????????????????????????????????????????????????????????????????????????????????????
183  = 0: successful exit
184 
185 
186  ===================================================================== */
187 
188  #ifdef ENABLE_TIMER
189  real_Double_t timeblg=0.0;
190  #endif
191 
192  //char uplo_[2] = {uplo, 0};
193  magma_int_t mklth = threads;
194  magma_int_t INgrsiz=1;
195  magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
196  magma_int_t nbtiles = magma_ceildiv(n, nb);
197 
198  memset(T, 0, blkcnt*ldt*Vblksiz*sizeof(magmaDoubleComplex));
199  memset(TAU, 0, blkcnt*Vblksiz*sizeof(magmaDoubleComplex));
200  memset(V, 0, blkcnt*ldv*Vblksiz*sizeof(magmaDoubleComplex));
201 
202  magma_int_t* prog;
203  magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t));
204  memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t));
205 
207  magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zbulge_id_data));
208 
209  pthread_t* thread_id;
210  magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
211  pthread_attr_t thread_attr;
212 
214  magma_zbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT,
215  A, lda, V, ldv, TAU, T, ldt, prog);
216 
217  // Set one thread per core
218  pthread_attr_init(&thread_attr);
220  pthread_setconcurrency(threads);
221 
222  //timing
223  #ifdef ENABLE_TIMER
224  timeblg = magma_wtime();
225  #endif
226 
227  // Launch threads
228  for (magma_int_t thread = 1; thread < threads; thread++)
229  {
230  arg[thread] = magma_zbulge_id_data(thread, &data_bulge);
231  pthread_create(&thread_id[thread], &thread_attr, magma_zhetrd_hb2st_parallel_section, &arg[thread]);
232  }
233  arg[0] = magma_zbulge_id_data(0, &data_bulge);
235 
236  // Wait for completion
237  for (magma_int_t thread = 1; thread < threads; thread++)
238  {
239  void *exitcodep;
240  pthread_join(thread_id[thread], &exitcodep);
241  }
242 
243  // timing
244  #ifdef ENABLE_TIMER
245  timeblg = magma_wtime()-timeblg;
246  printf(" time BULGE+T = %lf \n" ,timeblg);
247  #endif
248 
249  magma_free_cpu(thread_id);
250  magma_free_cpu(arg);
251  magma_free_cpu(prog);
252 
254  /*================================================
255  * store resulting diag and lower diag D and E
256  * note that D and E are always real
257  *================================================*/
258 
259  /* Make diagonal and superdiagonal elements real,
260  * storing them in D and E
261  */
262  /* In complex case, the off diagonal element are
263  * not necessary real. we have to make off-diagonal
264  * elements real and copy them to E.
265  * When using HouseHolder elimination,
266  * the ZLARFG give us a real as output so, all the
267  * diagonal/off-diagonal element except the last one are already
268  * real and thus we need only to take the abs of the last
269  * one.
270  * */
271 
272 #if defined(PRECISION_z) || defined(PRECISION_c)
273  if(uplo==MagmaLower){
274  for (magma_int_t i=0; i < n-1 ; i++)
275  {
276  D[i] = MAGMA_Z_REAL(A[i*lda ]);
277  E[i] = MAGMA_Z_REAL(A[i*lda+1]);
278  }
279  D[n-1] = MAGMA_Z_REAL(A[(n-1)*lda]);
280  } else { /* MagmaUpper not tested yet */
281  for (magma_int_t i=0; i<n-1; i++)
282  {
283  D[i] = MAGMA_Z_REAL(A[i*lda+nb]);
284  E[i] = MAGMA_Z_REAL(A[i*lda+nb-1]);
285  }
286  D[n-1] = MAGMA_Z_REAL(A[(n-1)*lda+nb]);
287  } /* end MagmaUpper */
288 #else
289  if( uplo == MagmaLower ){
290  for (magma_int_t i=0; i < n-1; i++) {
291  D[i] = A[i*lda]; // diag
292  E[i] = A[i*lda+1]; //lower diag
293  }
294  D[n-1] = A[(n-1)*lda];
295  } else {
296  for (magma_int_t i=0; i < n-1; i++) {
297  D[i] = A[i*lda+nb]; // diag
298  E[i] = A[i*lda+nb-1]; //lower diag
299  }
300  D[n-1] = A[(n-1)*lda+nb];
301  }
302 #endif
303  return MAGMA_SUCCESS;
304 
305 }
MAGMA_DLLPORT int MAGMA_CDECL pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start)(void *), void *arg)
magma_int_t ldv
#define V(m)
MAGMA_DLLPORT int MAGMA_CDECL pthread_join(pthread_t thread, void **value_ptr)
int magma_int_t
Definition: magmablas.h:12
magma_err_t magma_malloc_cpu(void **ptrPtr, size_t bytes)
#define T(m)
#define PTHREAD_SCOPE_SYSTEM
MAGMA_DLLPORT int MAGMA_CDECL pthread_attr_setscope(pthread_attr_t *attr, int scope)
#define MagmaLower
Definition: magma.h:62
void magma_setlapack_numthreads(magma_int_t num_threads)
static void * magma_zhetrd_hb2st_parallel_section(void *arg)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
magma_int_t magma_bulge_get_blkcnt(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
#define TAU(m)
double magma_wtime(void)
Definition: timer.cpp:110
#define A(i, j)
Definition: cprint.cpp:16
#define MAGMA_SUCCESS
Definition: magma.h:106
#define E(m, n)
MAGMA_DLLPORT int MAGMA_CDECL pthread_setconcurrency(int level)
double real_Double_t
Definition: magma_types.h:27
magma_err_t magma_free_cpu(void *ptr)
#define MAGMA_Z_REAL(a)
Definition: magma.h:124
MAGMA_DLLPORT int MAGMA_CDECL pthread_attr_init(pthread_attr_t *attr)
int pthread_attr_t

Here is the call graph for this function:

Here is the caller graph for this function:

static void * magma_zhetrd_hb2st_parallel_section ( void *  arg)
static

Definition at line 309 of file zhetrd_hb2st.cpp.

References A, barrier(), magma_setlapack_numthreads(), magma_wtime(), magma_ztile_bulge_computeT_parallel(), magma_ztile_bulge_parallel(), T, TAU, and V.

310 {
311  magma_int_t my_core_id = ((magma_zbulge_id_data*)arg) -> id;
312  magma_zbulge_data* data = ((magma_zbulge_id_data*)arg) -> data;
313 
314  magma_int_t allcores_num = data -> threads_num;
315  magma_int_t n = data -> n;
316  magma_int_t nb = data -> nb;
317  magma_int_t nbtiles = data -> nbtiles;
318  magma_int_t grsiz = data -> grsiz;
319  magma_int_t Vblksiz = data -> Vblksiz;
320  magma_int_t compT = data -> compT;
321  magmaDoubleComplex *A = data -> A;
322  magma_int_t lda = data -> lda;
323  magmaDoubleComplex *V = data -> V;
324  magma_int_t ldv = data -> ldv;
325  magmaDoubleComplex *TAU = data -> TAU;
326  magmaDoubleComplex *T = data -> T;
327  magma_int_t ldt = data -> ldt;
328  volatile magma_int_t* prog = data -> prog;
329 
330  pthread_barrier_t* barrier = &(data -> barrier);
331 
332  //magma_int_t sys_corenbr = 1;
333 
334  #ifdef ENABLE_TIMER
335  real_Double_t timeB=0.0, timeT=0.0;
336  #endif
337 
338  // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
339  // it need that all threads setting it to 1.
341 
342 #ifdef MAGMA_SETAFFINITY
343 //#define PRINTAFFINITY
344 #ifdef PRINTAFFINITY
345  affinity_set print_set;
346  print_set.print_affinity(my_core_id, "starting affinity");
347 #endif
348  affinity_set original_set;
349  affinity_set new_set(my_core_id);
350  int check = 0;
351  int check2 = 0;
352  // bind threads
353  check = original_set.get_affinity();
354  if (check == 0) {
355  check2 = new_set.set_affinity();
356  if (check2 != 0)
357  printf("Error in sched_setaffinity (single cpu)\n");
358  }
359  else
360  {
361  printf("Error in sched_getaffinity\n");
362  }
363 #ifdef PRINTAFFINITY
364  print_set.print_affinity(my_core_id, "set affinity");
365 #endif
366 #endif
367 
368  if(compT==1)
369  {
370  /* compute the Q1 overlapped with the bulge chasing+T.
371  * if all_cores_num=1 it call Q1 on GPU and then bulgechasing.
372  * otherwise the first thread run Q1 on GPU and
373  * the other threads run the bulgechasing.
374  * */
375 
376  if(allcores_num==1)
377  {
378 
379  //=========================
380  // bulge chasing
381  //=========================
382  #ifdef ENABLE_TIMER
383  timeB = magma_wtime();
384  #endif
385 
386  magma_ztile_bulge_parallel(0, 1, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog);
387 
388  #ifdef ENABLE_TIMER
389  timeB = magma_wtime()-timeB;
390  printf(" Finish BULGE timing= %lf \n" ,timeB);
391  #endif
392  //=========================
393  // compute the T's to be used when applying Q2
394  //=========================
395  #ifdef ENABLE_TIMER
396  timeT = magma_wtime();
397  #endif
398 
399  magma_ztile_bulge_computeT_parallel(0, 1, V, ldv, TAU, T, ldt, n, nb, Vblksiz);
400 
401  #ifdef ENABLE_TIMER
402  timeT = magma_wtime()-timeT;
403  printf(" Finish T's timing= %lf \n" ,timeT);
404  #endif
405 
406  }else{ // allcore_num > 1
407 
408  magma_int_t id = my_core_id;
409  magma_int_t tot = allcores_num;
410 
411 
412  //=========================
413  // bulge chasing
414  //=========================
415  #ifdef ENABLE_TIMER
416  if(id == 0)
417  timeB = magma_wtime();
418  #endif
419 
420  magma_ztile_bulge_parallel(id, tot, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog);
421  pthread_barrier_wait(barrier);
422 
423  #ifdef ENABLE_TIMER
424  if(id == 0){
425  timeB = magma_wtime()-timeB;
426  printf(" Finish BULGE timing= %lf \n" ,timeB);
427  }
428  #endif
429 
430  //=========================
431  // compute the T's to be used when applying Q2
432  //=========================
433  #ifdef ENABLE_TIMER
434  if(id == 0)
435  timeT = magma_wtime();
436  #endif
437 
438  magma_ztile_bulge_computeT_parallel(id, tot, V, ldv, TAU, T, ldt, n, nb, Vblksiz);
439  pthread_barrier_wait(barrier);
440 
441  #ifdef ENABLE_TIMER
442  if (id == 0){
443  timeT = magma_wtime()-timeT;
444  printf(" Finish T's timing= %lf \n" ,timeT);
445  }
446  #endif
447 
448  } // allcore == 1
449 
450  }else{ // WANTZ = 0
451 
452  //=========================
453  // bulge chasing
454  //=========================
455  #ifdef ENABLE_TIMER
456  if(my_core_id == 0)
457  timeB = magma_wtime();
458  #endif
459 
460  magma_ztile_bulge_parallel(my_core_id, allcores_num, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog);
461  pthread_barrier_wait(barrier);
462 
463  #ifdef ENABLE_TIMER
464  if(my_core_id == 0){
465  timeB = magma_wtime()-timeB;
466  printf(" Finish BULGE timing= %lf \n" ,timeB);
467  }
468  #endif
469 
470  } // WANTZ > 0
471 
472 #ifdef MAGMA_SETAFFINITY
473  // unbind threads
474  if (check == 0){
475  check2 = original_set.set_affinity();
476  if (check2 != 0)
477  printf("Error in sched_setaffinity (restore cpu list)\n");
478  }
479 #ifdef PRINTAFFINITY
480  print_set.print_affinity(my_core_id, "restored_affinity");
481 #endif
482 #endif
483 
484  return 0;
485 }
magma_int_t ldv
#define V(m)
int magma_int_t
Definition: magmablas.h:12
static void barrier(int my_core_id, int cores_num)
#define T(m)
void magma_setlapack_numthreads(magma_int_t num_threads)
#define TAU(m)
static void magma_ztile_bulge_parallel(magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t n, magma_int_t nb, magma_int_t nbtiles, magma_int_t grsiz, magma_int_t Vblksiz, volatile magma_int_t *prog)
double magma_wtime(void)
Definition: timer.cpp:110
#define A(i, j)
Definition: cprint.cpp:16
static void magma_ztile_bulge_computeT_parallel(magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
double real_Double_t
Definition: magma_types.h:27

Here is the call graph for this function:

Here is the caller graph for this function:

static void magma_ztile_bulge_computeT_parallel ( magma_int_t  my_core_id,
magma_int_t  cores_num,
magmaDoubleComplex *  V,
magma_int_t  ldv,
magmaDoubleComplex *  TAU,
magmaDoubleComplex *  T,
magma_int_t  ldt,
magma_int_t  n,
magma_int_t  nb,
magma_int_t  Vblksiz 
)
static

Definition at line 646 of file zhetrd_hb2st.cpp.

References lapackf77_zlarft, magma_bulge_findVTAUTpos(), magma_bulge_get_blkcnt(), magma_ceildiv(), min, T, TAU, and V.

648 {
649  //%===========================
650  //% local variables
651  //%===========================
652  magma_int_t Vm, Vn, mt, nt;
653  magma_int_t myrow, mycol, blkj, blki, firstrow;
654  magma_int_t blkid,vpos,taupos,tpos;
655  magma_int_t blkpercore, myid;
656 
657  if(n<=0)
658  return ;
659 
660  magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
661  blkpercore = blkcnt/cores_num;
662  blkpercore = blkpercore==0 ? 1:blkpercore;
663  //magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz);
664 
665  #ifdef ENABLE_DEBUG
666  if(my_core_id==0)
667  printf(" COMPUTE T parallel threads %d with N %d NB %d Vblksiz %d \n",cores_num,n,nb,Vblksiz);
668  #endif
669 
670 
671 
672  /*========================================
673  * compute the T's in parallel.
674  * The Ts are independent so each core pick
675  * a T and compute it. The loop is based on
676  * the version 113 of the applyQ
677  * which go over the losange block_column
678  * by block column. but it is not important
679  * here the order because Ts are independent.
680  * ========================================
681  */
682  nt = magma_ceildiv((n-1),Vblksiz);
683  for (blkj=nt-1; blkj>=0; blkj--) {
684  /* the index of the first row on the top of block (blkj) */
685  firstrow = blkj * Vblksiz + 1;
686  /*find the number of tile for this block */
687  if( blkj == nt-1 )
688  mt = magma_ceildiv( n - firstrow, nb);
689  else
690  mt = magma_ceildiv( n - (firstrow+1), nb);
691  /*loop over the tiles find the size of the Vs and apply it */
692  for (blki=mt; blki>0; blki--) {
693  /*calculate the size of each losange of Vs= (Vm,Vn)*/
694  myrow = firstrow + (mt-blki)*nb;
695  mycol = blkj*Vblksiz;
696  Vm = min( nb+Vblksiz-1, n-myrow);
697  if( ( blkj == nt-1 ) && ( blki == mt ) ){
698  Vn = min (Vblksiz, Vm);
699  } else {
700  Vn = min (Vblksiz, Vm-1);
701  }
702  /*calculate the pointer to the Vs and the Ts.
703  * Note that Vs and Ts have special storage done
704  * by the bulgechasing function*/
705  magma_bulge_findVTAUTpos(n, nb, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &taupos, &tpos, &blkid);
706  myid = blkid/blkpercore;
707  if( my_core_id==(myid%cores_num) ){
708  if( ( Vm > 0 ) && ( Vn > 0 ) ){
709  lapackf77_zlarft( "F", "C", &Vm, &Vn, V(vpos), &ldv, TAU(taupos), T(tpos), &ldt);
710  }
711  }
712  }
713  }
714 }
#define min(a, b)
Definition: common_magma.h:86
magma_int_t ldv
#define lapackf77_zlarft
Definition: magma_zlapack.h:80
#define V(m)
int magma_int_t
Definition: magmablas.h:12
#define T(m)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
magma_int_t magma_bulge_get_blkcnt(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
#define TAU(m)
void magma_bulge_findVTAUTpos(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magma_int_t sweep, magma_int_t st, magma_int_t ldv, magma_int_t ldt, magma_int_t *Vpos, magma_int_t *TAUpos, magma_int_t *Tpos, magma_int_t *blkid)

Here is the call graph for this function:

Here is the caller graph for this function:

static void magma_ztile_bulge_parallel ( magma_int_t  my_core_id,
magma_int_t  cores_num,
magmaDoubleComplex *  A,
magma_int_t  lda,
magmaDoubleComplex *  V,
magma_int_t  ldv,
magmaDoubleComplex *  TAU,
magma_int_t  n,
magma_int_t  nb,
magma_int_t  nbtiles,
magma_int_t  grsiz,
magma_int_t  Vblksiz,
volatile magma_int_t prog 
)
static

Definition at line 489 of file zhetrd_hb2st.cpp.

References magma_ceildiv(), magma_free_cpu(), magma_zmalloc_cpu(), magma_ztrdtype1cbHLsym_withQ_v2(), magma_ztrdtype2cbHLsym_withQ_v2(), magma_ztrdtype3cbHLsym_withQ_v2(), and min.

492 {
493  magma_int_t sweepid, myid, shift, stt, st, ed, stind, edind;
494  magma_int_t blklastind, colpt;
495  magma_int_t stepercol;
496  magma_int_t i,j,m,k;
497  magma_int_t thgrsiz, thgrnb, thgrid, thed;
498  magma_int_t coreid;
499  magma_int_t colblktile,maxrequiredcores,colpercore,mycoresnb;
500  magma_int_t fin;
501  magmaDoubleComplex *work;
502 
503  if(n<=0)
504  return ;
505  if(grsiz<=0)
506  return ;
507 
508  //printf("=================> my core id %d of %d \n",my_core_id, cores_num);
509 
510  /* As I store V in the V vector there are overlap between
511  * tasks so shift is now 4 where group need to be always
512  * multiple of 2, because as example if grs=1 task 2 from
513  * sweep 2 can run with task 6 sweep 1., but task 2 sweep 2
514  * will overwrite the V of tasks 5 sweep 1 which are used by
515  * task 6, so keep in mind that group need to be multiple of 2,
516  * and thus tasks 2 sweep 2 will never run with task 6 sweep 1.
517  * However, when storing V in A, shift could be back to 3.
518  * */
519 
520  magma_zmalloc_cpu(&work, n);
521  mycoresnb = cores_num;
522 
523  shift = 5;
524  if(grsiz==1)
525  colblktile=1;
526  else
527  colblktile=grsiz/2;
528 
529  maxrequiredcores = nbtiles/colblktile;
530  if(maxrequiredcores<1)maxrequiredcores=1;
531  colpercore = colblktile*nb;
532  if(mycoresnb > maxrequiredcores)
533  mycoresnb = maxrequiredcores;
534  thgrsiz = n;
535  stepercol = magma_ceildiv(shift, grsiz);
536  thgrnb = magma_ceildiv(n-1, thgrsiz);
537 
538  #ifdef ENABLE_DEBUG
539  if(my_core_id==0){
540  if(cores_num > maxrequiredcores) {
541  printf("==================================================================================\n");
542  printf(" WARNING only %3d threads are required to run this test optimizing cache reuse\n",maxrequiredcores);
543  printf("==================================================================================\n");
544  }
545  printf(" Static bulgechasing version v9_9col threads %4d N %5d NB %5d grs %4d thgrsiz %4d \n",cores_num, n, nb, grsiz,thgrsiz);
546  }
547  #endif
548 
549  for (thgrid = 1; thgrid<=thgrnb; thgrid++){
550  stt = (thgrid-1)*thgrsiz+1;
551  thed = min( (stt + thgrsiz -1), (n-1));
552  for (i = stt; i <= n-1; i++){
553  ed=min(i,thed);
554  if(stt>ed)break;
555  for (m = 1; m <=stepercol; m++){
556  st=stt;
557  for (sweepid = st; sweepid <=ed; sweepid++){
558 
559  for (k = 1; k <=grsiz; k++){
560  myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k;
561  if(myid%2 ==0){
562  colpt = (myid/2)*nb+1+sweepid-1;
563  stind = colpt-nb+1;
564  edind = min(colpt,n);
565  blklastind = colpt;
566  if(stind>=edind){
567  printf("ERROR---------> st>=ed %d %d \n\n", (int) stind, (int) edind);
568  exit(-10);
569  }
570  }else{
571  colpt = ((myid+1)/2)*nb + 1 +sweepid -1 ;
572  stind = colpt-nb+1;
573  edind = min(colpt,n);
574  if( (stind>=edind-1) && (edind==n) )
575  blklastind=n;
576  else
577  blklastind=0;
578  if(stind>edind){
579  printf("ERROR---------> st>=ed %d %d \n\n", (int) stind, (int) edind);
580  exit(-10);
581  }
582  }
583 
584  coreid = (stind/colpercore)%mycoresnb;
585 
586  if(my_core_id==coreid)
587  {
588  fin=0;
589  while(fin==0)
590  {
591  if(myid==1)
592  {
593  if( (prog[myid+shift-1]== (sweepid-1)) )
594  {
595  magma_ztrdtype1cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);
596 
597  fin=1;
598  prog[myid]= sweepid;
599  if(blklastind >= (n-1))
600  {
601  for (j = 1; j <= shift; j++)
602  prog[myid+j]=sweepid;
603  }
604  } // END progress condition
605 
606  }else{
607  if( (prog[myid-1]==sweepid) && (prog[myid+shift-1]== (sweepid-1)) )
608  {
609  if(myid%2 == 0)
610  magma_ztrdtype2cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);
611  else
612  magma_ztrdtype3cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);
613 
614  fin=1;
615  prog[myid]= sweepid;
616  if(blklastind >= (n-1))
617  {
618  for (j = 1; j <= shift+mycoresnb; j++)
619  prog[myid+j]=sweepid;
620  }
621  } // END progress condition
622  } // END if myid==1
623  } // END while loop
624 
625  } // END if my_core_id==coreid
626 
627  if(blklastind >= (n-1))
628  {
629  stt=stt+1;
630  break;
631  }
632  } // END for k=1:grsiz
633  } // END for sweepid=st:ed
634  } // END for m=1:stepercol
635  } // END for i=1:n-1
636  } // END for thgrid=1:thgrnb
637 
638  magma_free_cpu(work);
639 
640 } // END FUNCTION
#define min(a, b)
Definition: common_magma.h:86
magma_int_t ldv
void magma_ztrdtype3cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz, magmaDoubleComplex *work)
#define V(m)
int magma_int_t
Definition: magmablas.h:12
void magma_ztrdtype1cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz, magmaDoubleComplex *work)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
#define TAU(m)
#define A(i, j)
Definition: cprint.cpp:16
static magma_err_t magma_zmalloc_cpu(magmaDoubleComplex **ptrPtr, size_t n)
Definition: magma.h:86
void magma_ztrdtype2cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz, magmaDoubleComplex *work)
magma_err_t magma_free_cpu(void *ptr)

Here is the call graph for this function:

Here is the caller graph for this function: