MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
zhetrd_bhe2trc_v3.cpp File Reference
#include "common_magma.h"
#include "magma_zbulgeinc.h"
Include dependency graph for zhetrd_bhe2trc_v3.cpp:

Go to the source code of this file.

Macros

#define PRECISION_z
 
#define CHECK   0
 
#define LOGQ   0
 
#define LOG   0
 
#define V(m)   &(V[(m)])
 
#define TAU(m)   &(TAU[(m)])
 
#define T(m)   &(T[(m)])
 
#define E(m, n)   &(E[(m) + LDE*(n)])
 
#define V(m)   &(V[(m)])
 
#define TAU(m)   &(TAU[(m)])
 
#define T(m)   &(T[(m)])
 
#define E(m, n)   &(E[(m) + LDE*(n)])
 
#define V(m)   &(V[(m)])
 
#define TAU(m)   &(TAU[(m)])
 
#define T(m)   &(T[(m)])
 

Functions

void cmp_vals (magma_int_t n, double *wr1, double *wr2, double *nrmI, double *nrm1, double *nrm2)
 
void zcheck_eig_ (char *JOBZ, magma_int_t *MATYPE, magma_int_t *N, magma_int_t *NB, magmaDoubleComplex *A, magma_int_t *LDA, double *AD, double *AE, double *D1, double *EIG, magmaDoubleComplex *Z, magma_int_t *LDZ, magmaDoubleComplex *WORK, double *RWORK, double *RESU)
 
magma_int_t plasma_ceildiv (magma_int_t a, magma_int_t b)
 
void tile_bulge_applyQ_cpu (magma_int_t WANTZ, char SIDE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t LDE, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t *INFO)
 
static void barrier (int my_core_id, int cores_num)
 
static void * parallel_section (void *thread_id)
 
static void * applyQ_parallel_section (void *thread_id)
 
static void tile_bulge_parallel (int my_core_id)
 
static void tile_bulge_computeT_parallel (int my_core_id)
 
static void tile_bulge_applyQ_parallel (int my_core_id)
 
static void tile_bulge_applyQ_parallel2 (int my_core_id)
 
void magma_ztrdtype1cbHLsym_withQ (magma_int_t N, magma_int_t NB, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
 
void magma_ztrdtype2cbHLsym_withQ (magma_int_t N, magma_int_t NB, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
 
void magma_ztrdtype3cbHLsym_withQ (magma_int_t N, magma_int_t NB, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
 
void magma_zbulge_applyQ (magma_int_t WANTZ, char SIDE, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t LDE, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t *INFO, magmaDoubleComplex *dV, magmaDoubleComplex *dT, magmaDoubleComplex *dE, magma_int_t copytype)
 
void magma_zstedc_withZ (char JOBZ, magma_int_t N, double *D, double *E, magmaDoubleComplex *Z, magma_int_t LDZ)
 
void magma_zstedx_withZ (magma_int_t N, magma_int_t NE, double *D, double *E, magmaDoubleComplex *Z, magma_int_t LDZ)
 
magma_int_t magma_zungqr_2stage_gpu (magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magma_int_t nb, magma_int_t *info)
 
int event_numblg [MAX_THREADS_BLG] __attribute__((aligned(128)))
 
magma_int_t magma_zhetrd_bhe2trc (magma_int_t THREADS, magma_int_t WANTZ, char uplo, magma_int_t NE, magma_int_t N, magma_int_t NB, magmaDoubleComplex *A1, magma_int_t LDA1, double *D2, double *E2, magmaDoubleComplex *dT1, magma_int_t LDT1)
 

Variables

volatile int barrier_in [MAX_THREADS_BLG]
 
volatile int barrier_out [MAX_THREADS_BLG]
 
volatile int * ss_prog
 
struct gbstrct_blg core_in_all
 
int log_eventsblg
 

Macro Definition Documentation

#define CHECK   0

Definition at line 19 of file zhetrd_bhe2trc_v3.cpp.

#define E (   m, n )    &(E[(m) + LDE*(n)])

Definition at line 2116 of file zhetrd_bhe2trc_v3.cpp.

#define LOG   0

Definition at line 21 of file zhetrd_bhe2trc_v3.cpp.

#define LOGQ   0

Definition at line 20 of file zhetrd_bhe2trc_v3.cpp.

#define PRECISION_z

Definition at line 17 of file zhetrd_bhe2trc_v3.cpp.

#define T (   m)    &(T[(m)])

Definition at line 2119 of file zhetrd_bhe2trc_v3.cpp.

#define TAU (   m)    &(TAU[(m)])

Definition at line 2118 of file zhetrd_bhe2trc_v3.cpp.

#define V (   m)    &(V[(m)])

Definition at line 2117 of file zhetrd_bhe2trc_v3.cpp.
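
The four macros above are thin indexing helpers: V, TAU and T resolve a 1-D offset into the corresponding workspace array, while E(m, n) resolves a 2-D index into the column-major array E with leading dimension LDE. Because each macro takes the address of the selected element, the result is a pointer that can be handed directly to routines expecting the start of a sub-array. A minimal standalone sketch of the same pattern (std::complex stands in for magmaDoubleComplex, sizes are invented):

#include <complex>
#include <cstdio>
#include <vector>

int main()
{
    const int LDE = 4, ncol = 3;                       // hypothetical sizes
    std::vector<std::complex<double>> E(LDE * ncol);   // column-major storage

#define E(m, n)   &(E[(m) + LDE*(n)])                  /* same pattern as above */

    std::complex<double> *p = E(2, 1);                 // address of row 2, column 1
    *p = std::complex<double>(3.5, -1.0);
    std::printf("E(2,1) = (%g, %g)\n", p->real(), p->imag());

#undef E
    return 0;
}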

Function Documentation

int event_numblg [MAX_THREADS_BLG] __attribute__((aligned(128)))

static void * applyQ_parallel_section ( void *  thread_id)
static

Definition at line 1310 of file zhetrd_bhe2trc_v3.cpp.

References barrier(), core_event_endblg, core_event_startblg, core_in_all, core_log_eventblg, gbstrct_blg::cores_num, gbstrct_blg::dE, gbstrct_blg::dT2, gbstrct_blg::dV2, gbstrct_blg::E, gbstrct_blg::LDE, gbstrct_blg::locores_num, magma_device_sync(), magma_wtime(), magma_zbulge_applyQ(), gbstrct_blg::N, gbstrct_blg::N_CPU, gbstrct_blg::N_GPU, gbstrct_blg::NB, gbstrct_blg::NE, sys_corenbr, gbstrct_blg::T, gbstrct_blg::TAU, tile_bulge_applyQ_parallel(), gbstrct_blg::usemulticpu, gbstrct_blg::V, gbstrct_blg::Vblksiz, gbstrct_blg::WANTZ, and Z.

1311 {
1312  int my_core_id = *((int*)thread_id);
1313  int allcores_num = core_in_all.cores_num;
1314  int locores_num = core_in_all.locores_num;
1315  int usemulticpu = core_in_all.usemulticpu;
1316  magmaDoubleComplex *dZ = core_in_all.dE;
1317  magmaDoubleComplex *dT2 = core_in_all.dT2;
1318  magmaDoubleComplex *dV2 = core_in_all.dV2;
1319  magmaDoubleComplex *Z = core_in_all.E;
1320  magmaDoubleComplex *T2 = core_in_all.T;
1321  magmaDoubleComplex *V2 = core_in_all.V;
1322  magmaDoubleComplex *TAU2= core_in_all.TAU;
1323  int LDZ = core_in_all.LDE;
1324  int NE = core_in_all.NE;
1325  int N_CPU = core_in_all.N_CPU;
1326  int N_GPU = core_in_all.N_GPU;
1327  int N = core_in_all.N;
1328  int NB = core_in_all.NB;
1329  int Vblksiz = core_in_all.Vblksiz;
1330  int WANTZ = core_in_all.WANTZ;
1331  static volatile int sys_corenbr = 1;
1332  magma_int_t INFO, i, my_newcore_id, lastcoreid;
1333 
1334  real_Double_t timeQcpu=0.0, timeQgpu=0.0;
1335  magmaDoubleComplex *NOTUSED;
1336  int INOTUSED=0;
1337 
1338 
1339  if(WANTZ<=0)
1340  return 0;
1341 
1342 
1343 #if defined(MAGMA_SETAFFINITY)
1344  cpu_set_t set;
1345  CPU_ZERO( &set );
1346  CPU_SET( my_core_id, &set );
1347  sched_setaffinity( 0, sizeof(set), &set) ;
1348 #endif
1349 
1350 
1351  log_eventsblg = 1;
1352  core_event_startblg(my_core_id);
1353  barrier(my_core_id, allcores_num);
1354  core_event_endblg(my_core_id);
1355  core_log_eventblg(0x000000, my_core_id);
1356 
1357 
1358 /*################################################
1359  * WANTZ == 2
1360  *################################################*/
1361  if((WANTZ==2))
1362  {
1363  /************************************************
1364  * only one core is running ==> code is sequential
1365  * usually code should not come here it is better
1366  * when we have only 1 cpu to just run the gpu
1367  * from the main function
1368  ************************************************/
1369  if(allcores_num==1)
1370  {
1371  my_newcore_id = my_core_id;
1372  //=========================
1373  // apply Q2
1374  //=========================
1375  if(usemulticpu==1){
1376  timeQgpu = magma_wtime();
1377  // here dZ is da and Z=Q1
1378  magma_zbulge_applyQ(WANTZ, 'R', NE, N, NB, Vblksiz, NOTUSED, N, V2, TAU2, T2, &INFO, dV2, dT2, dZ, 2);
1380  timeQgpu = magma_wtime()-timeQgpu;
1381  printf(" Finish Q2_GPU GGG timing= %lf \n" ,timeQgpu);
1382  }
1383  /************************************************
1384  * more than one core
1385  ************************************************/
1386  }else if(usemulticpu==1){
1387  lastcoreid = allcores_num-1;
1388  // coreid "0" becomes the last one (allcores_num-1)
1389  // and "1" becomes "0" and so on
1390  if(my_core_id==0){
1391  my_newcore_id = lastcoreid; // giving it last core id
1392  }else{
1393  my_newcore_id = my_core_id-1;
1394  }
1395 
1396 
1397  /* I am the last core in the new indexing and the original core=0 */
1398  if(my_newcore_id==lastcoreid)
1399  {
1400  //=============================================
1401  // on GPU on last_newcoreid:
1402  // - apply V2*Z(:,1:N_GPU)
1403  //=============================================
1404  timeQgpu = magma_wtime();
1405  magma_zbulge_applyQ(WANTZ, 'R', N_GPU, N, NB, Vblksiz, NOTUSED, N, V2, TAU2, T2, &INFO, dV2, dT2, dZ, 2);
1407  timeQgpu = magma_wtime()-timeQgpu;
1408  printf(" Finish Q2_GPU GGG timing= %lf \n" ,timeQgpu);
1409  /* I am one of the remaining cores*/
1410  }else if(N_CPU>0){
1411  //=============================================
1412  // on CPU on core 1:last_newcoreid-1
1413  // - apply V2*Z(:,N_GPU+1:N)
1414  //=============================================
1415  if(my_newcore_id == 0)timeQcpu = magma_wtime();
1416  tile_bulge_applyQ_parallel(my_newcore_id);
1417  barrier(my_newcore_id, locores_num);
1418  if(my_newcore_id == 0){
1419  timeQcpu = magma_wtime()-timeQcpu;
1420  printf(" Finish Q2_CPU CCC timing= %lf \n" ,timeQcpu);
1421  }
1422 
1423  } // END if my_newcore_id==allcores_num-1
1424 
1425  } // END of more than one core
1426 
1427  }// END of WANTZ==2
1428 
1429 
1430  /*################################################
1431  * WANTZ == 3
1432  *################################################*/
1433  if((WANTZ==3))
1434  {
1435  /************************************************
1436  * only one core is running ==> code is sequential
1437  * usually code should not come here it is better
1438  * when we have only 1 cpu to just run the gpu
1439  * from the main function
1440  ************************************************/
1441  if(allcores_num==1)
1442  {
1443  my_newcore_id = my_core_id;
1444  //=========================
1445  // apply Q2
1446  //=========================
1447  if(usemulticpu==1){
1448  timeQgpu = magma_wtime();
1449  magma_zbulge_applyQ(WANTZ, 'L', NE, N, NB, Vblksiz, Z, LDZ, V2, TAU2, T2, &INFO, dV2, dT2, dZ, 3);
1451  timeQgpu = magma_wtime()-timeQgpu;
1452  printf(" Finish Q2_GPU GGG timing= %lf \n" ,timeQgpu);
1453  }
1454  /************************************************
1455  * more than one core
1456  ************************************************/
1457  }else if(usemulticpu==1){
1458  lastcoreid = allcores_num-1;
1459  // coreid "0" becomes the last one (allcores_num-1)
1460  // and "1" becomes "0" and so on
1461  if(my_core_id==0){
1462  my_newcore_id = lastcoreid; // giving it last core id
1463  }else{
1464  my_newcore_id = my_core_id-1;
1465  }
1466 
1467 
1468  /* I am the last core in the new indexing and the original core=0 */
1469  if(my_newcore_id==lastcoreid)
1470  {
1471  //=============================================
1472  // on GPU on last_newcoreid:
1473  // - apply V2*Z(:,1:N_GPU)
1474  //=============================================
1475  timeQgpu = magma_wtime();
1476  magma_zbulge_applyQ(WANTZ, 'L', N_GPU, N, NB, Vblksiz, Z, LDZ, V2, TAU2, T2, &INFO, dV2, dT2, dZ, 3);
1478  timeQgpu = magma_wtime()-timeQgpu;
1479  printf(" Finish Q2_GPU GGG timing= %lf \n" ,timeQgpu);
1480  /* I am one of the remaining cores*/
1481  }else if(N_CPU>0){
1482  //=============================================
1483  // on CPU on core 1:last_newcoreid-1
1484  // - apply V2*Z(:,N_GPU+1:N)
1485  //=============================================
1486  if(my_newcore_id == 0)timeQcpu = magma_wtime();
1487  tile_bulge_applyQ_parallel(my_newcore_id);
1488  barrier(my_newcore_id, locores_num);
1489  if(my_newcore_id == 0){
1490  timeQcpu = magma_wtime()-timeQcpu;
1491  printf(" Finish Q2_CPU CCC timing= %lf \n" ,timeQcpu);
1492  }
1493 
1494  } // END if my_newcore_id==allcores_num-1
1495 
1496  } // END of more than one core
1497 
1498  }// END of WANTZ==3
1499 
1500 
1501 
1502  // put a barrier on all the cores to be sure that
1503  // both [1:cores_num-1] working with the CPU code and
1504  // the last core working with the GPU have finished.
1505  barrier(my_core_id, allcores_num);
1506 
1507 
1508 
1509 #if defined(MAGMA_SETAFFINITY)
1510  // unbind threads
1511  sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN);
1512  CPU_ZERO( &set );
1513  for(i=0; i<sys_corenbr; i++)
1514  CPU_SET( i, &set );
1515  sched_setaffinity( 0, sizeof(set), &set) ;
1516 #endif
1517 
1518  return 0;
1519 }

Here is the call graph for this function:

Here is the caller graph for this function:
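
In the WANTZ==2 and WANTZ==3 branches above, the original core 0 is remapped to the last logical id so that it drives the GPU call to magma_zbulge_applyQ, while the remaining cores are shifted down by one and run tile_bulge_applyQ_parallel on the CPU slice Z(:,N_GPU+1:N). A standalone sketch of just that remapping (not MAGMA code; the core count is invented):

#include <cstdio>

static void remap_core(int my_core_id, int allcores_num)
{
    int lastcoreid    = allcores_num - 1;
    int my_newcore_id = (my_core_id == 0) ? lastcoreid : my_core_id - 1;

    if (my_newcore_id == lastcoreid)
        std::printf("core %d -> %d : GPU side, apply V2 to Z(:,1:N_GPU)\n",
                    my_core_id, my_newcore_id);
    else
        std::printf("core %d -> %d : CPU side, apply V2 to Z(:,N_GPU+1:N)\n",
                    my_core_id, my_newcore_id);
}

int main()
{
    for (int id = 0; id < 4; ++id)   // pretend 4 cores were launched
        remap_core(id, 4);
    return 0;
}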

static void barrier ( int  my_core_id,
int  cores_num 
)
static

Definition at line 98 of file zhetrd_bhe2trc_v3.cpp.

References gbstrct_blg::cores_num.

99 {
100  int core;
101 
102  if (my_core_id == 0) {
103  for (core = 1; core < cores_num; core++)
104  while (barrier_in[core] == 0);
105 
106  for (core = 1; core < cores_num; core++)
107  barrier_in[core] = 0;
108 
109  for (core = 1; core < cores_num; core++)
110  barrier_out[core] = 1;
111  }
112  else
113  {
114  barrier_in[my_core_id] = 1;
115  while (barrier_out[my_core_id] == 0);
116  barrier_out[my_core_id] = 0;
117  }
118 }

Here is the caller graph for this function:
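
barrier() is a centralized spin barrier: the master (core 0) waits until every other core has raised its barrier_in flag, clears those flags, and then releases the workers through barrier_out; each worker raises its own barrier_in flag and spins on barrier_out. A self-contained sketch of the same protocol, assuming std::thread and std::atomic in place of the pthreads and volatile arrays used here:

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

constexpr int MAXT = 64;
std::atomic<int> b_in[MAXT];
std::atomic<int> b_out[MAXT];

void my_barrier(int my_core_id, int cores_num)   // same protocol as barrier()
{
    if (my_core_id == 0) {
        for (int c = 1; c < cores_num; ++c) while (b_in[c].load() == 0) { }
        for (int c = 1; c < cores_num; ++c) b_in[c].store(0);
        for (int c = 1; c < cores_num; ++c) b_out[c].store(1);
    } else {
        b_in[my_core_id].store(1);
        while (b_out[my_core_id].load() == 0) { }
        b_out[my_core_id].store(0);
    }
}

int main()
{
    const int n = 4;                             // hypothetical core count
    for (int i = 0; i < n; ++i) { b_in[i].store(0); b_out[i].store(0); }

    std::vector<std::thread> workers;
    for (int id = 0; id < n; ++id)
        workers.emplace_back([id, n] {
            std::printf("core %d reached the barrier\n", id);
            my_barrier(id, n);                   // all cores meet here
            std::printf("core %d passed the barrier\n", id);
        });
    for (auto &w : workers) w.join();
    return 0;
}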

void cmp_vals ( magma_int_t  n,
double *  wr1,
double *  wr2,
double *  nrmI,
double *  nrm1,
double *  nrm2 
)

Definition at line 62 of file bulge_auxiliary.cpp.

63  {
64  int i;
65  double curv, maxv, sumv;
66 
67  maxv = 0.0;
68  sumv = 0.0;
69  for (i = 0; i < n; ++i) {
70 
71  curv = fabs( wr1[i] - wr2[i]);
72  sumv += curv;
73  if (maxv < curv) maxv = curv;
74  }
75 
76  *nrmI = maxv;
77  *nrm1 = sumv;
78  *nrm2 = sqrt( sumv );
79  }
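
cmp_vals() compares two eigenvalue arrays and reports the largest absolute difference (nrmI), the sum of absolute differences (nrm1), and the square root of that sum (nrm2). A small illustrative driver (values invented, plain int/double in place of magma_int_t, and the routine restated locally so the sketch compiles on its own):

#include <cmath>
#include <cstdio>

// Local restatement of cmp_vals() above, for a self-contained sketch.
static void cmp_vals(int n, const double *wr1, const double *wr2,
                     double *nrmI, double *nrm1, double *nrm2)
{
    double maxv = 0.0, sumv = 0.0;
    for (int i = 0; i < n; ++i) {
        double curv = std::fabs(wr1[i] - wr2[i]);
        sumv += curv;
        if (maxv < curv) maxv = curv;
    }
    *nrmI = maxv;
    *nrm1 = sumv;
    *nrm2 = std::sqrt(sumv);
}

int main()
{
    double ref[3]  = {1.00, 2.00, 3.00};   // reference eigenvalues (invented)
    double test[3] = {1.01, 1.99, 3.00};   // eigenvalues under test (invented)
    double nrmI, nrm1, nrm2;

    cmp_vals(3, ref, test, &nrmI, &nrm1, &nrm2);
    std::printf("max|diff| = %g  sum|diff| = %g  sqrt(sum) = %g\n",
                nrmI, nrm1, nrm2);
    return 0;
}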
void magma_zbulge_applyQ ( magma_int_t  WANTZ,
char  SIDE,
magma_int_t  NE,
magma_int_t  N,
magma_int_t  NB,
magma_int_t  Vblksiz,
magmaDoubleComplex *  E,
magma_int_t  LDE,
magmaDoubleComplex *  V,
magmaDoubleComplex *  TAU,
magmaDoubleComplex *  T,
magma_int_t INFO,
magmaDoubleComplex *  dV,
magmaDoubleComplex *  dT,
magmaDoubleComplex *  dE,
magma_int_t  copytype 
)

Definition at line 31 of file zbulge_applyQ.cpp.

References dE, dT, dV, dwork, findVTpos(), findVTsiz(), magma_queue_create, magma_queue_destroy, MAGMA_SUCCESS, magma_zlarfb_gpu(), magma_zmalloc(), magma_zsetmatrix, magmablas_zlaset_identity(), magmablasSetKernelStream(), MagmaColumnwise, MagmaForward, MagmaLeft, MagmaNoTrans, MagmaRight, max, min, and plasma_ceildiv().

32 {
33 
34  //%===========================
35  //% local variables
36  //%===========================
37  magma_int_t LDT,LDV,blklen,firstcolj;
38  magma_int_t bg, nbGblk,rownbm, k, m, n;
39  magma_int_t st,ed,fst,vlen,vnb,colj,len;
40  magma_int_t blkid, vpos,taupos,tpos;
41  //magmaDoubleComplex *WORK;
42  magma_int_t LWORK;
43  magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
44  magma_int_t nbgr, colst, coled, versionL,versionR;
45  magma_int_t blkcnt=-1;
46 
47  *INFO=0;
48  versionL = 113;
49  versionR = 92;
50  LDT = Vblksiz;
51  LDV = NB+Vblksiz-1;
52  blklen = LDV*Vblksiz;
53  nbGblk = plasma_ceildiv((N-1),Vblksiz);
54  //WORK = (magmaDoubleComplex *) malloc (LWORK*sizeof(magmaDoubleComplex));
55 
56  /* find the size of the matrix T V*/
57  findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
58  /* Copy E, V and T to the GPU into dE, dV and dT,
59  * depending on copytype:
60  * 1: copy only V
61  * 2: copy V and T
62  * 3: copy V, T and E
63  * */
64  if(copytype>0) magma_zsetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV );
65  if(copytype>1) magma_zsetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT );
66  if(copytype>2) magma_zsetmatrix( N, NE, E, N, dE, N );
67  magmaDoubleComplex *dwork;
68  magma_int_t ldwork;
69  ldwork = NE;
70  LWORK = 2*N*max(Vblksiz,64);
71  if(MAGMA_SUCCESS != magma_zmalloc( &dwork, LWORK )) {
72  printf ("!!!! magma_zbulge_applyQ magma_alloc failed for: dwork\n" );
73  exit(-1);
74  }
75 
76  /* SIDE LEFT means apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse the Vs in reverse order (forward) from q_n to q_1.
77  * E is also split by row, meaning each apply works on a block of rows (horizontal block) */
78  /* SIDE RIGHT means apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so traverse the Vs in normal order (forward) from q_1 to q_n.
79  * E is also split by column, meaning each apply works on a block of columns (vertical block) */
80 
81  /* WANTZ = 1 means E is the IDENTITY, so form Q using the optimized update.
82  * We therefore use the reverse order from the small q to the large one,
83  * i.e. from q_n to q_1, as a Left update applied to the Identity.
84  * Use versionL 113 because in 114 we would need to update the whole matrix and not in increasing order.
85  * WANTZ = 2 means E is a full matrix that needs to be updated from the Left or the Right, so use the normal update.
86  * */
87  if(WANTZ==1)
88  {
89  versionL=113;
90  SIDE='L';
91  //set the matrix to Identity here to avoid copying it from the CPU
92  magmablas_zlaset_identity(N, N, dE, N);
93  }
94 
95 
96 
97  printf(" APPLY Q_v115 GPU with N %d NB %d Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",
98  (int) N, (int) NB, (int) Vblksiz, (int) SIDE, (int) versionL, (int) versionR, (int) WANTZ);
99 
100 
101 #if defined(USESTREAM)
102  magma_int_t N2=N/2;
103  magma_int_t N1=N-N2;
104  printf("using stream\n");
105  magma_queue_t stream[2];
106  magma_queue_create( &stream[0] );
107  magma_queue_create( &stream[1] );
108 #endif
109 
110 
111  if(SIDE=='L'){
112  if(versionL==113){
113  for (bg = nbGblk; bg>0; bg--)
114  {
115  firstcolj = (bg-1)*Vblksiz + 1;
116  if(bg==nbGblk)
117  rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
118  else
119  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
120 
121  for (m = rownbm; m>0; m--)
122  {
123  vlen = 0;
124  vnb = 0;
125  colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop
126  fst = (rownbm -m)*NB+colj +1;
127  for (k=0; k<Vblksiz; k++)
128  {
129  colj = (bg-1)*Vblksiz + k;
130  st = (rownbm -m)*NB+colj +1;
131  ed = min(st+NB-1,N-1);
132  if(st>ed)break;
133  if((st==ed)&&(colj!=N-2))break;
134  vlen=ed-fst+1;
135  vnb=k+1;
136  }
137  colst = (bg-1)*Vblksiz;
138  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
139  printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", (int) bg, (int) m, (int) vlen, (int) vnb, (int) colst+1, (int) vpos+1, (int) taupos+1);
140  if((vlen>0)&&(vnb>0)){
141  if(WANTZ==1){
142  len = N-colst;
143  magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len);
144  }else{
145  magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
146  }
147  }
148  }
149  }
150  }else if(versionL==114){
151  rownbm = plasma_ceildiv((N-1),NB);
152  for (m = rownbm; m>0; m--)
153  {
154  ncolinvolvd = min(N-1, m*NB);
155  avai_blksiz=min(Vblksiz,ncolinvolvd);
156  nbgr = plasma_ceildiv(ncolinvolvd,avai_blksiz);
157  for (n = nbgr; n>0; n--)
158  {
159  vlen = 0;
160  vnb = 0;
161  cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
162  colst = (n-1)*avai_blksiz;
163  coled = colst + cur_blksiz -1;
164  fst = (rownbm -m)*NB+colst +1;
165  for (colj=colst; colj<=coled; colj++)
166  {
167  st = (rownbm -m)*NB+colj +1;
168  ed = min(st+NB-1,N-1);
169  if(st>ed)break;
170  if((st==ed)&&(colj!=N-2))break;
171  vlen=ed-fst+1;
172  vnb=vnb+1;
173  }
174  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
175  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
176  if((vlen>0)&&(vnb>0)){
177  #if defined(USESTREAM)
178  magmablasSetKernelStream(stream[0]);
179  magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N1, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, N1);
180  magmablasSetKernelStream(stream[1]);
181  magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N2, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,N1), LDE, &dwork[N1*Vblksiz], N2);
182 
183  #else
184  magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
185  #endif
186  }
187  }
188  }
189  }
190  }else if (SIDE=='R'){
191  if(versionR==91){
192  for (bg =1; bg<=nbGblk; bg++)
193  {
194  firstcolj = (bg-1)*Vblksiz + 1;
195  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
196  if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
197  for (m = 1; m<=rownbm; m++)
198  {
199  vlen = 0;
200  vnb = 0;
201  // for k=0;I compute the fst and then can remove it from the loop
202  colj = (bg-1)*Vblksiz;
203  fst = (rownbm -m)*NB+colj +1;
204  for (k=0; k<Vblksiz; k++)
205  {
206  colj = (bg-1)*Vblksiz + k;
207  st = (rownbm -m)*NB+colj +1;
208  ed = min(st+NB-1,N-1);
209  if(st>ed)break;
210  if((st==ed)&&(colj!=N-2))break;
211  vlen=ed-fst+1;
212  vnb=k+1;
213  }
214  colj = (bg-1)*Vblksiz;
215  findVTpos(N,NB,Vblksiz,colj,fst, &vpos, &taupos, &tpos, &blkid);
216  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colj,vpos,taupos);
217  if((vlen>0)&&(vnb>0)){
218  #if defined(USESTREAM)
219  magmablasSetKernelStream(stream[0]);
220  magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
221  magmablasSetKernelStream(stream[1]);
222  magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
223  #else
224  magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
225  #endif
226  }
227  }
228  }
229  }else if(versionR==92){
230  rownbm = plasma_ceildiv((N-1),NB);
231  for (m = 1; m<=rownbm; m++)
232  {
233  ncolinvolvd = min(N-1, m*NB);
234  avai_blksiz=min(Vblksiz,ncolinvolvd);
235  nbgr = plasma_ceildiv(ncolinvolvd,avai_blksiz);
236  for (n = 1; n<=nbgr; n++)
237  {
238  vlen = 0;
239  vnb = 0;
240  cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
241  colst = (n-1)*avai_blksiz;
242  coled = colst + cur_blksiz -1;
243  fst = (rownbm -m)*NB+colst +1;
244  for (colj=colst; colj<=coled; colj++)
245  {
246  st = (rownbm -m)*NB+colj +1;
247  ed = min(st+NB-1,N-1);
248  if(st>ed)break;
249  if((st==ed)&&(colj!=N-2))break;
250  vlen=ed-fst+1;
251  vnb=vnb+1;
252  }
253  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
254  if((vlen>0)&&(vnb>0)){
255  #if defined(USESTREAM)
256  magmablasSetKernelStream(stream[0]);
257  magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
258  magmablasSetKernelStream(stream[1]);
259  magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
260  #else
261  magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
262  #endif
263  }
264  }
265  }
266  }
267  }else{
268  printf("ERROR SIDE %d \n",SIDE);
269  }
270 
271 #if defined(USESTREAM)
273  magma_queue_destroy( stream[0] );
274  magma_queue_destroy( stream[1] );
275 #endif
276 
277 
278 }

Here is the call graph for this function:

Here is the caller graph for this function:
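
The copytype argument controls how much data magma_zbulge_applyQ stages to the GPU before the blocked Householder applies: copytype >= 1 uploads V, copytype >= 2 also uploads T, and copytype >= 3 also uploads E. A minimal sketch of that staging rule (plain printf stand-ins, not actual MAGMA calls):

#include <cstdio>

static void stage_to_gpu(int copytype)
{
    if (copytype > 0) std::printf("  copy V  (host -> dV)\n");
    if (copytype > 1) std::printf("  copy T  (host -> dT)\n");
    if (copytype > 2) std::printf("  copy E  (host -> dE)\n");
}

int main()
{
    for (int copytype = 1; copytype <= 3; ++copytype) {
        std::printf("copytype = %d:\n", copytype);
        stage_to_gpu(copytype);
    }
    return 0;
}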

magma_int_t magma_zhetrd_bhe2trc ( magma_int_t  THREADS,
magma_int_t  WANTZ,
char  uplo,
magma_int_t  NE,
magma_int_t  N,
magma_int_t  NB,
magmaDoubleComplex *  A1,
magma_int_t  LDA1,
double *  D2,
double *  E2,
magmaDoubleComplex *  dT1,
magma_int_t  LDT1 
)

Definition at line 121 of file zhetrd_bhe2trc_v3.cpp.

References gbstrct_blg::A, applyQ_parallel_section(), gbstrct_blg::BAND, CHECK, cmp_vals(), core_in_all, gbstrct_blg::cores_num, gbstrct_blg::dE, gbstrct_blg::dQ1, dQ2, gbstrct_blg::dT1, gbstrct_blg::dT2, gbstrct_blg::dV2, gbstrct_blg::E, gbstrct_blg::E_CPU, findVTsiz(), gbstrct_blg::grsiz, lapackf77_dsterf, lapackf77_zhbtrd(), lapackf77_zlacpy, gbstrct_blg::LDA, gbstrct_blg::LDE, gbstrct_blg::locores_num, magma_device_sync(), MAGMA_ERR_DEVICE_ALLOC, magma_free, magma_free_cpu(), magma_setlapack_numthreads(), MAGMA_SUCCESS, magma_wtime(), MAGMA_Z_ONE, MAGMA_Z_REAL, MAGMA_Z_ZERO, magma_zbulge_applyQ(), magma_zgemm(), magma_zgetmatrix, magma_zmalloc(), magma_zsetmatrix, magma_zstedc_withZ(), magma_zstedx_withZ(), magma_zungqr_2stage_gpu(), MagmaLower, MagmaNoTrans, MagmaNoVec, MagmaVec, max, MAX_THREADS_BLG, min, gbstrct_blg::N, gbstrct_blg::N_CPU, gbstrct_blg::N_GPU, gbstrct_blg::NB, gbstrct_blg::NBTILES, gbstrct_blg::NE, gbstrct_blg::overlapQ1, parallel_section(), pthread_attr_init(), pthread_attr_setscope(), pthread_create(), pthread_join(), PTHREAD_SCOPE_SYSTEM, pthread_setconcurrency(), gbstrct_blg::SIDE, ss_prog, gbstrct_blg::ss_prog, T, gbstrct_blg::T, TAU, gbstrct_blg::TAU, gbstrct_blg::timeaplQ, gbstrct_blg::timeblg, gbstrct_blg::usemulticpu, V, gbstrct_blg::V, gbstrct_blg::Vblksiz, gbstrct_blg::WANTZ, Z, and zcheck_eig_().
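
Before the bulge-chasing threads are launched, the routine packs the band of the input matrix A1 (full storage, leading dimension LDA1) into the compact band matrix A2 with leading dimension LDA2 = 2*NB, as done in the loops near source lines 203-218 below. A standalone sketch of just that indexing, with real double entries standing in for magmaDoubleComplex and invented sizes:

#include <vector>

int main()
{
    const int N = 8, NB = 2;
    const int LDA1 = N, LDA2 = NB + 1 + (NB - 1);   // LDA2 = 2*NB

    std::vector<double> A1(LDA1 * N, 1.0);          // dummy full-storage input
    std::vector<double> A2(LDA2 * N, 0.0);          // band-storage output

    // columns 0 .. N-NB-1 carry the full band of NB+1 entries
    for (int j = 0; j < N - NB; ++j)
        for (int i = 0; i < NB + 1; ++i)
            A2[i + j*LDA2] = A1[i + j*LDA1 + j];

    // the last NB columns carry a shrinking triangle
    for (int j = 0; j < NB; ++j)
        for (int i = 0; i < NB - j; ++i)
            A2[i + (N-NB+j)*LDA2] = A1[i + (N-NB+j)*LDA1 + (N-NB+j)];

    return 0;
}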

122 {
123  char uplo_[2] = {uplo, 0};
124  real_Double_t timelpk=0.0,tconvert=0.0,timeaplQ1=0.0,timeaplQ2=0.0,timeblg=0.0, timeaplQ=0.0, timeeigen=0.0, timegemm=0.0;
125  double nrmI=0.0, nrm1=0.0, nrm2=0.0;
126  FILE *trace_file;
127  magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
128  magmaDoubleComplex c_one = MAGMA_Z_ONE;
129 
130 
131  magma_int_t mklth, thread, INFO;
132  magma_int_t Vblksiz=-1, blkcnt=-1, LDV=-1, LDT =-1, INgrsiz=1, LDE=-1, BAND=6;
133 #if (defined(PRECISION_s) || defined(PRECISION_d))
134  Vblksiz = min(NB,48);
135 #else
136  Vblksiz = min(NB,32);
137 #endif
138  LDT = Vblksiz;
139  findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
140 
141  magma_int_t i,j;
142  magma_int_t NBTILES, LDA2;
143  i = N/NB;
144  NBTILES = i*NB==N ? i:i+1;
145 
146  magma_int_t overlapQ1 = 1;
147  magma_int_t usemulticpu = 1;
148  magma_int_t N_CPU=0,N_GPU=N;
149 
150  //if(N<4000)
151  // usemulticpu =0;
152 
153  if((WANTZ!=3)&(WANTZ!=2))
154  usemulticpu =0;
155 
156  core_in_all.usemulticpu = usemulticpu;
157  core_in_all.overlapQ1 = overlapQ1;
158  /*************************************************
159  * INITIALIZING MATRICES
160  * ***********************************************/
161  /* A1 is equal to A2 */
162  LDA2 = NB+1+(NB-1);
163  if(LDA2>N){
164  printf("error matrix too small N=%d NB=%d\n", (int) N, (int) NB);
165  return -14;
166  }
167  if((N<=0)||(NB<=0)){
168  printf("error N or NB <0 N=%d NB=%d\n", (int) N, (int) NB);
169  return -14;
170  }
171 
172 
173 /*
174 
175  trace_file = fopen("AJETE/Abefore", "w");
176  for (j = 0; j < (N-NB); j++)
177  {
178  for (i = 0; i < NB+1; i++)
179  {
180  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",i+j+1,j+1,MAGMA_Z_REAL(A1[i+j*LDA1+j]) , MAGMA_Z_IMAG(A1[i+j*LDA1+j]) );
181  }
182  }
183  for (j = 0; j < NB; j++)
184  {
185  for (i = 0; i < NB-j; i++)
186  {
187  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",i+(N-NB+j)+1,N-NB+j+1,MAGMA_Z_REAL(A1[i+(N-NB+j)*LDA1+(N-NB+j)]) , MAGMA_Z_IMAG(A1[i+(N-NB+j)*LDA1+(N-NB+j)]) );
188  }
189  }
190 
191 
192  fclose(trace_file);
193 */
194 
195 
196 
197 
198 
199  timelpk = magma_wtime();
200  /* copy the input matrix into a matrix A2 with band storage */
201  magmaDoubleComplex *A2 = (magmaDoubleComplex *) malloc (N*LDA2*sizeof(magmaDoubleComplex));
202  memset(A2 , 0, N*LDA2*sizeof(magmaDoubleComplex));
203  for (j = 0; j < (N-NB); j++)
204  {
205  for (i = 0; i < NB+1; i++)
206  {
207  A2[i+j*LDA2] = A1[i+j*LDA1+j];
208  A1[i+j*LDA1+j] = c_zero;
209  }
210  }
211  for (j = 0; j < NB; j++)
212  {
213  for (i = 0; i < NB-j; i++)
214  {
215  A2[i+(N-NB+j)*LDA2] = A1[i+(N-NB+j)*LDA1+(N-NB+j)];
216  A1[i+(N-NB+j)*LDA1+(N-NB+j)] = c_zero;
217  }
218  }
219 
220  for (j = 0; j < N-NB; j++)
221  A1[NB+j*LDA1+j] = c_one;
222 
223  timelpk = magma_wtime() - timelpk;
224  printf(" Finish CONVERT timing= %lf \n" ,timelpk);
225 
226 
227 
228  /*************************************************
229  * compute Q1
230  * ***********************************************/
231  magmaDoubleComplex *da, *NOTUSED;
232  if((WANTZ>0)&(WANTZ!=4)){
233  if (MAGMA_SUCCESS != magma_zmalloc( &da, N*LDA1 )) {
234  //*info = MAGMA_ERR_DEVICE_ALLOC;
235  return MAGMA_ERR_DEVICE_ALLOC;
236  }
237 
239  magma_zsetmatrix( N, LDA1, A1, LDA1, da, LDA1 );
240  if(overlapQ1==0){
241  timeaplQ1 = magma_wtime();
242  magma_zungqr_2stage_gpu(N, N, N, da, LDA1, NOTUSED, dT1, NB, &INFO);
244  //cublasGetMatrix( N, LDA1, sizeof(magmaDoubleComplex), da, LDA1, A1, LDA1);
245  timeaplQ1 = magma_wtime()-timeaplQ1;
246  printf(" Finish applyQ1 timing= %lf \n" ,timeaplQ1);
247  }
248  /*
249  trace_file = fopen("AJETE/Q1", "w");
250  for (j = 0; j < N ; j++)
251  for (i = 0; i < N ; i++)
252  //fprintf(trace_file,"%10d%10d%40.30e\n",i+1,j+1,A1[j*LDA1+i]);
253  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",i+1,j+1,MAGMA_Z_REAL(A1[j*LDA1+i]) , MAGMA_Z_IMAG(A1[j*LDA1+i]) );
254  fclose(trace_file);
255  */
256  }
257  /***********************************************/
258 
259 
260  /*************************************************
261  * For local check use
262  * ***********************************************/
263  char JOBZ;
264  double *D1;
265  magmaDoubleComplex *AINIT;
266  magma_int_t MM,NN,LDAINIT;
267  if(CHECK)
268  {
269  MM=NB+1;
270  NN=N;
271  LDAINIT=NB+1;
272  AINIT = (magmaDoubleComplex *) malloc(MM*NN*sizeof(magmaDoubleComplex) );
273  memset(AINIT , 0, MM*NN*sizeof(magmaDoubleComplex));
274  lapackf77_zlacpy("A", &MM, &NN, A2, &LDA2, AINIT, &LDAINIT );
275  }
276  /***********************************************/
277 
278 
279  magmaDoubleComplex *T = (magmaDoubleComplex *) malloc (blkcnt*LDT*Vblksiz*sizeof(magmaDoubleComplex));
280  magmaDoubleComplex *TAU = (magmaDoubleComplex *) malloc (blkcnt*Vblksiz*sizeof(magmaDoubleComplex));
281  magmaDoubleComplex *V = (magmaDoubleComplex *) malloc (blkcnt*LDV*Vblksiz*sizeof(magmaDoubleComplex));
282  memset(T, 0, blkcnt*LDT*Vblksiz*sizeof(magmaDoubleComplex));
283  memset(TAU, 0, blkcnt*Vblksiz*sizeof(magmaDoubleComplex));
284  memset(V, 0, blkcnt*LDV*Vblksiz*sizeof(magmaDoubleComplex));
285 
286 
287 
288 
289 
290  // Set input parameters
291  // ssched_init(THREADS);
292  ss_prog = (int*) malloc((2*NBTILES+THREADS+10)*sizeof(int));
293  int iamdone[MAX_THREADS_BLG];
294  int thread_num[MAX_THREADS_BLG];
295  pthread_t thread_id[MAX_THREADS_BLG];
296  pthread_attr_t thread_attr;
297 
298  //goto ed;
299 
301  core_in_all.cores_num = THREADS;
302  core_in_all.dQ1 = da;
303  core_in_all.dT1 = dT1;
304  core_in_all.T = T;
305  core_in_all.A = A2;
306  core_in_all.TAU = TAU;
307  core_in_all.V = V;
308  core_in_all.NB = NB;
309  core_in_all.NBTILES = NBTILES;
310  core_in_all.N = N;
311  core_in_all.LDA = LDA2;
312  core_in_all.BAND = BAND;
313  core_in_all.grsiz = INgrsiz;
314  core_in_all.timeblg = &timeblg;//0.0;
315  core_in_all.timeaplQ = &timeaplQ;//0.0;
317  core_in_all.WANTZ = WANTZ;
318  core_in_all.SIDE = 'R';
319  core_in_all.E = A1;
320  core_in_all.E_CPU = A1;
321  core_in_all.LDE = LDA1;
322  core_in_all.Vblksiz = Vblksiz;
323 
324  if((overlapQ1==1)&&(THREADS>1)&&(WANTZ>0)){
325  core_in_all.locores_num = THREADS-1;
326  }else{
327  core_in_all.locores_num = THREADS;
328  }
329 
330 
331  // Set one thread per core
332  pthread_attr_init(&thread_attr);
334  pthread_setconcurrency(THREADS);
335 
336  // Initializations
337  for (thread = 0; thread < THREADS; thread++)
338  {
339  barrier_in[thread] = 0;
340  barrier_out[thread] = 0;
341  event_numblg[thread] = 0;
342  }
343 
344  for (i = 0; i < 2*NBTILES+THREADS+10; i++)
345  ss_prog[i] = 0;
346 
347 
348  for (i = 0; i < THREADS; i++)
349  iamdone[i] = 0;
350 
351 
352 
353  // Launch threads
354  for (thread = 1; thread < THREADS; thread++)
355  {
356  thread_num[thread] = thread;
357  pthread_create(&thread_id[thread], &thread_attr, parallel_section, &thread_num[thread]);
358  }
359  thread_num[0] = 0;
360  parallel_section(&thread_num[0]);
361 
362  // Wait for completion
363  for (thread = 1; thread < THREADS; thread++)
364  {
365  void *exitcodep;
366  pthread_join(thread_id[thread], &exitcodep);
367  }
368  printf(" Finish BULGE+T timing= %lf \n" ,*(core_in_all.timeblg));
369 
370  /*================================================
371  * store resulting diag and lower diag D2 and E2
372  * note that D and E are always real
373  *================================================*/
374  /*
375  * STORE THE RESULTING diagonal/off-diagonal in D AND E
376  */
377  memset(D2, 0, N *sizeof(double));
378  memset(E2, 0, (N-1)*sizeof(double));
379  /* Make diagonal and superdiagonal elements real,
380  * storing them in D and E
381  */
382  /* In the complex case, the off-diagonal elements are
383  * not necessarily real, so we have to make them real
384  * and copy them to E.
385  * When using Householder elimination,
386  * ZLARFG gives us a real output, so all the
387  * diagonal/off-diagonal elements except the last one
388  * are already real; we only need to take the absolute
389  * value of the last one.
390  * */
391 
392 
393 /*
394  trace_file = fopen("AJETE/T", "w");
395  for (j = 0; j < N-1 ; j++) {
396  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",j+1,j+1,MAGMA_Z_REAL(A2[j*LDA2]) , MAGMA_Z_IMAG(A2[j*LDA2]) );
397  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",j+2,j+1,MAGMA_Z_REAL(A2[j*LDA2+1]) , MAGMA_Z_IMAG(A2[j*LDA2+1]) );
398  }
399  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",N,N,MAGMA_Z_REAL(A2[(N-1)*LDA2]) , MAGMA_Z_IMAG(A2[(N-1)*LDA2]) );
400 
401  fclose(trace_file);
402 */
403 
404 
405 
406 
407 
408 #if defined(PRECISION_z) || defined(PRECISION_c)
409  if(uplo==MagmaLower){
410  for (i=0; i < N-1 ; i++)
411  {
412  D2[i] = MAGMA_Z_REAL( A2[i*LDA2]);
413  E2[i] = MAGMA_Z_REAL(A2[i*LDA2+1]);
414  }
415  D2[N-1] = MAGMA_Z_REAL(A2[(N-1)*LDA2]);
416  } else { /* MagmaUpper not tested yet */
417  for (i=0; i<N-1; i++)
418  {
419  D2[i] = MAGMA_Z_REAL(A2[i*LDA2+NB]);
420  E2[i] = MAGMA_Z_REAL(A2[i*LDA2+NB-1]);
421  }
422  D2[N-1] = MAGMA_Z_REAL(A2[(N-1)*LDA2+NB]);
423  } /* end MagmaUpper */
424 #else
425  if( uplo == MagmaLower ){
426  for (i=0; i < N-1; i++) {
427  D2[i] = A2[i*LDA2]; // diag
428  E2[i] = A2[i*LDA2+1]; //lower diag
429  }
430  D2[N-1] = A2[(N-1)*LDA2];
431  } else {
432  for (i=0; i < N-1; i++) {
433  D2[i] = A2[i*LDA2+NB]; // diag
434  E2[i] = A2[i*LDA2+NB-1]; //lower diag
435  }
436  D2[N-1] = A2[(N-1)*LDA2+NB];
437  }
438 #endif
439 
440 
441 
442 
443 
444 
445  /* APPLY V2 generated by the bulge to the matrix Q1 of the first stage Q2=Q2*(I-V_2*T_2*V_2') */
446  magmaDoubleComplex *dV2, *dQ2, *dT2, *dZ, *Q2, *Z;
447  int dVsize, LDQ2=N,LDZ=N;
448  int parallel=0;
449 
450 
451  if((usemulticpu==1)&&(THREADS>1)){
452  core_in_all.locores_num = THREADS-1;
453  }else{
454  core_in_all.locores_num = THREADS;
455  }
456 
457  //========================
458  // WANTZ = 1 : compute Q1 and Q2 and the eigenvectors Z,
459  //             then perform 2 GEMMs: Q1*Q2*Z.
460  // WANTZ = 2 : compute Q1, then apply V2 to Q1 from the right,
461  //             generating the global Q; then compute the
462  //             eigenvectors Z and GEMM with Z ==> Q*Z on the GPU.
463  //             WANTZ = 5 is similar, but the GEMM is done
464  //             implicitly during the eigenvector computation,
465  //             which is assumed to be bad for performance.
466  // WANTZ = 3 : compute Q1, then compute the eigenvectors Z,
467  //             then apply V2 to Z from the left, then GEMM
468  //             the result with Q1 ==> Q1 * (Q2*Z).
469  // WANTZ = 4 : compute the eigenvectors Z, then apply V2,
470  //             then apply V1 to the result.
471  //             Similar to WANTZ = 3 but does not compute Q1;
472  //             instead it applies V1 (presumably good for
473  //             multicore because it avoids computing Q1, while
474  //             on GPUs Q1 is computed overlapped with the
475  //             bulge chasing, essentially for free, so it is
476  //             better to GEMM with Q1 (WANTZ=3) than to apply V1).
477  // WANTZ = 5 : compute Q1, then apply V2 to Q1 from the right,
478  //             generating the global Q; then compute the
479  //             eigenvectors Z with option "V", which performs the
480  //             GEMM with Z ==> Q*Z implicitly during the
481  //             eigenvector computation; assumed to be bad for
482  //             performance because a GEMM on the GPU or an
483  //             optimized multithreaded GEMM will be faster.
484  //             Similar to WANTZ = 2, where the GEMM is done outside the eigensolver.
485  //
486  //========================
487 
488  timeeigen = magma_wtime();
489  if(WANTZ==0){
490  timelpk = magma_wtime();
491  // compute the eigenvalues using lapack routine to be able to compare to it and used as ref
493 
494  // call eigensolver for our resulting tridiag [D E] and form E=Q*Z
495  magma_zstedc_withZ(MagmaNoVec, N, D2, E2, Z, LDZ);
497  timelpk = magma_wtime()-timelpk;
498  printf(" Finish WANTZ %d eigensolver 'N' timing= %lf threads %d \n", (int) WANTZ, timelpk, (int) i);
499  /*
500  for(i=0;i<10;i++)
501  printf(" voici D[%d] %e\n",i,D2[i]);*/
502  }
503  if(WANTZ>0){
504  if(WANTZ==1){
505  // compute Q2 by applying V2 on the Identity
506  LDZ=LDA1;
507  Z = (magmaDoubleComplex *) malloc (N*N*sizeof(magmaDoubleComplex));
508  memset(Z , 0, N*N*sizeof(magmaDoubleComplex));
509  if(parallel==1){
510  Q2 = (magmaDoubleComplex *) malloc (N*N*sizeof(magmaDoubleComplex));
511  // no need to set the matrix to identity if we are using the GPU because it is done inside bulge_applyQ
512  memset(Q2 , 0, N*N*sizeof(magmaDoubleComplex));
513  for (j = 0; j < N; j++)
514  Q2[j+j*LDQ2] = c_one;
515  }
516  core_in_all.SIDE = 'L';
517  core_in_all.E = Q2;
518  core_in_all.LDE = LDQ2;
519  }else if(WANTZ==2){
520  // compute the Q by applying V2 to Q1 that has been computed
521  // from the Right. Q= Q1 * ( I-V2*T2*V2') = Q1*Q2
522  LDZ=LDA1;
523  Z = (magmaDoubleComplex *) malloc (N*N*sizeof(magmaDoubleComplex));
524  memset(Z , 0, N*N*sizeof(magmaDoubleComplex));
525  core_in_all.SIDE = 'R';
526  core_in_all.E = A1;
527  core_in_all.LDE = LDA1;
528  }else if((WANTZ==3)||(WANTZ==4)){
529  // wait till the Eigenvector are computed and the apply V2 from the Left,
530  // and then make gemm with Q1 (WANTZ=3) or apply V1 to the result (WANTZ=4).
531  LDZ=LDA1;
532  Z = (magmaDoubleComplex *) malloc (N*N*sizeof(magmaDoubleComplex));
533  memset(Z , 0, N*N*sizeof(magmaDoubleComplex));
534  // variable for Q2
535  core_in_all.SIDE = 'L';
536  core_in_all.E = Z;
537  core_in_all.LDE = LDZ;
538  if(WANTZ==4) {
539  printf("the implementation is not finished yet. code will exit\n");
540  exit(-1);
541  }
542  }
543 
544 
545  if((WANTZ==1)||(WANTZ==2)||(WANTZ==3)||(WANTZ==4)){
546  timelpk = magma_wtime();
547  // compute the eigenvalues using lapack routine to be able to compare to it and used as ref
549  // call eigensolver for our resulting tridiag [D E] and form E=Q*Z
550  //magma_zstedc_withZ('I', N, D2, E2, Z, LDZ);
551  magma_zstedx_withZ(N, NE, D2, E2, Z, LDZ);
553  timelpk = magma_wtime()-timelpk;
554  }
555 
556  /*
557  trace_file = fopen("AJETE/Z", "w");
558  for (j = 0; j < N ; j++)
559  for (i = 0; i < N ; i++)
560  //fprintf(trace_file,"%10d%10d%40.30e\n",i+1,j+1,A1[j*LDA1+i]);
561  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",i+1,j+1,MAGMA_Z_REAL(Z[j*LDA1+i]) , MAGMA_Z_IMAG(Z[j*LDA1+i]) );
562  fclose(trace_file);
563  */
564 
565 
566 
567  // ************************************************
568  // use GPU code to apply V2 or compute Q2
569  // ************************************************
570  if(parallel==0){
571  // allocate space on GPU for dV2 and dT2
572  dVsize = max(N*N,blkcnt*LDV*Vblksiz);
573  //printf("dvsize %lf \n",(16.0*(real_Double_t)dVsize)*1e-9);
574  if(MAGMA_SUCCESS != magma_zmalloc( &dV2, dVsize )) {
575  printf ("!!!! magma_alloc failed for: dV2\n" );
576  exit(-1);
577  }
578 
579  if(MAGMA_SUCCESS != magma_zmalloc( &dT2, blkcnt*LDT*Vblksiz )) {
580  printf ("!!!! magma_alloc failed for: dT2\n" );
581  exit(-1);
582  }
583 
584  //========================
585  // WANTZ =1
586  // compute Q1 done
587  // compute Q2 here
588  // compute Z done
589  // make 2 GEMM here
590  //========================
591  if(WANTZ==1){
592  // to avoid allocating a 4 matrix on GPU there are 2 way:
593  // way 1: make Q = Q1*Q2 ==> dV2 = da * dZ
594  // then copy Z to GPU put it on dZ
595  // make Q*Z ==> da = dV2 * dZ
596  // copy the result to CPU, da-->A1
597  //
598  // way 2: in case we need only 20% of Z so it is better
599  // to start multiplying by Q2*Z, thus to avoid a matrix on GPU
600  // I could copy Q1 (from da) back to CPU and later re-copy to GPU
601  // copy Q1 to CPU, so copy da --> A1
602  // copy Z to GPU, Z --> dZ
603  // make Q = Q2*Z ==> dV2 = da * dZ
604  // copy Q1 to GPU, A1 --> dZ
605  // make Q1*Q ==> da = dZ *dV2
606  // copy the result to CPU, da-->A1
607  //
608  // way 2 is implemented because of Raffaele
609  // NE is the number of eigenvectors we want.
610  timeaplQ2 = magma_wtime();
611  magma_free( dT1 );
612  // copy Q1 to CPU
613  magma_zgetmatrix( N, LDA1, da, N, A1, LDA1 );
614  // compute Q2 by applying V2 to Identity and put it into da
615  magma_zbulge_applyQ(WANTZ, 'L', NE, N, NB, Vblksiz, Q2, N, V, TAU, T, &INFO, dV2, dT2, da, 2);
616  // free dT2 and allocate dZ and copy Z to dZ
618  timeaplQ2 = magma_wtime()-timeaplQ2;
619  magma_free( dT2 );
620  if(MAGMA_SUCCESS != magma_zmalloc( &dZ, N*N )) {
621  printf ("!!!! magma_alloc failed for: dZ\n" );
622  exit(-1);
623  }
624 
625  /*
626  Q2 = (magmaDoubleComplex *) malloc (N*N*sizeof(magmaDoubleComplex));
627  memset(Q2 , 0, N*N*sizeof(magmaDoubleComplex));
628  magma_zgetmatrix( N, LDA1, da, N, Q2, N );
629  trace_file = fopen("AJETE/Q2", "w");
630  for (j = 0; j < N ; j++)
631  for (i = 0; i < N ; i++)
632  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",i+1,j+1,MAGMA_Z_REAL(Q2[j*N+i]) , MAGMA_Z_IMAG(Q2[j*N+i]) );
633  fclose(trace_file);
634  */
635 
636  timegemm = magma_wtime();
637  // copy the eigenvectors to GPU
638  magma_zsetmatrix( N, LDZ, Z, LDZ, dZ, N );
639  // make GEMM Q2 * Z --> dV2 = da * dZ
640  magma_zgemm( MagmaNoTrans, MagmaNoTrans, N, NE, N, c_one, da, N, dZ, N, c_zero, dV2, N);
641  // copy Q1 to GPU --> dZ
642  magma_zsetmatrix( N, LDA1, A1, LDA1, dZ, N );
643  // make GEMM Q1 * (Q2 * Z) --> da = dZ * dV2
644  magma_zgemm( MagmaNoTrans, MagmaNoTrans, N, NE, N, c_one, dZ, N, dV2, N, c_zero, da, N);
645  magma_zgetmatrix( N, NE, da, N, A1, LDA1 );
646  timegemm = magma_wtime()-timegemm;
647  }
648  if(WANTZ==2){
649  /****************************************************
650  * apply V2 from Right to Q1. da = da*(I-V2*T2*V2')
651  * **************************************************/
652  timeaplQ2 = magma_wtime();
653  /*============================
654  * use GPU+CPU's
655  *==========================*/
656  if((usemulticpu==1)&&(THREADS>1))
657  {
658 
659  // define the size of Q to be done on CPU's and the size on GPU's
660  // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)
661  if(THREADS>40){
662  N_GPU = (int) (0.5*(double)NE);
663  N_GPU = (N_GPU/64)*64;
664  N_CPU = NE-N_GPU;
665  }else if(THREADS>10){
666 #if (defined(PRECISION_s) || defined(PRECISION_d))
667  N_GPU = (int) (0.68*(double)NE);
668 #else
669  N_GPU = (int) (0.72*(double)NE);
670 #endif
671  N_GPU = (N_GPU/64)*64;
672  N_CPU = NE-N_GPU;
673  }else if(THREADS>5){
674 #if (defined(PRECISION_s) || defined(PRECISION_d))
675  N_GPU = (int) (0.84*(double)NE);
676 #else
677  N_GPU = (int) (0.86*(double)NE);
678 #endif
679  N_GPU = (N_GPU/64)*64;
680  N_CPU = NE-N_GPU;
681  }else{
682  N_GPU = NE;
683  N_CPU = 0;
684  }
685 
686  magma_zgetmatrix( N_CPU, LDA1,
687  da+(N_GPU), LDA1,
688  A1+(N_GPU), LDA1 );
689  printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n", (int) NE, (int) N_GPU, (int) N_CPU);
690  core_in_all.SIDE = 'R';
691  core_in_all.E = A1;
692  core_in_all.E_CPU = A1+(N_GPU);
693  core_in_all.LDE = LDA1;
694  core_in_all.dE = da;
695  core_in_all.dT2 = dT2;
696  core_in_all.dV2 = dV2;
697  core_in_all.N_CPU = N_CPU;
698  core_in_all.N_GPU = N_GPU;
699  core_in_all.NE = NE;
700  core_in_all.T = T;
701  core_in_all.TAU = TAU;
702  core_in_all.V = V;
703 
704 
705  // ===============================
706  // relaunch thread to apply Q
707  // ===============================
708  // Set one thread per core
709  pthread_attr_init(&thread_attr);
711  pthread_setconcurrency(THREADS);
712 
713  // Initializations
714  for (thread = 0; thread < THREADS; thread++)
715  {
716  barrier_in[thread] = 0;
717  barrier_out[thread] = 0;
718  event_numblg[thread] = 0;
719  }
720  // Launch threads
721  for (thread = 1; thread < THREADS; thread++)
722  {
723  thread_num[thread] = thread;
724  pthread_create(&thread_id[thread], &thread_attr, applyQ_parallel_section, &thread_num[thread]);
725  }
726  thread_num[0] = 0;
727  applyQ_parallel_section(&thread_num[0]);
728 
729  // Wait for completion
730  for (thread = 1; thread < THREADS; thread++)
731  {
732  void *exitcodep;
733  pthread_join(thread_id[thread], &exitcodep);
734  }
735  magma_zsetmatrix( N_CPU, LDA1,
736  A1+(N_GPU), LDA1,
737  da+(N_GPU), LDA1 );
738 
739 
740 
741 
742 
743 
744  /*============================
745  * use only GPU
746  *==========================*/
747  }else{
748  magma_zbulge_applyQ(WANTZ, 'R', N, N, NB, Vblksiz, A1, LDA1, V, TAU, T, &INFO, dV2, dT2, da, 2);
750  magma_free( dT2 );
751  }
752  timeaplQ2 = magma_wtime()-timeaplQ2;
753 
754  /****************************************************
755  * compute the GEMM of Q*Z
756  * **************************************************/
757  magma_free( dT1 );
758  if(MAGMA_SUCCESS != magma_zmalloc( &dZ, N*NE )) {
759  printf ("!!!! magma_alloc failed for: dZ\n" );
760  exit(-1);
761  }
762  timegemm = magma_wtime();
763  // copy the eigenvectors to GPU
764  magma_zsetmatrix( N, NE, Z, LDZ, dZ, N );
765  //make a gemm of (Q1 * Q2) * Z = da * dZ --> dV2
766  magma_zgemm( MagmaNoTrans, MagmaNoTrans, N, NE, N, c_one, da, N, dZ, N, c_zero, dV2, N);
767  magma_zgetmatrix( N, NE, dV2, N, A1, LDA1 );
768  timegemm = magma_wtime()-timegemm;
769  }
770 
771  if(WANTZ==3){
772  /****************************************************
773  * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
774  * **************************************************/
775  magma_free( dT1 );
776  if(MAGMA_SUCCESS != magma_zmalloc( &dZ, N*NE )) {
777  printf ("!!!! magma_alloc failed for: dZ\n" );
778  exit(-1);
779  }
780  timeaplQ2 = magma_wtime();
781  /*============================
782  * use GPU+CPU's
783  *==========================*/
784  if((usemulticpu==1)&&(THREADS>1))
785  {
786 
787  // define the size of Q to be done on CPU's and the size on GPU's
788  // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)
789  if(THREADS>40){
790  N_GPU = (int) (0.5*(double)NE);
791  N_GPU = (N_GPU/64)*64;
792  N_CPU = NE-N_GPU;
793  }else if(THREADS>10){
794 #if (defined(PRECISION_s) || defined(PRECISION_d))
795  N_GPU = (int) (0.68*(double)NE);
796 #else
797  N_GPU = (int) (0.72*(double)NE);
798 #endif
799  N_GPU = (N_GPU/64)*64;
800  N_CPU = NE-N_GPU;
801  }else if(THREADS>5){
802 #if (defined(PRECISION_s) || defined(PRECISION_d))
803  N_GPU = (int) (0.82*(double)NE);
804 #else
805  N_GPU = (int) (0.84*(double)NE);
806 #endif
807  N_GPU = (N_GPU/64)*64;
808  N_CPU = NE-N_GPU;
809  }else{
810  N_GPU = NE;
811  N_CPU = 0;
812  }
813  printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n", (int) NE, (int) N_GPU, (int) N_CPU);
814  core_in_all.SIDE = 'L';
815  core_in_all.E = Z;
816  core_in_all.E_CPU = Z+(N_GPU*LDZ);
817  core_in_all.LDE = LDZ;
818  core_in_all.dE = dZ;
819  core_in_all.dT2 = dT2;
820  core_in_all.dV2 = dV2;
821  core_in_all.N_CPU = N_CPU;
822  core_in_all.N_GPU = N_GPU;
823  core_in_all.NE = NE;
824  core_in_all.T = T;
825  core_in_all.TAU = TAU;
826  core_in_all.V = V;
827 
828 
829  // ===============================
830  // relaunch thread to apply Q
831  // ===============================
832  // Set one thread per core
833  pthread_attr_init(&thread_attr);
835  pthread_setconcurrency(THREADS);
836 
837  // Initializations
838  for (thread = 0; thread < THREADS; thread++)
839  {
840  barrier_in[thread] = 0;
841  barrier_out[thread] = 0;
842  event_numblg[thread] = 0;
843  }
844  // Launch threads
845  for (thread = 1; thread < THREADS; thread++)
846  {
847  thread_num[thread] = thread;
848  pthread_create(&thread_id[thread], &thread_attr, applyQ_parallel_section, &thread_num[thread]);
849  }
850  thread_num[0] = 0;
851  applyQ_parallel_section(&thread_num[0]);
852 
853  // Wait for completion
854  for (thread = 1; thread < THREADS; thread++)
855  {
856  void *exitcodep;
857  pthread_join(thread_id[thread], &exitcodep);
858  }
859  magma_zsetmatrix( LDZ, N_CPU,
860  Z+(N_GPU*LDZ), LDZ,
861  dZ+(N_GPU*LDZ), N );
862 
863  /*============================
864  * use only GPU
865  *==========================*/
866  }else{
867  magma_zbulge_applyQ(WANTZ, 'L', NE, N, NB, Vblksiz, Z, LDZ, V, TAU, T, &INFO, dV2, dT2, dZ, 3);
869  }
870 
871  timeaplQ2 = magma_wtime()-timeaplQ2;
872  /****************************************************
873  * compute the GEMM of Q1 * (Q2*Z)
874  * **************************************************/
875  magma_free( dT2 );
876  printf("calling dgemm\n");
877  timegemm = magma_wtime();
878  //make a gemm of Q1 * (Q2 * Z) = Q1 * ((I-V2T2V2')*Z) = da * dZ --> dV2
879  magma_zgemm( MagmaNoTrans, MagmaNoTrans, N, NE, N, c_one, da, N, dZ, N, c_zero, dV2, N);
880  magma_zgetmatrix( N, NE, dV2, N, A1, LDA1 );
881  timegemm = magma_wtime()-timegemm;
882  }
883 
884  if(WANTZ==5){
885  if(NE!=N){
886  printf("WANTZ=5 is not supported with NE=%d it compute all the eigenvectors meaning that NE=N\n", (int) NE);
887  exit(-2);
888  }
889  timeaplQ2 = magma_wtime();
890  magma_zbulge_applyQ(WANTZ, 'R', NE, N, NB, Vblksiz, A1, LDA1, V, TAU, T, &INFO, dV2, dT2, da, 2);
891  magma_zgetmatrix( N, LDA1, da, N, A1, LDA1 );
892  timeaplQ2 = magma_wtime()-timeaplQ2;
893  timelpk = magma_wtime();
894  // compute the eigenvalues using lapack routine to be able to compare to it and used as ref
896  // call eigensolver for our resulting tridiag [D E] and form E=Q*Z
897  magma_zstedc_withZ(MagmaVec, N, D2, E2, A1, LDA1);
899  timelpk = magma_wtime()-timelpk;
900  }
901 
902  }
903  // ************************************************
904 
905 
906  // ************************************************
907  // use parallel cpu code to apply V2 or compute Q2
908  // need to be continued for Z
909  // ************************************************
910  if(parallel==1){
911  // copy the matrix Q1 generated up from GPU to CPU because we are using the CPU's.
912  if(WANTZ==2) {
913  magma_zgetmatrix( N, LDA1, da, LDA1, A1, LDA1);
914  }
915  // ===============================
916  // relaunch thread to apply Q
917  // ===============================
918  // Set one thread per core
919  pthread_attr_init(&thread_attr);
921  pthread_setconcurrency(THREADS);
922 
923  // Initializations
924  for (thread = 0; thread < THREADS; thread++)
925  {
926  barrier_in[thread] = 0;
927  barrier_out[thread] = 0;
928  event_numblg[thread] = 0;
929  }
930  // Launch threads
931  for (thread = 1; thread < THREADS; thread++)
932  {
933  thread_num[thread] = thread;
934  pthread_create(&thread_id[thread], &thread_attr, applyQ_parallel_section, &thread_num[thread]);
935  }
936  thread_num[0] = 0;
937  applyQ_parallel_section(&thread_num[0]);
938 
939  // Wait for completion
940  for (thread = 1; thread < THREADS; thread++)
941  {
942  void *exitcodep;
943  pthread_join(thread_id[thread], &exitcodep);
944  }
945  }
946  // ************************************************
947 
948  timeeigen = magma_wtime()-timeeigen;
949  printf("============================================================================\n");
950  printf(" Finish WANTZ %d computing Q2 timing= %lf \n", (int) WANTZ, timeaplQ2);
951  if(WANTZ!=5){
952  printf(" Finish WANTZ %d making gemm timing= %lf \n", (int) WANTZ, timegemm);
953  printf(" Finish WANTZ %d eigensolver 'I' timing= %lf threads %d N %d NE %d\n", (int) WANTZ, timelpk, (int) mklth, (int) N, (int) NE);
954  }else{
955  printf(" Finish WANTZ %d eigensolver 'V' timing= %lf threads %d N %d NE %d \n", (int) WANTZ, timelpk, (int) mklth, (int) N, (int) NE);
956  }
957  printf(" Finish WANTZ %d full Eigenvectros timing= %lf \n",(int) WANTZ, timeeigen);
958  printf("============================================================================\n");
959  }
960 
961 
962 
963 
964 
965  /*
966  trace_file = fopen("AJETE/U", "w");
967  for (j = 0; j < N ; j++)
968  for (i = 0; i < N ; i++)
969  //fprintf(trace_file,"%10d%10d%40.30e\n",i+1,j+1,A1[j*LDA1+i]);
970  fprintf(trace_file,"%10d %10d %25.15e %25.15e\n",i+1,j+1,MAGMA_Z_REAL(A1[j*LDA1+i]) , MAGMA_Z_IMAG(A1[j*LDA1+i]) );
971  fclose(trace_file);
972 
973  trace_file = fopen("AJETE/D", "w");
974  for (j = 0; j < N ; j++)
975  fprintf(trace_file,"%10d%10d%40.30e\n",j+1,1,D2[j]);
976  fclose(trace_file);
977  */
978 
979 
980 
981 /*
982  magmaDoubleComplex mydz=c_zero,mydo=c_one;
983  magmaDoubleComplex *Z = (magmaDoubleComplex *) malloc(N*N*sizeof(magmaDoubleComplex));
984  dgemm_("N","N",&N,&N,&N,&mydo,A1,&LDA1,Q1,&LDQ1,&mydz,Z,&LDA1);
985  memcpy(A1,Z,N*N*sizeof(magmaDoubleComplex));
986  memcpy(Q1,Z,N*N*sizeof(magmaDoubleComplex));
987 */
988  /*
989  trace_file = fopen("AJETE/Qf", "w");
990  for (j = 0; j < N ; j++)
991  for (i = 0; i < N ; i++)
992  fprintf(trace_file,"%10d%10d%40.30e\n",i+1,j+1,A1[j*LDA1+i]);
993  fclose(trace_file);
994 */
995 
996  if(CHECK)
997  {
998  if(WANTZ!=1) JOBZ='N';
999  if(WANTZ==1) JOBZ='V';
1000  magma_zstedc_withZ(JOBZ, N, D2, E2, Q2, LDQ2);
1001  //magma_zstedc_withZ(MagmaNoVec, N, D2, E2, Q1, LDQ1);
1002  }
1003 
1004 
1005 
1006 
1007 #if defined(PRECISION_d)
1008 #if defined(CHECKEIG)
1009  if(CHECK)
1010  {
1011  /*************************************************
1012  * CALLING LAPACK on original matrix
1013  * ***********************************************/
1014  int LWORK=256*N;
1015  magmaDoubleComplex *WORK = (magmaDoubleComplex *) malloc( LWORK * sizeof(magmaDoubleComplex) );
1016  D1 = (double *) malloc( N * sizeof(double) );
1017  double *E1 = (double *) malloc( N * sizeof(double) );
1018  magmaDoubleComplex *ALPK = (magmaDoubleComplex *) malloc (N*LDAINIT*sizeof(magmaDoubleComplex));
1019  memcpy(ALPK, AINIT, N*LDAINIT*sizeof(magmaDoubleComplex));
1020 
1021  timelpk = magma_wtime();
1022  lapackf77_zhbtrd("N", "L", &N, &NB, ALPK, &LDAINIT, D1, E1, WORK, &N, WORK, &INFO);
1023  timelpk = magma_wtime() - timelpk;
1024  printf("\n");
1025  printf(" Time ZHBTRD-MKL-LAPACK : %lf N : %10d NB : %10d \n\n\n",timelpk, N, NB );
1026  /* call eigensolver */
1027  lapackf77_dsterf(&N, D1, E1, &INFO);
1028  /* ***********************************************/
1029  // call eigensolver
1030  //dsterf_( &N, D2, E2, &INFO);
1031  //magma_zstedc_withZ(JOBZ, N, D2, E2, Q1, LDQ1);
1032  // compare result
1033  cmp_vals(N, D1, D2, &nrmI, &nrm1, &nrm2);
1034 
1035  magmaDoubleComplex *WORKAJETER;
1036  double *RWORKAJETER, *RESU;
1037  WORKAJETER = (magmaDoubleComplex *) malloc( (2* N * N + N) * sizeof(magmaDoubleComplex) );
1038  RWORKAJETER = (double *) malloc( N * sizeof(double) );
1039  RESU = (double *) malloc(10*sizeof(double));
1040  int MATYPE;
1041  memset(RESU,0,10*sizeof(double));
1042 
1043 
1044  MATYPE=2;
1045  magmaDoubleComplex NOTHING=c_zero;
1046  zcheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &LDAINIT, &NOTHING, &NOTHING, D2 , D1, Q2, &LDQ2, WORKAJETER, RWORKAJETER, RESU );
1047 
1048 
1049 
1050 
1051 
1052 
1053 
1054  printf("\n");
1055  printf(" ================================================================================================================================== \n");
1056  printf(" ==> INFO voici threads=%d N=%d NB=%d BAND=%d WANTZ=%d\n",thread,N, NB, BAND, WANTZ);
1057  printf(" ================================================================================================================================== \n");
1058  printf(" ZHBTRD : %15s \n", "STATblgv9withQ ");
1059  printf(" ================================================================================================================================== \n");
1060  if(WANTZ==1)
1061  printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]);
1062  if(WANTZ==1)
1063  printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]);
1064  printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]);
1065  printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]);
1066  printf(" ================================================================================================================================== \n\n\n");
1067 
1068  printf(" ***********************************************************************************************************************************\n");
1069  printf(" Hello here are the norm Infinite (max)=%e norm one (sum)=%e norm2(sqrt)=%e \n",nrmI, nrm1, nrm2);
1070  printf(" ***********************************************************************************************************************************\n\n");
1071 
1072 
1073  }
1074 #endif
1075 #endif
1076 
1077 
1078  magma_free( dV2 );
1079  magma_free( da );
1080  magma_free_cpu(A2);
1081  magma_free_cpu(TAU);
1082  magma_free_cpu(V);
1083  magma_free_cpu(T);
1084 
1085  return 0;
1086 } /* END ZHETRD_BHE2TRC */

Here is the call graph for this function:

void magma_zstedc_withZ ( char  JOBZ,
magma_int_t  N,
double *  D,
double *  E,
magmaDoubleComplex *  Z,
magma_int_t  LDZ 
)

Definition at line 20 of file zbulge_aux.cpp.

20  {
21  magmaDoubleComplex *WORK;
22  double *RWORK;
23  magma_int_t *IWORK;
24  magma_int_t LWORK, LIWORK, LRWORK;
25  magma_int_t INFO;
26 
27  if(JOBZ=='V'){
28  LWORK = N*N;
29  LRWORK = 1 + 3*N + 3*N*((magma_int_t)log2(N)+1) + 4*N*N+ 256*N;
30  LIWORK = 6 + 6*N + 6*N*((magma_int_t)log2(N)+1) + 256*N;
31  }else if(JOBZ=='I'){
32  LWORK = N;
33  LRWORK = 2*N*N+4*N+1+256*N;
34  LIWORK = 256*N;
35  }else if(JOBZ=='N'){
36  LWORK = N;
37  LRWORK = 256*N+1;
38  LIWORK = 256*N;
39  }else{
40  printf("ERROR JOBZ %c\n",JOBZ);
41  exit(-1);
42  }
43 
44  RWORK = (double*) malloc( LRWORK*sizeof( double) );
45  WORK = (magmaDoubleComplex*) malloc( LWORK*sizeof( magmaDoubleComplex) );
46  IWORK = (magma_int_t*) malloc( LIWORK*sizeof( magma_int_t) );
47 
48  lapackf77_zstedc(&JOBZ, &N, D, E, Z, &LDZ, WORK, &LWORK, RWORK, &LRWORK, IWORK, &LIWORK, &INFO);
49 
50  if(INFO!=0){
51  printf("=================================================\n");
52  printf("ZSTEDC ERROR OCCURED. HERE IS INFO %d \n ", (int) INFO);
53  printf("=================================================\n");
54  //assert(INFO==0);
55  }
56 
57 
58  magma_free_cpu( IWORK );
59  magma_free_cpu( WORK );
60  magma_free_cpu( RWORK );
61 }
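To make the intended use of this wrapper concrete, here is a minimal calling sketch (illustrative only, not taken from the library; the sizes and the fill-in step are assumptions). It eigen-decomposes a symmetric tridiagonal matrix stored in D (diagonal) and E (off-diagonal):

  // Minimal usage sketch, assuming common_magma.h is included.
  magma_int_t n = 1000, ldz = n;
  double *D = (double*) malloc( n * sizeof(double) );
  double *E = (double*) malloc( n * sizeof(double) );                       // only n-1 entries are used
  magmaDoubleComplex *Z = (magmaDoubleComplex*) malloc( (size_t)ldz*n*sizeof(magmaDoubleComplex) );
  /* ... fill D[0..n-1] and E[0..n-2] with the tridiagonal matrix ... */
  // JOBZ='I': LAPACK initializes Z internally and returns the eigenvectors of the
  // tridiagonal matrix; JOBZ='V' would instead back-transform a Z supplied on input.
  magma_zstedc_withZ( 'I', n, D, E, Z, ldz );                               // D holds the eigenvalues on exit
  free(D); free(E); free(Z);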
void magma_zstedx_withZ ( magma_int_t  N,
magma_int_t  NE,
double *  D,
double *  E,
magmaDoubleComplex *  Z,
magma_int_t  LDZ 
)

Definition at line 67 of file zbulge_aux.cpp.

67  {
68  double *RWORK;
69  double *dwork;
70  magma_int_t *IWORK;
71  magma_int_t LWORK, LIWORK, LRWORK;
72  magma_int_t INFO;
73 
74  LWORK = N;
75  LRWORK = 2*N*N+4*N+1+256*N;
76  LIWORK = 256*N;
77 
78  RWORK = (double*) malloc( LRWORK*sizeof( double) );
79  IWORK = (magma_int_t*) malloc( LIWORK*sizeof( magma_int_t) );
80 
81  if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*N*(N/2 + 1) )) {
82  printf("=================================================\n");
83  printf("ZSTEDC ERROR OCCURED IN CUDAMALLOC\n");
84  printf("=================================================\n");
85  return;
86  }
87  printf("using magma_zstedx\n");
88 
89 #ifdef ENABLE_TIMER
90  magma_timestr_t start, end;
91  start = get_current_time();
92 #endif
93 
94  char job = 'I';
95 
96  if(NE==N)
97  job = 'A';
98 
99  magma_zstedx(job, N, 0.,0., 1, NE, D, E, Z, LDZ, RWORK, LRWORK, IWORK, LIWORK, dwork, &INFO);
100 
101  if(INFO!=0){
102  printf("=================================================\n");
103  printf("ZSTEDC ERROR OCCURED. HERE IS INFO %d \n ", (int) INFO);
104  printf("=================================================\n");
105  //assert(INFO==0);
106  }
107 
108 #ifdef ENABLE_TIMER
109  end = get_current_time();
110  printf("time zstevx = %6.2f\n", GetTimerValue(start,end)/1000.);
111 #endif
112 
113  magma_free( dwork );
114  magma_free_cpu( IWORK );
115  magma_free_cpu( RWORK );
116 }
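A companion sketch for this routine, under the same assumptions as the previous one: when only the NE smallest eigenpairs are needed, NE < N makes the wrapper select range 'I' (indices 1..NE) internally, while NE == N selects 'A':

  // Illustrative only: request the ne smallest eigenpairs of [D, E].
  magma_int_t ne = 64;                                   // assumed value
  magma_zstedx_withZ( n, ne, D, E, Z, ldz );             // the first ne columns of Z are computed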
void magma_ztrdtype1cbHLsym_withQ ( magma_int_t  N,
magma_int_t  NB,
magmaDoubleComplex *  A,
magma_int_t  LDA,
magmaDoubleComplex *  V,
magmaDoubleComplex *  TAU,
magma_int_t  st,
magma_int_t  ed,
magma_int_t  sweep,
magma_int_t  Vblksiz 
)

Definition at line 90 of file zbulge_kernel.cpp.

References A, findVTpos(), lapackf77_zlarfg, magma_free_cpu(), MAGMA_Z_ONE, magma_zlarfxsym(), TAU, and V.

90  {
91  //magma_int_t J1, J2, J3, i, j;
92  magma_int_t len, LDX;
93  magma_int_t IONE=1;
94  magma_int_t blkid, vpos, taupos, tpos;
95  //magmaDoubleComplex conjtmp;
96  magmaDoubleComplex Z_ONE = MAGMA_Z_ONE;
97  magmaDoubleComplex *WORK = (magmaDoubleComplex *) malloc( N * sizeof(magmaDoubleComplex) );
98 
99 
100  findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
101  //printf("voici vpos %d taupos %d tpos %d blkid %d \n", vpos, taupos, tpos, blkid);
102  LDX = LDA-1;
103  len = ed-st+1;
104  *V(vpos) = Z_ONE;
105  memcpy(V(vpos+1), A(st+1, st-1), (len-1)*sizeof(magmaDoubleComplex));
106  memset(A(st+1, st-1), 0, (len-1)*sizeof(magmaDoubleComplex));
107  /* Eliminate the col at st-1 */
108  lapackf77_zlarfg( &len, A(st, st-1), V(vpos+1), &IONE, TAU(taupos) );
109  /* apply left and right on A(st:ed,st:ed)*/
110  magma_zlarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos));
111  //conjtmp = MAGMA_Z_CNJG(*TAU(taupos));
112  //lapackf77_zlarfy("L", &len, V(vpos), &IONE, &conjtmp, A(st,st), &LDX, WORK); //&(MAGMA_Z_CNJG(*TAU(taupos)))
113  magma_free_cpu(WORK);
114 }

Here is the call graph for this function:

Here is the caller graph for this function:

void magma_ztrdtype2cbHLsym_withQ ( magma_int_t  N,
magma_int_t  NB,
magmaDoubleComplex *  A,
magma_int_t  LDA,
magmaDoubleComplex *  V,
magmaDoubleComplex *  TAU,
magma_int_t  st,
magma_int_t  ed,
magma_int_t  sweep,
magma_int_t  Vblksiz 
)

Definition at line 127 of file zbulge_kernel.cpp.

References A, findVTpos(), lapackf77_zlarfg, lapackf77_zlarfx(), MAGMA_Z_CNJG, MAGMA_Z_ONE, min, TAU, and V.

127  {
128  magma_int_t J1, J2, len, lem, LDX;
129  //magma_int_t i, j;
130  magma_int_t IONE=1;
131  magma_int_t blkid, vpos, taupos, tpos;
132  magmaDoubleComplex conjtmp;
133  magmaDoubleComplex Z_ONE = MAGMA_Z_ONE;
134  //magmaDoubleComplex WORK[NB];
135  magmaDoubleComplex *WORK = (magmaDoubleComplex *) malloc( NB * sizeof(magmaDoubleComplex) );
136 
137 
138  findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
139  LDX = LDA-1;
140  J1 = ed+1;
141  J2 = min(ed+NB,N);
142  len = ed-st+1;
143  lem = J2-J1+1;
144  if(lem>0){
145  /* apply the remaining right update coming from the top block */
146  lapackf77_zlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK);
147  }
148  if(lem>1){
149  findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid);
150  /* remove the first column of the created bulge */
151  *V(vpos) = Z_ONE;
152  memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(magmaDoubleComplex));
153  memset(A(J1+1, st),0,(lem-1)*sizeof(magmaDoubleComplex));
154  /* Eliminate the col at st */
155  lapackf77_zlarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) );
156  /* apply left on A(J1:J2,st+1:ed) */
157  len = len-1; /* because we start at col st+1 instead of st; col st is the column that has been removed */
158  conjtmp = MAGMA_Z_CNJG(*TAU(taupos));
159  lapackf77_zlarfx("L", &lem, &len, V(vpos), &conjtmp, A(J1, st+1), &LDX, WORK);
160  }
161  free (WORK);
162 }

Here is the call graph for this function:

Here is the caller graph for this function:

void magma_ztrdtype3cbHLsym_withQ ( magma_int_t  N,
magma_int_t  NB,
magmaDoubleComplex *  A,
magma_int_t  LDA,
magmaDoubleComplex *  V,
magmaDoubleComplex *  TAU,
magma_int_t  st,
magma_int_t  ed,
magma_int_t  sweep,
magma_int_t  Vblksiz 
)

Definition at line 175 of file zbulge_kernel.cpp.

References A, findVTpos(), magma_free_cpu(), magma_zlarfxsym(), TAU, and V.

175  {
176  //magma_int_t J1, J2, J3, i, j;
177  magma_int_t len, LDX;
178  //magma_int_t IONE=1;
179  magma_int_t blkid, vpos, taupos, tpos;
180  //magmaDoubleComplex conjtmp;
181  magmaDoubleComplex *WORK = (magmaDoubleComplex *) malloc( N * sizeof(magmaDoubleComplex) );
182 
183 
184  findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
185  LDX = LDA-1;
186  len = ed-st+1;
187 
188  /* apply left and right on A(st:ed,st:ed)*/
189  magma_zlarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos));
190  //conjtmp = MAGMA_Z_CNJG(*TAU(taupos));
191  //lapackf77_zlarfy("L", &len, V(vpos), &IONE, &(MAGMA_Z_CNJG(*TAU(taupos))), A(st,st), &LDX, WORK);
192  magma_free_cpu(WORK);
193 }
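For clarity (a restatement, not taken from the source of magma_zlarfxsym): with v = V(vpos), tau = TAU(taupos) and len = ed-st+1, the two-sided update applied above to the Hermitian block A(st:ed, st:ed) is the Householder similarity transformation

  A := (I - tau * v * v^H) * A * (I - tau * v * v^H)^H

which magma_zlarfxsym is assumed to implement while exploiting the Hermitian structure of the block.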

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_zungqr_2stage_gpu ( magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magmaDoubleComplex *  da,
magma_int_t  ldda,
magmaDoubleComplex *  tau,
magmaDoubleComplex *  dT,
magma_int_t  nb,
magma_int_t info 
)

Definition at line 14 of file zungqr_2stage_gpu.cpp.

References __func__, da_ref, dwork, magma_free, MAGMA_SUCCESS, magma_xerbla(), magma_zlarfb_gpu(), magma_zmalloc(), magmablas_zlaset(), magmablas_zlaset_identity(), MagmaColumnwise, MagmaForward, MagmaLeft, MagmaNoTrans, MagmaUpperLower, max, min, and t_ref.

18 {
19 /* -- MAGMA (version 1.4.0) --
20  Univ. of Tennessee, Knoxville
21  Univ. of California, Berkeley
22  Univ. of Colorado, Denver
23  August 2013
24 
25  Purpose
26  =======
27  ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns,
28  which is defined as the first N columns of a product of K elementary
29  reflectors of order M
30 
31  Q = H(1) H(2) . . . H(k)
32 
33  as returned by ZGEQRF_GPU.
34 
35  Arguments
36  =========
37  M (input) INTEGER
38  The number of rows of the matrix Q. M >= 0.
39 
40  N (input) INTEGER
41  The number of columns of the matrix Q. M >= N >= 0.
42 
43  K (input) INTEGER
44  The number of elementary reflectors whose product defines the
45  matrix Q. N >= K >= 0.
46 
47  DA (input/output) COMPLEX_16 array A on the GPU device,
48  dimension (LDDA,N). On entry, the i-th column must contain
49  the vector which defines the elementary reflector H(i), for
50  i = 1,2,...,k, as returned by ZGEQRF_GPU in the first k
51  columns of its array argument A.
52  On exit, the M-by-N matrix Q.
53 
54  LDDA (input) INTEGER
55  The first dimension of the array A. LDDA >= max(1,M).
56 
57  TAU (input) COMPLEX_16 array, dimension (K)
58  TAU(i) must contain the scalar factor of the elementary
59  reflector H(i), as returned by ZGEQRF_GPU.
60 
61  DT (input) COMPLEX_16 work space array on the GPU device,
62  dimension (MIN(M, N) )*NB.
63  This must be the 6th argument of magma_zgeqrf_gpu
64  [ note that if N here is bigger than N in magma_zgeqrf_gpu,
65  the workspace requirement DT in magma_zgeqrf_gpu must be
66  as specified in this routine ].
67 
68  NB (input) INTEGER
69  This is the block size used in ZGEQRF_GPU, and correspondingly
70  the size of the T matrices, used in the factorization, and
71  stored in DT.
72 
73  INFO (output) INTEGER
74  = 0: successful exit
75  < 0: if INFO = -i, the i-th argument has an illegal value
76  ===================================================================== */
77 
78  #define da_ref(a_1,a_2) (da+(a_2)*(ldda) + (a_1))
79  #define t_ref(a_1) (dT+(a_1)*nb)
80 
81  magma_int_t i__1, i__2, i__3;
82  //magma_int_t lwork;
83  magma_int_t i, ib, ki, kk; //, iinfo;
84  //magma_int_t lddwork = min(m, n);
85  //magmaDoubleComplex *work, *panel;
86  magmaDoubleComplex *dwork;
87  //magma_queue_t stream[2];
88  magma_int_t ldt=nb; // need to be an input parameter
89 
90  *info = 0;
91  if (m < 0) {
92  *info = -1;
93  } else if ((n < 0) || (n > m)) {
94  *info = -2;
95  } else if ((k < 0) || (k > n)) {
96  *info = -3;
97  } else if (ldda < max(1,m)) {
98  *info = -5;
99  }
100  if (*info != 0) {
101  magma_xerbla( __func__, -(*info) );
102  return *info;
103  }
104 
105  if (n <= 0)
106  return *info;
107 
108  if(MAGMA_SUCCESS != magma_zmalloc( &dwork, n*nb )) {
109  printf ("!!!! zungqr_2stage magma_alloc failed for: dwork\n" );
110  exit(-1);
111  }
112 
113  if ( (nb > 1) && (nb < k) ) {
114  /* Use blocked code after the last block.
115  The first kk columns are handled by the block method.
116  ki is start of 2nd-to-last block. */
117  ki = (k - nb - 1) / nb * nb;
118  kk = min(k, ki + nb);
119 
120  /* Set A(1:kk,kk+1:n) to zero. */
121  magmablas_zlaset(MagmaUpperLower, kk, n-kk, da_ref(0,kk), ldda);
122  /* A(kk+1:m, kk+1:n) = I */
123  magmablas_zlaset_identity(m-kk, n-kk, da_ref(kk,kk), ldda);
124  }
125  else {
126  ki = 0;
127  kk = 0;
128  }
129 
130  /* Allocate work space on CPU in pinned memory */
131  //lwork = (n+m) * nb;
132  //if (kk < n)
133  // lwork = max(lwork, n * nb + (m-kk)*(n-kk));
134 
135  //if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, (lwork) )) {
136  // *info = MAGMA_ERR_HOST_ALLOC;
137  // return *info;
138  //}
139  //panel = work + n * nb;
140 
141  //magma_queue_create( &stream[0] );
142  //magma_queue_create( &stream[1] );
143  /* Use unblocked code for the last or only block. */
144  if (kk < n) {
145  i__1 = m - kk;
146  i__2 = n - kk;
147  i__3 = k - kk;
148  //cublasGetMatrix(i__1, i__2, sizeof(magmaDoubleComplex),
149  // da_ref(kk, kk), ldda, panel, i__1);
150  //lapackf77_zungqr(&i__1, &i__2, &i__3, panel, &i__1, &tau[kk],
151  // work, &lwork, &iinfo);
152  //
153  //cublasSetMatrix(i__1, i__2, sizeof(magmaDoubleComplex),
154  // panel, i__1, da_ref(kk, kk), ldda);
155 
156  magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
157  i__1, i__2, i__3,
158  da_ref(kk, kk-nb), ldda, t_ref(kk-nb), ldt,
159  da_ref(kk, kk), ldda, dwork, i__2);
160 
161  //magmablas_zlaset(MagmaUpperLower, kk-nb, nb, da_ref(0,kk-nb), ldda);
162  //magmablas_zlaset_identity(m-(kk-nb), nb, da_ref(kk-nb,kk-nb), ldda);
163  }
164 
165  if (kk > 0) {
166  /* Use blocked code */
167  for (i = ki; i >= nb; i-=nb) {
168  ib = min(nb, k - i);
169  /* Send current panel to the CPU for update */
170  i__2 = m - i;
171  //cudaMemcpy2DAsync(panel, i__2 * sizeof(magmaDoubleComplex),
172  // da_ref(i,i), ldda * sizeof(magmaDoubleComplex),
173  // sizeof(magmaDoubleComplex)*i__2, ib,
174  // cudaMemcpyDeviceToHost,stream[0]);
175  if (i + ib < n) {
176  /* Apply H to A(i:m,i+ib:n) from the left */
177  i__3 = n - i;
178 
180  magmablas_zlaset_identity(m-i, ib, da_ref(i,i), ldda);
181 
182  magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
183  i__2, i__3, ib,
184  da_ref(i, i-nb), ldda, t_ref(i-nb), ldt,
185  da_ref(i, i), ldda, dwork, i__3);
186  }
187 
188  /* Apply H to rows i:m of current block on the CPU */
189  //magma_queue_sync( stream[0] );
190  //lapackf77_zungqr(&i__2, &ib, &ib, panel, &i__2, &tau[i],
191  // work, &lwork, &iinfo);
192  //cudaMemcpy2DAsync(da_ref(i,i), ldda * sizeof(magmaDoubleComplex),
193  // panel, i__2 * sizeof(magmaDoubleComplex),
194  // sizeof(magmaDoubleComplex)*i__2, ib,
195  // cudaMemcpyHostToDevice,stream[1]);
196 
197  /* Set rows 1:i-1 of current block to zero */
198  i__2 = i + ib;
199  //magmablas_zlaset(MagmaUpperLower, i-ib, ib, da_ref(0,i-ib), ldda);
200  //magmablas_zlaset_identity(m-(i-ib), ib, da_ref(i-ib,i-ib), ldda);
201  }
202  }
203 
204  magmablas_zlaset_identity(m, nb, da_ref(0,0), ldda);
205 
206  magma_free( dwork );
207  //magma_free_pinned( work );
208  //magma_queue_destroy( stream[0] );
209  //magma_queue_destroy( stream[1] );
210 
211  return *info;
212 } /* magma_zungqr_gpu */
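A typical calling sequence for this routine, sketched under the assumption that dA and dT come from magma_zgeqrf_gpu with the block size returned by magma_get_zgeqrf_nb; the sizes, the padding of ldda, and the dT sizing are assumptions of the example, and error checking is omitted:

  magma_int_t m = 4096, n = 4096, info;
  magma_int_t minmn = (m < n ? m : n);
  magma_int_t nb    = magma_get_zgeqrf_nb( m );
  magma_int_t ldda  = ((m + 31)/32)*32;                               // padded leading dimension
  magmaDoubleComplex *dA, *dT, *tau;
  magma_zmalloc( &dA, (size_t)ldda*n );
  magma_zmalloc( &dT, (2*minmn + ((n + 31)/32)*32) * nb );            // dT sizing used by magma_zgeqrf_gpu
  magma_zmalloc_cpu( &tau, minmn );
  /* ... magma_zsetmatrix( m, n, A, lda, dA, ldda ); ... */
  magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );                 // QR factorization; T factors stored in dT
  magma_zungqr_2stage_gpu( m, n, n, dA, ldda, tau, dT, nb, &info );   // dA now holds the m-by-n Q
  /* ... use Q, then magma_free(dA); magma_free(dT); magma_free_cpu(tau); ... */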

Here is the call graph for this function:

static void * parallel_section ( void *  thread_id)
static

Definition at line 1095 of file zhetrd_bhe2trc_v3.cpp.

References barrier(), core_event_endblg, core_event_startblg, core_in_all, core_log_eventblg, gbstrct_blg::cores_num, gbstrct_blg::dQ1, gbstrct_blg::dT1, gbstrct_blg::locores_num, magma_device_sync(), magma_wtime(), magma_zungqr_2stage_gpu(), gbstrct_blg::N, gbstrct_blg::NB, gbstrct_blg::overlapQ1, sys_corenbr, tile_bulge_computeT_parallel(), tile_bulge_parallel(), gbstrct_blg::timeblg, and gbstrct_blg::WANTZ.

1096 {
1097  int my_core_id = *((int*)thread_id);
1098  int allcores_num = core_in_all.cores_num;
1099  int locores_num = core_in_all.locores_num;
1100  int overlapQ1 = core_in_all.overlapQ1;
1101  magmaDoubleComplex *dQ1 = core_in_all.dQ1;
1102  magmaDoubleComplex *dT1 = core_in_all.dT1;
1103  int N = core_in_all.N;
1104  int NB = core_in_all.NB;
1105  int WANTZ = core_in_all.WANTZ;
1106  static volatile int sys_corenbr = 1;
1107  magma_int_t INFO, i, my_newcore_id, lastcoreid;
1108  magmaDoubleComplex *NOTUSED;
1109  real_Double_t timeB=0.0, timeT=0.0, timeall=0.0, timeaplQ1=0.0;
1110 
1111 #if defined(MAGMA_SETAFFINITY)
1112  // bind threads
1113  cpu_set_t set;
1114  // bind threads
1115  CPU_ZERO( &set );
1116  CPU_SET( my_core_id, &set );
1117  sched_setaffinity( 0, sizeof(set), &set) ;
1118 #endif
1119 
1120  log_eventsblg = 1;
1121  core_event_startblg(my_core_id);
1122  barrier(my_core_id, allcores_num);
1123  core_event_endblg(my_core_id);
1124  core_log_eventblg(0x000000, my_core_id);
1125 
1126  if((my_core_id == 0)&&(overlapQ1==0)){
1127  if(locores_num!=allcores_num)
1128  printf("\n\n\n WARNING **** not all cpu used on a version where overlapQ1 is disabled\n\n\n");
1129  }
1130 
1131  // timing
1132  if (my_core_id == 0){
1133  timeall = magma_wtime();
1134  }
1135 
1136 
1137  /*################################################
1138  * WANTZ > 0
1139  *################################################*/
1140  if((WANTZ>0))
1141  {
1142  /* Compute Q1 overlapped with the bulge chasing and the computation of the T's.
1143  * If cores_num==1 only one thread is working, so the whole code is
1144  * sequential and nothing special is needed: the GPU kernel is called
1145  * first and then the bulge chasing.
1146  * Otherwise, the GPU could be driven from the last core id to avoid
1147  * changing the barrier and the original bulge code. However, we observed
1148  * that GPU performance degrades when it is driven from the last core:
1149  * it looks like CUDA reinitializes the GPU context for the new core.
1150  * So we keep the GPU with the main core 0 and change the id's instead.
1151  * To avoid many changes in the code (the bulge chasing, the T
1152  * computation, and the barrier are all based on core number "0"),
1153  * we cheat: the remaining cores [1..allcores_num-1] are told they
1154  * start at core "0" (i.e. original core "1" becomes "0", and so on),
1155  * while the GPU keeps the original core "0" which is told it is the
1156  * last core. In other words, a new core id is created: original core
1157  * "0" gets id allcores_num-1 and the remaining cores are shifted back by one.
1158  * */
1159  /************************************************
1160  * only one core is running ==> code is sequential
1161  ************************************************/
1162  if(allcores_num==1)
1163  {
1164  my_newcore_id = my_core_id;
1165  //=========================
1166  // compute Q1
1167  //=========================
1168  if(overlapQ1==1){
1169  magma_device_sync();
1170  timeaplQ1 = magma_wtime();
1171  magma_zungqr_2stage_gpu(N, N, N, dQ1, N, NOTUSED, dT1, NB, &INFO);
1172  magma_device_sync();
1173  //cublasGetMatrix( N, LDA1, sizeof(magmaDoubleComplex), da, LDA1, A1, LDA1);
1174  timeaplQ1 = magma_wtime()-timeaplQ1;
1175  printf(" Finish applyQ1 timing= %lf \n" ,timeaplQ1);
1176  }
1177 
1178  //=========================
1179  // bulge chasing
1180  //=========================
1181  if(my_newcore_id == 0)timeB = magma_wtime();
1182  tile_bulge_parallel(my_newcore_id);
1183  barrier(my_newcore_id, locores_num);
1184  if(my_newcore_id == 0){
1185  timeB = magma_wtime()-timeB;
1186  printf(" Finish BULGE timing= %lf \n" ,timeB);
1187  }
1188 
1189  //=========================
1190  // compute the T's to be used when applying Q2
1191  //=========================
1192  if(my_newcore_id == 0)timeT = magma_wtime();
1193  tile_bulge_computeT_parallel(my_newcore_id);
1194  barrier(my_newcore_id, locores_num);
1195  // timing
1196  if (my_newcore_id == 0){
1197  timeT = magma_wtime()-timeT;
1198  printf(" Finish T's timing= %lf \n" ,timeT);
1199  }
1200  /************************************************
1201  * more than one core
1202  ************************************************/
1203  }else{
1204  if(overlapQ1==1)
1205  {
1206  lastcoreid = allcores_num-1;
1207  // core id "0" becomes the last one, allcores_num-1,
1208  // and "1" becomes "0", and so on
1209  if(my_core_id==0){
1210  my_newcore_id = lastcoreid; // giving it last core id
1211  }else{
1212  my_newcore_id = my_core_id-1;
1213  }
1214  }else{
1215  lastcoreid = -1;
1216  my_newcore_id = my_core_id;
1217  }
1218 
1219 
1220  /* I am the last core in the new indexing and the original core=0 */
1221  if(my_newcore_id==lastcoreid)
1222  {
1223  //=============================================
1224  // compute Q1 on last newcoreid
1225  //=============================================
1226  magma_device_sync();
1227  timeaplQ1 = magma_wtime();
1228  magma_zungqr_2stage_gpu(N, N, N, dQ1, N, NOTUSED, dT1, NB, &INFO);
1229  magma_device_sync();
1230  //cublasGetMatrix( N, LDA1, sizeof(magmaDoubleComplex), da, LDA1, A1, LDA1);
1231  timeaplQ1 = magma_wtime()-timeaplQ1;
1232  printf(" Finish applyQ1 timing= %lf \n" ,timeaplQ1);
1233  /* I am one of the remaining cores*/
1234  }else{
1235  //=========================
1236  // bulge chasing
1237  //=========================
1238  if(my_newcore_id == 0)timeB = magma_wtime();
1239  tile_bulge_parallel(my_newcore_id);
1240  barrier(my_newcore_id, locores_num);
1241  if(my_newcore_id == 0){
1242  timeB = magma_wtime()-timeB;
1243  printf(" Finish BULGE timing= %lf \n" ,timeB);
1244  }
1245 
1246  //=========================
1247  // compute the T's to be used when applying Q2
1248  //=========================
1249  if(my_newcore_id == 0)timeT = magma_wtime();
1250  tile_bulge_computeT_parallel(my_newcore_id);
1251  barrier(my_newcore_id, locores_num);
1252  // timing
1253  if (my_newcore_id == 0){
1254  timeT = magma_wtime()-timeT;
1255  printf(" Finish T's timing= %lf \n" ,timeT);
1256  }
1257  } // END if my_newcore_id==allcores_num-1
1258 
1259  } // END of more than one core
1260 
1261  /*################################################
1262  * WANTZ = 0
1263  *################################################*/
1264  }else{
1265  my_newcore_id = my_core_id;
1266  //=========================
1267  // bulge chasing
1268  //=========================
1269  if(my_newcore_id == 0)timeB = magma_wtime();
1270  tile_bulge_parallel(my_newcore_id);
1271  barrier(my_newcore_id, locores_num);
1272  if(my_newcore_id == 0){
1273  timeB = magma_wtime()-timeB;
1274  printf(" Finish BULGE timing= %lf \n" ,timeB);
1275  }
1276  }
1277  /*################################################
1278  * END of WANTZ
1279  *################################################*/
1280 
1281 
1282 
1283 
1284  // put a barrier on all the cores to be sure that
1285  // both [1:cores_num-1] working for bulge+T and cores_num
1286  // working for Q1 have finish.
1287  barrier(my_core_id, allcores_num);
1288 
1289 
1290  // timing
1291  if (my_core_id == 0){
1292  timeall = magma_wtime()-timeall;
1293  *(core_in_all.timeblg) = timeall;
1294  }
1295 
1296 #if defined(MAGMA_SETAFFINITY)
1297  // unbind threads
1298  sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN);
1299  CPU_ZERO( &set );
1300  for(i=0; i<sys_corenbr; i++)
1301  CPU_SET( i, &set );
1302  sched_setaffinity( 0, sizeof(set), &set) ;
1303 #endif
1304 
1305  return 0;
1306 }
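To make the id remapping above concrete (a worked illustration, not program output): with allcores_num = 4 and overlapQ1 = 1, the original thread ids map as

  original my_core_id : 0 1 2 3   -->   my_newcore_id : 3 0 1 2

so original core 0, now labelled lastcoreid = 3, drives the GPU call to magma_zungqr_2stage_gpu, while original cores 1-3, now labelled 0-2, run tile_bulge_parallel and tile_bulge_computeT_parallel (assuming the caller set locores_num = 3 for the bulge workers, consistent with the warning printed above). The final barrier over allcores_num then joins both groups.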

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t plasma_ceildiv ( magma_int_t  a,
magma_int_t  b 
)

Definition at line 221 of file bulge_auxiliary.cpp.

References magma_ceildiv().

222  {
223  return magma_ceildiv(a,b);
224  }
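For example, plasma_ceildiv(10, 3) returns 4: magma_ceildiv rounds the integer quotient up, i.e. it computes (a + b - 1)/b.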

Here is the call graph for this function:

Here is the caller graph for this function:

void tile_bulge_applyQ_cpu ( magma_int_t  WANTZ,
char  SIDE,
magma_int_t  N,
magma_int_t  NB,
magma_int_t  Vblksiz,
magmaDoubleComplex *  E,
magma_int_t  LDE,
magmaDoubleComplex *  V,
magmaDoubleComplex *  TAU,
magmaDoubleComplex *  T,
magma_int_t INFO 
)
static void tile_bulge_applyQ_parallel ( int  my_core_id)
static

Definition at line 1838 of file zhetrd_bhe2trc_v3.cpp.

References core_event_endblg, core_event_startblg, core_in_all, core_log_eventblg, gbstrct_blg::cores_num, E, gbstrct_blg::E_CPU, findVTpos(), lapackf77_zlarfb, gbstrct_blg::LDE, gbstrct_blg::locores_num, LOGQ, max, min, gbstrct_blg::N, gbstrct_blg::N_CPU, gbstrct_blg::NB, gbstrct_blg::NE, plasma_ceildiv(), gbstrct_blg::SIDE, T, gbstrct_blg::T, TAU, gbstrct_blg::TAU, V, gbstrct_blg::V, gbstrct_blg::Vblksiz, and gbstrct_blg::WANTZ.

1839 {
1840  magma_int_t INFO;
1841  /* CHANGE HERE to SWITCH FROM local_cores_num to cores_num
1842  * or do it at the startup by putting core_in_all.es_num=cores_num*/
1843  magma_int_t cores_num = core_in_all.locores_num;
1844  magmaDoubleComplex *E = core_in_all.E_CPU;
1845 
1846  magmaDoubleComplex *T = core_in_all.T;
1847  magmaDoubleComplex *V = core_in_all.V;
1848  magmaDoubleComplex *TAU = core_in_all.TAU;
1849  magma_int_t N_CPU = core_in_all.N_CPU;
1850  magma_int_t NE = core_in_all.NE;
1851  magma_int_t N = core_in_all.N;
1852  magma_int_t NB = core_in_all.NB;
1853  magma_int_t Vblksiz = core_in_all.Vblksiz;
1854  magma_int_t LDE = core_in_all.LDE;
1855  char SIDE = core_in_all.SIDE;
1856  magma_int_t WANTZ = core_in_all.WANTZ;
1857 
1858  //%===========================
1859  //% local variables
1860  //%===========================
1861  magma_int_t LDT,LDV,blklen,firstcolj;
1862  magma_int_t bg, nbGblk,rownbm, k, m, n;
1863  magma_int_t st,ed,fst,vlen,vnb,colj,len;
1864  magma_int_t blkid, vpos,taupos,tpos;
1865  magmaDoubleComplex *WORK;
1866  magma_int_t LWORK;
1867  magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
1868  magma_int_t nbgr, colst, coled, versionL,versionR;
1869  magma_int_t chunkid,nbchunk,colpercore,corest,corelen;
1870  magma_int_t coreid,mycoresnb,maxrequiredcores;
1871 
1872 
1873  if(N<=0)
1874  return ;
1875  if(NE<=0)
1876  return ;
1877  if(N_CPU<=0)
1878  return ;
1879 
1880 
1881 
1882  INFO=0;
1883  versionL = 114;
1884  versionR = 92;
1885  LDT = Vblksiz;
1886  LDV = NB+Vblksiz-1;
1887  blklen = LDV*Vblksiz;
1888  nbGblk = plasma_ceildiv((N-1),Vblksiz);
1889  //LWORK = 2*N*max(Vblksiz,64);
1890  //WORK = (magmaDoubleComplex *) malloc (LWORK*sizeof(magmaDoubleComplex));
1891 
1892 
1893  /* Using colpercore = N/cores_num would split E into cores_num chunks,
1894  * giving each core one large chunk. We prefer instead to split E
1895  * into chunks of small size, so that each apply remains a BLAS-3
1896  * operation and the chunk stays in cache, which we think is better;
1897  * each core then owns several small chunks and processes each chunk
1898  * to completion before moving to the next one, which preserves
1899  * data locality.
1900  *
1901  * version v1: for each chunk, apply all the V's, then move to the
1902  * next chunk. The locality here is inside each chunk:
1903  * thread t applies V_k and then moves to V_k+1, which
1904  * overlaps with V_k, so E_k+1 overlaps with E_k. That is
1905  * where the locality is; however, thread t has to read
1906  * V_k+1 and T_k+1 at each apply, and note that if all
1907  * threads run at the same speed they may be reading the
1908  * same V_k and T_k at the same time.
1909  *
1910  *
1911  * version v2: for each V_k and T_k, thread t applies them to E_k
1912  * for all of its chunks, then moves to V_k+1 and T_k+1
1913  * and applies them to E_k+1 over all the chunks, and so
1914  * on. The difference is that the thread keeps V_k and
1915  * T_k while moving over the E_k's.
1916  * Both versions show similar performance.
1917  * */
1918  colpercore = min(NB,120); //colpercore = N makes the code sequential, running on thread 0;
1919  //colpercore = N/cores_num;
1920 
1921  colpercore = plasma_ceildiv(N_CPU,cores_num);
1922  if(colpercore>1000)
1923  colpercore = plasma_ceildiv(colpercore,10);
1924  else if(colpercore>800)
1925  colpercore = plasma_ceildiv(colpercore,8);
1926  else if(colpercore>600)
1927  colpercore = plasma_ceildiv(colpercore,6);
1928  else if(colpercore>400)
1929  colpercore = plasma_ceildiv(colpercore,4);
1930  else if(colpercore>200)
1931  colpercore = plasma_ceildiv(colpercore,2);
1932  if(colpercore>200)colpercore=120;
1933 
1934  LWORK = 2*colpercore*max(Vblksiz,64);
1935  //LWORK = 2*N_CPU*max(Vblksiz,64);
1936  WORK = (magmaDoubleComplex *) malloc (LWORK*sizeof(magmaDoubleComplex));
1937 
1938 
1939  nbchunk = plasma_ceildiv(N_CPU,colpercore);
1940 
1941  mycoresnb = cores_num;
1942  maxrequiredcores = nbchunk;
1943  if(maxrequiredcores<1)maxrequiredcores=1;
1944  if(mycoresnb > maxrequiredcores)
1945  {
1946  if(my_core_id==0)printf("==================================================================================\n");
1947  if(my_core_id==0)printf(" WARNING only %3d threads are required to run this test optimizing cache reuse\n", (int) maxrequiredcores);
1948  if(my_core_id==0)printf("==================================================================================\n");
1949  mycoresnb = maxrequiredcores;
1950  }
1951 
1952 
1953  /* SIDE LEFT means applying E = Q*E = (q_1*q_2*.....*q_n) * E ==> traverse the V's in reverse order, from q_n to q_1.
1954  * Each q_i consists of applying V to a block of rows E(row_i,:), and the applies are overlapped, meaning
1955  * that q_i+1 overlaps a portion of E(row_i, :).
1956  * In parallel, E is split into vertical blocks over the threads. */
1957  /* SIDE RIGHT means applying E = E*Q = E * (q_1*q_2*.....*q_n) ==> traverse the V's in normal order (forward), from q_1 to q_n.
1958  * Each q_i consists of applying V to a block of columns E(:, col_i), and the applies are overlapped, meaning
1959  * that q_i+1 overlaps a portion of E(:, col_i).
1960  * In parallel, E is split into horizontal blocks over the threads. */
1961 
1962  /* WANTZ = 1 means E is the IDENTITY, so Q is formed using the optimized update:
1963  * we use the reverse order, from the small q to the large one,
1964  * i.e. from q_n to q_1, as a Left update applied to the Identity.
1965  * Use versionL 113 because in 114 the whole matrix would have to be updated, not in increasing order.
1966  * WANTZ = 2 means E is a full matrix that needs to be updated from the Left or the Right, so the normal update is used.
1967  * */
1968  if(WANTZ==1)
1969  {
1970  versionL=113;
1971  SIDE='L';
1972  printf("\n\n\nWARNING THIS OPTION CUOLD NOT RUN ON BOTH GPU AND CPU. PROG WILL EXIT\n\n\n");
1973  }
1974 
1975  //printf(" ENTERING FUNCTION APPLY Q_v115: same as 113(L) or 114(L) or 93(R)"\n);
1976  if(my_core_id==0) {
1977  printf(" APPLY Q_v1 parallel with threads %d nbchunk %d colpercore %d N %d N_CPU %d NB %d Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",
1978  (int) cores_num, (int) nbchunk, (int) colpercore, (int) N, (int) N_CPU, (int) NB, (int) Vblksiz, (int) SIDE, (int) versionL, (int) versionR, (int) WANTZ);
1979  }
1980 
1981  for (chunkid = 0; chunkid<nbchunk; chunkid++)
1982  {
1983  coreid = chunkid%mycoresnb;
1984  corest = chunkid*colpercore;
1985  corelen = min(colpercore, (N_CPU-(chunkid*colpercore)));
1986 
1987  if(my_core_id==coreid)
1988  {
1989  //printf("mycore id %d voici nbchunk %d chunkid %d coreid %d, corest %d, corelen %d\n",my_core_id,nbchunk, chunkid, coreid, corest, corelen);
1990  if(SIDE=='L'){
1991  for (bg = nbGblk; bg>0; bg--)
1992  {
1993  firstcolj = (bg-1)*Vblksiz + 1;
1994  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
1995  if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
1996  for (m = rownbm; m>0; m--)
1997  {
1998  vlen = 0;
1999  vnb = 0;
2000  colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop
2001  fst = (rownbm -m)*NB+colj +1;
2002  for (k=0; k<Vblksiz; k++)
2003  {
2004  colj = (bg-1)*Vblksiz + k;
2005  st = (rownbm -m)*NB+colj +1;
2006  ed = min(st+NB-1,N-1);
2007  if(st>ed)break;
2008  if((st==ed)&&(colj!=N-2))break;
2009  vlen=ed-fst+1;
2010  vnb=k+1;
2011  }
2012  colst = (bg-1)*Vblksiz;
2013  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
2014  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
2015 
2016  if(LOGQ) core_event_startblg(my_core_id);
2017  if((vlen>0)&&(vnb>0)){
2018  //lapackf77_zungqr( "L", "N", &vlen, &corelen, &vnb, V(vpos), &LDV, TAU(taupos), E(fst,corest), &LDE, WORK, &LWORK, &INFO );
2019  //ZUNMQR_BLG( "L", "N", &vlen, &corelen, &vnb, &NB, V(vpos), &LDV, TAU(taupos), E(fst,corest), &LDE, WORK, &LWORK, &INFO );
2020  if(WANTZ==1){
2021  st = max(colst,corest);
2022  len = corelen - (st-corest);
2023  lapackf77_zlarfb( "L", "N", "F", "C", &vlen, &len, &vnb, V(vpos), &LDV, T(tpos), &LDT, E(fst,st), &LDE, WORK, &len);
2024  }else{
2025  lapackf77_zlarfb( "L", "N", "F", "C", &vlen, &corelen, &vnb, V(vpos), &LDV, T(tpos), &LDT, E(fst,corest), &LDE, WORK, &corelen); // corelen for WORK was colpercore before
2026  }
2027  }
2028  if(INFO!=0)
2029  printf("ERROR ZUNMQR INFO %d \n", (int) INFO);
2030  if(LOGQ) {
2031  core_event_endblg(my_core_id);
2032  core_log_eventblg(0xff0000, my_core_id);
2033  }
2034  }
2035  }
2036  }else if (SIDE=='R'){
2037  if(versionR==91){
2038  for (bg =1; bg<=nbGblk; bg++)
2039  {
2040  firstcolj = (bg-1)*Vblksiz + 1;
2041  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
2042  if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
2043  for (m = 1; m<=rownbm; m++)
2044  {
2045  vlen = 0;
2046  vnb = 0;
2047  // for k=0;I compute the fst and then can remove it from the loop
2048  colj = (bg-1)*Vblksiz;
2049  fst = (rownbm -m)*NB+colj +1;
2050  for (k=0; k<Vblksiz; k++)
2051  {
2052  colj = (bg-1)*Vblksiz + k;
2053  st = (rownbm -m)*NB+colj +1;
2054  ed = min(st+NB-1,N-1);
2055  if(st>ed)break;
2056  if((st==ed)&&(colj!=N-2))break;
2057  vlen=ed-fst+1;
2058  vnb=k+1;
2059  }
2060  colj = (bg-1)*Vblksiz;
2061  findVTpos(N,NB,Vblksiz,colj,fst, &vpos, &taupos, &tpos, &blkid);
2062  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colj,vpos,taupos);
2063  if((vlen>0)&&(vnb>0))
2064  lapackf77_zlarfb( "R", "N", "F", "C", &corelen, &vlen, &vnb, V(vpos), &LDV, T(tpos), &LDT, E(corest,fst), &LDE, WORK, &corelen);
2065  }
2066  }
2067  }else if(versionR==92){
2068  rownbm = plasma_ceildiv((N-1),NB);
2069  for (m = 1; m<=rownbm; m++)
2070  {
2071  ncolinvolvd = min(N-1, m*NB);
2072  avai_blksiz=min(Vblksiz,ncolinvolvd);
2073  nbgr = plasma_ceildiv(ncolinvolvd,avai_blksiz);
2074  for (n = 1; n<=nbgr; n++)
2075  {
2076  vlen = 0;
2077  vnb = 0;
2078  cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
2079  colst = (n-1)*avai_blksiz;
2080  coled = colst + cur_blksiz -1;
2081  fst = (rownbm -m)*NB+colst +1;
2082  for (colj=colst; colj<=coled; colj++)
2083  {
2084  st = (rownbm -m)*NB+colj +1;
2085  ed = min(st+NB-1,N-1);
2086  if(st>ed)break;
2087  if((st==ed)&&(colj!=N-2))break;
2088  vlen=ed-fst+1;
2089  vnb=vnb+1;
2090  }
2091  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
2092  if((vlen>0)&&(vnb>0)){
2093  lapackf77_zlarfb( "R", "N", "F", "C", &corelen, &vlen, &vnb, V(vpos), &LDV, T(tpos), &LDT, E(corest,fst), &LDE, WORK, &corelen);
2094  }
2095  }
2096  }
2097  }
2098  }else{
2099  printf("ERROR SIDE %d \n",SIDE);
2100  }
2101  } // END my_core_id=coreid
2102  } // END loop over the chunk
2103 
2104 
2105 }
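To illustrate the chunk-sizing heuristic above with concrete numbers (a worked example, not output from the code): with N_CPU = 10000 and cores_num = 16, the initial colpercore = ceil(10000/16) = 625; since 625 > 600 it is reduced to ceil(625/6) = 105, which is below the final 200 cap, so colpercore = 105 and nbchunk = ceil(10000/105) = 96. The 96 column panels of E (95 of width 105 and a last one of width 25) are then assigned round-robin to the threads via coreid = chunkid % mycoresnb.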

Here is the call graph for this function:

Here is the caller graph for this function:

void tile_bulge_applyQ_parallel2 ( int  my_core_id)
static

Definition at line 2120 of file zhetrd_bhe2trc_v3.cpp.

References core_event_endblg, core_event_startblg, core_in_all, core_log_eventblg, gbstrct_blg::cores_num, E, gbstrct_blg::E, findVTpos(), lapackf77_zlarfb, gbstrct_blg::LDE, LOGQ, max, min, gbstrct_blg::N, gbstrct_blg::NB, plasma_ceildiv(), gbstrct_blg::SIDE, T, gbstrct_blg::T, TAU, gbstrct_blg::TAU, V, gbstrct_blg::V, gbstrct_blg::Vblksiz, and gbstrct_blg::WANTZ.

2121 {
2122  magma_int_t INFO;
2123  magma_int_t cores_num = core_in_all.cores_num;
2124  magmaDoubleComplex *T = core_in_all.T;
2125  magmaDoubleComplex *E = core_in_all.E;
2126  magmaDoubleComplex *V = core_in_all.V;
2127  magmaDoubleComplex *TAU = core_in_all.TAU;
2128  magma_int_t N = core_in_all.N;
2129  magma_int_t NB = core_in_all.NB;
2130  magma_int_t Vblksiz = core_in_all.Vblksiz;
2131  magma_int_t LDE = core_in_all.LDE;
2132  char SIDE = core_in_all.SIDE;
2133  magma_int_t WANTZ = core_in_all.WANTZ;
2134 
2135  //%===========================
2136  //% local variables
2137  //%===========================
2138  magma_int_t LDT,LDV,blklen,firstcolj;
2139  magma_int_t bg, nbGblk,rownbm, k, m, n;
2140  magma_int_t st,ed,fst,vlen,vnb,colj,len;
2141  magma_int_t blkid, vpos,taupos,tpos;
2142  magmaDoubleComplex *WORK;
2143  magma_int_t LWORK;
2144  magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
2145  magma_int_t nbgr, colst, coled, versionL;
2146  magma_int_t chunkid,nbchunk,colpercore,corest,corelen;
2147  magma_int_t coreid,mycoresnb,maxrequiredcores;
2148 
2149  INFO=0;
2150  versionL = 113;
2151  LDT = Vblksiz;
2152  LDV = NB+Vblksiz-1;
2153  blklen = LDV*Vblksiz;
2154  nbGblk = plasma_ceildiv((N-1),Vblksiz);
2155  //LWORK = 2*N*max(Vblksiz,64);
2156  //WORK = (magmaDoubleComplex *) malloc (LWORK*sizeof(magmaDoubleComplex));
2157 
2158 
2159  /* Using colpercore = N/cores_num would split E into cores_num chunks,
2160  * giving each core one large chunk. We prefer instead to split E
2161  * into chunks of small size, so that each apply remains a BLAS-3
2162  * operation and the chunk stays in cache, which we think is better;
2163  * each core then owns several small chunks and processes each chunk
2164  * to completion before moving to the next one, which preserves
2165  * data locality.
2166  *
2167  * version v1: for each chunk, apply all the V's, then move to the
2168  * next chunk. The locality here is inside each chunk:
2169  * thread t applies V_k and then moves to V_k+1, which
2170  * overlaps with V_k, so E_k+1 overlaps with E_k. That is
2171  * where the locality is; however, thread t has to read
2172  * V_k+1 and T_k+1 at each apply, and note that if all
2173  * threads run at the same speed they may be reading the
2174  * same V_k and T_k at the same time.
2175  *
2176  *
2177  * version v2: for each V_k and T_k, thread t applies them to E_k
2178  * for all of its chunks, then moves to V_k+1 and T_k+1
2179  * and applies them to E_k+1 over all the chunks, and so
2180  * on. The difference is that the thread keeps V_k and
2181  * T_k while moving over the E_k's.
2182  * Both versions show similar performance.
2183  * */
2184  colpercore = min(NB,120); //colpercore = N makes the code sequential, running on thread 0;
2185  //colpercore = N/cores_num;
2186 
2187  colpercore = plasma_ceildiv(N,cores_num);
2188  if(colpercore>1000)
2189  colpercore = plasma_ceildiv(colpercore,10);
2190  else if(colpercore>800)
2191  colpercore = plasma_ceildiv(colpercore,8);
2192  else if(colpercore>600)
2193  colpercore = plasma_ceildiv(colpercore,6);
2194  else if(colpercore>400)
2195  colpercore = plasma_ceildiv(colpercore,4);
2196  else if(colpercore>200)
2197  colpercore = plasma_ceildiv(colpercore,2);
2198  if(colpercore>200)colpercore=120;
2199 
2200 
2201 
2202 
2203 
2204  LWORK = 2*colpercore*max(Vblksiz,64);
2205  //LWORK = 2*N*max(Vblksiz,64);
2206  WORK = (magmaDoubleComplex *) malloc (LWORK*sizeof(magmaDoubleComplex));
2207 
2208 
2209  nbchunk = plasma_ceildiv(N,colpercore);
2210 
2211  mycoresnb = cores_num;
2212  maxrequiredcores = nbchunk;
2213  if(maxrequiredcores<1)maxrequiredcores=1;
2214  if(mycoresnb > maxrequiredcores)
2215  {
2216  if(my_core_id==0)printf("==================================================================================\n");
2217  if(my_core_id==0)printf(" WARNING only %3d threads are required to run this test optimizing cache reuse\n", (int) maxrequiredcores);
2218  if(my_core_id==0)printf("==================================================================================\n");
2219  mycoresnb = maxrequiredcores;
2220  }
2221 
2222  /* SIDE LEFT means applying E = Q*E = (q_1*q_2*.....*q_n) * E ==> traverse the V's in reverse order, from q_n to q_1.
2223  * E is split by rows, so each apply works on a block of rows (a horizontal block). */
2224  /* SIDE RIGHT means applying E = E*Q = E * (q_1*q_2*.....*q_n) ==> traverse the V's in normal order (forward), from q_1 to q_n.
2225  * E is split by columns, so each apply works on a block of columns (a vertical block). */
2226 
2227  /* WANTZ = 1 means E is the IDENTITY, so Q is formed using the optimized update:
2228  * we use the reverse order, from the small q to the large one,
2229  * i.e. from q_n to q_1, as a Left update applied to the Identity.
2230  * Use versionL 113 because in 114 the whole matrix would have to be updated, not in increasing order.
2231  * WANTZ = 2 means E is a full matrix that needs to be updated from the Left or the Right, so the normal update is used.
2232  * */
2233  if(WANTZ==1)
2234  {
2235  versionL=113;
2236  SIDE='L';
2237  }
2238 
2239  //printf(" ENTERING FUNCTION APPLY Q_v115: same as 113(L) or 114(L) or 93(R)"\n);
2240  if(my_core_id==0) {
2241  printf(" APPLY Q_v2 parallel with threads %d nbchunk %d colpercore %d N %d NB %d Vblksiz %d SIDE %c versionL %d \n",
2242  (int) cores_num, (int) nbchunk, (int) colpercore, (int) N, (int) NB, (int) Vblksiz, (int) SIDE, (int) versionL);
2243  }
2244 
2245  //printf("mycore id %d voici nbchunk %d chunkid %d coreid %d, corest %d, corelen %d\n",my_core_id,nbchunk, chunkid, coreid, corest, corelen);
2246  if(SIDE=='L'){
2247  if(versionL==113){
2248  for (bg = nbGblk; bg>0; bg--)
2249  {
2250  firstcolj = (bg-1)*Vblksiz + 1;
2251  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
2252  if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
2253  for (m = rownbm; m>0; m--)
2254  {
2255  vlen = 0;
2256  vnb = 0;
2257  colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop
2258  fst = (rownbm -m)*NB+colj +1;
2259  for (k=0; k<Vblksiz; k++)
2260  {
2261  colj = (bg-1)*Vblksiz + k;
2262  st = (rownbm -m)*NB+colj +1;
2263  ed = min(st+NB-1,N-1);
2264  if(st>ed)break;
2265  if((st==ed)&&(colj!=N-2))break;
2266  vlen=ed-fst+1;
2267  vnb=k+1;
2268  }
2269  colst = (bg-1)*Vblksiz;
2270  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
2271  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
2272 
2273  if((vlen>0)&&(vnb>0)){
2274  if(LOGQ) core_event_startblg(my_core_id);
2275  for (chunkid = 0; chunkid<nbchunk; chunkid++)
2276  {
2277  coreid = chunkid%mycoresnb;
2278  corest = chunkid*colpercore;
2279  corelen = min(colpercore, (N-(chunkid*colpercore)));
2280  if(my_core_id==coreid)
2281  {
2282  if(WANTZ==1){
2283  st = max(colst,corest);
2284  len = corelen - (st-corest);
2285  lapackf77_zlarfb( "L", "N", "F", "C", &vlen, &len, &vnb, V(vpos), &LDV, T(tpos), &LDT, E(fst,st), &LDE, WORK, &len);
2286  }else{
2287  lapackf77_zlarfb( "L", "N", "F", "C", &vlen, &corelen, &vnb, V(vpos), &LDV, T(tpos), &LDT, E(fst,corest), &LDE, WORK, &corelen); // corelen for WORK was colpercore before
2288  } // end WANTZ
2289  } // END my_core_id=coreid
2290  } // END loop over the chunk
2291  if(LOGQ) {
2292  core_event_endblg(my_core_id);
2293  core_log_eventblg(0xff0000, my_core_id);
2294  }
2295  } // end if vlen and vnb
2296  } // end for m:rowmnb
2297  } // end for bg
2298  }else if(versionL==114){
2299  rownbm = plasma_ceildiv((N-1),NB);
2300  for (m = rownbm; m>0; m--)
2301  {
2302  ncolinvolvd = min(N-1, m*NB);
2303  avai_blksiz=min(Vblksiz,ncolinvolvd);
2304  nbgr = plasma_ceildiv(ncolinvolvd,avai_blksiz);
2305  for (n = nbgr; n>0; n--)
2306  {
2307  vlen = 0;
2308  vnb = 0;
2309  cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
2310  colst = (n-1)*avai_blksiz;
2311  coled = colst + cur_blksiz -1;
2312  fst = (rownbm -m)*NB+colst +1;
2313  for (colj=colst; colj<=coled; colj++)
2314  {
2315  st = (rownbm -m)*NB+colj +1;
2316  ed = min(st+NB-1,N-1);
2317  if(st>ed)break;
2318  if((st==ed)&&(colj!=N-2))break;
2319  vlen=ed-fst+1;
2320  vnb=vnb+1;
2321  }
2322  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
2323  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
2324  if((vlen>0)&&(vnb>0)){
2325  for (chunkid = 0; chunkid<nbchunk; chunkid++)
2326  {
2327  coreid = chunkid%mycoresnb;
2328  corest = chunkid*colpercore;
2329  corelen = min(colpercore, (N-(chunkid*colpercore)));
2330  if(my_core_id==coreid)
2331  {
2332  lapackf77_zlarfb( "L", "N", "F", "C", &vlen, &corelen, &vnb, V(vpos), &LDV, T(tpos), &LDT, E(fst,corest), &LDE, WORK, &corelen);
2333  } // END my_core_id=coreid
2334  } // END loop over the chunk
2335  } // end if vlen and vnb
2336  } // end for n = nbgr
2337  } // end for m=rowmnb
2338  }
2339  }else if (SIDE=='R'){
2340  for (bg =1; bg<=nbGblk; bg++)
2341  {
2342  firstcolj = (bg-1)*Vblksiz + 1;
2343  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
2344  if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // the last block has size 1; used in the complex case to handle A(N,N-1)
2345  for (m = 1; m<=rownbm; m++)
2346  {
2347  vlen = 0;
2348  vnb = 0;
2349  // for k=0: compute fst here so it can be hoisted out of the loop
2350  colj = (bg-1)*Vblksiz;
2351  fst = (rownbm -m)*NB+colj +1;
2352  for (k=0; k<Vblksiz; k++)
2353  {
2354  colj = (bg-1)*Vblksiz + k;
2355  st = (rownbm -m)*NB+colj +1;
2356  ed = min(st+NB-1,N-1);
2357  if(st>ed)break;
2358  if((st==ed)&&(colj!=N-2))break;
2359  vlen=ed-fst+1;
2360  vnb=k+1;
2361  }
2362  colj = (bg-1)*Vblksiz;
2363  findVTpos(N,NB,Vblksiz,colj,fst, &vpos, &taupos, &tpos, &blkid);
2364  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
2365  if((vlen>0)&&(vnb>0)){
2366  for (chunkid = 0; chunkid<nbchunk; chunkid++)
2367  {
2368  coreid = chunkid%mycoresnb;
2369  corest = chunkid*colpercore;
2370  corelen = min(colpercore, (N-(chunkid*colpercore)));
2371  if(my_core_id==coreid)
2372  {
2373  lapackf77_zlarfb( "R", "N", "F", "C", &corelen, &vlen, &vnb, V(vpos), &LDV, T(tpos), &LDT, E(corest,fst), &LDE, WORK, &corelen);
2374  } // END my_core_id=coreid
2375  } // END loop over the chunk
2376  } // end if vlen and vnb
2377  } // end for m=rowmnb
2378  } // end for bg
2379  }else{
2380  printf("ERROR SIDE %d \n",SIDE);
2381  }
2382 
2383 
2384 }
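The loops above split the N columns of E into chunks of colpercore columns: chunk chunkid starts at column chunkid*colpercore and is handled by core chunkid % mycoresnb, so each lapackf77_zlarfb call updates a disjoint vertical block of E. Below is a minimal standalone sketch of that chunking arithmetic only (illustrative sizes, not MAGMA source):

// Illustrative sketch (not MAGMA source): how columns of E are chunked across cores.
#include <algorithm>
#include <cstdio>

int main()
{
    // Example sizes; in the routine above these come from N, the tile width, and the thread count.
    int N = 1000, cores_num = 4, colpercore = 300;
    int nbchunk = (N + colpercore - 1) / colpercore;    // ceil(N / colpercore)

    for (int chunkid = 0; chunkid < nbchunk; ++chunkid) {
        int coreid  = chunkid % cores_num;              // round-robin chunk -> core
        int corest  = chunkid * colpercore;             // first column of this chunk
        int corelen = std::min(colpercore, N - corest); // last chunk may be shorter
        std::printf("chunk %d: columns [%d, %d) handled by core %d\n",
                    chunkid, corest, corest + corelen, coreid);
    }
    return 0;
}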

Here is the call graph for this function:

static void tile_bulge_computeT_parallel ( int my_core_id )

Definition at line 1754 of file zhetrd_bhe2trc_v3.cpp.

References core_in_all, gbstrct_blg::cores_num, findVTpos(), findVTsiz(), lapackf77_zlarft, gbstrct_blg::locores_num, min, gbstrct_blg::N, gbstrct_blg::NB, plasma_ceildiv(), T, gbstrct_blg::T, TAU, gbstrct_blg::TAU, V, gbstrct_blg::V, and gbstrct_blg::Vblksiz.

1755 {
1756  magma_int_t INFO;
1757  /* CHANGE HERE to SWITCH FROM local_cores_num to cores_num
1758  * or do it at the startup by putting core_in_all.es_num=cores_num*/
1759  magma_int_t cores_num = core_in_all.locores_num;
1760  magmaDoubleComplex *T = core_in_all.T;
1761  magmaDoubleComplex *V = core_in_all.V;
1762  magmaDoubleComplex *TAU = core_in_all.TAU;
1763  magma_int_t N = core_in_all.N;
1764  magma_int_t NB = core_in_all.NB;
1765  magma_int_t Vblksiz = core_in_all.Vblksiz;
1766 
1767  //%===========================
1768  //% local variables
1769  //%===========================
1770  magma_int_t LDT, LDV,blklen,firstcolj;
1771  magma_int_t bg, nbGblk,rownbm, k, m, n;
1772  magma_int_t st,ed,fst,vlen,vnb,colj;
1773  magma_int_t blkid,vpos,taupos,tpos;
1774  magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
1775  magma_int_t nbgr, colst, coled, version;
1776  magma_int_t blkpercore,blkcnt, myid;
1777 
1778 
1779  if(N<=0)
1780  return ;
1781 
1782  findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
1783  blkpercore = blkcnt/cores_num;
1784 
1785  LDT = Vblksiz;
1786  LDV = NB+Vblksiz-1;
1787  blklen = LDV*Vblksiz;
1788  nbGblk = plasma_ceildiv((N-1),Vblksiz);
1789 
1790  if(my_core_id==0) {
1791  printf(" COMPUTE T parallel threads %d with N %d NB %d Vblksiz %d \n",
1792  (int) cores_num, (int) N, (int) NB, (int) Vblksiz);
1793  }
1794 
1795  for (bg = nbGblk; bg>0; bg--)
1796  {
1797  firstcolj = (bg-1)*Vblksiz + 1;
1798  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
1799  if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // the last block has size 1; used in the complex case to handle A(N,N-1)
1800  for (m = rownbm; m>0; m--)
1801  {
1802  vlen = 0;
1803  vnb = 0;
1804  colj = (bg-1)*Vblksiz; // for k=0: compute fst here so it can be hoisted out of the loop
1805  fst = (rownbm -m)*NB+colj +1;
1806  for (k=0; k<Vblksiz; k++)
1807  {
1808  colj = (bg-1)*Vblksiz + k;
1809  st = (rownbm -m)*NB+colj +1;
1810  ed = min(st+NB-1,N-1);
1811  if(st>ed)break;
1812  if((st==ed)&&(colj!=N-2))break;
1813  vlen=ed-fst+1;
1814  vnb=k+1;
1815  }
1816  colj = (bg-1)*Vblksiz;
1817  findVTpos(N,NB,Vblksiz,colj,fst, &vpos, &taupos, &tpos, &blkid);
1818  myid = blkid/blkpercore;
1819  if(my_core_id==(myid%cores_num)){
1820  if((vlen>0)&&(vnb>0))
1821  lapackf77_zlarft( "F", "C", &vlen, &vnb, V(vpos), &LDV, TAU(taupos), T(tpos), &LDT);
1822  }
1823  }
1824  }
1825 }
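In the T computation above, ownership of each Householder block is static: blkpercore = blkcnt/cores_num consecutive blocks go to each core, and block blkid is formed (via lapackf77_zlarft) by core (blkid/blkpercore) % cores_num. The sketch below isolates that ownership rule with a hypothetical helper name; it also adds a guard for blkcnt < cores_num that the listing above does not include:

// Illustrative sketch (not a MAGMA routine): static block-to-core ownership for the T factors.
#include <cstdio>

static int owner_of_block(int blkid, int blkcnt, int cores_num)
{
    int blkpercore = blkcnt / cores_num;   // consecutive blocks per core (integer division)
    if (blkpercore == 0) blkpercore = 1;   // guard added here for blkcnt < cores_num
    int myid = blkid / blkpercore;         // index of the group this block belongs to
    return myid % cores_num;               // wrap the groups around the available cores
}

int main()
{
    int blkcnt = 10, cores_num = 4;
    for (int blkid = 0; blkid < blkcnt; ++blkid)
        std::printf("block %d -> core %d\n", blkid, owner_of_block(blkid, blkcnt, cores_num));
    return 0;
}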

Here is the call graph for this function:

Here is the caller graph for this function:

static void tile_bulge_parallel ( int my_core_id )

Definition at line 1527 of file zhetrd_bhe2trc_v3.cpp.

References A, gbstrct_blg::A, gbstrct_blg::BAND, core_event_endblg, core_event_startblg, core_in_all, core_log_eventblg, gbstrct_blg::cores_num, gbstrct_blg::grsiz, gbstrct_blg::LDA, gbstrct_blg::locores_num, LOG, magma_ztrdtype1cbHLsym_withQ(), magma_ztrdtype2cbHLsym_withQ(), magma_ztrdtype3cbHLsym_withQ(), min, gbstrct_blg::N, gbstrct_blg::NB, gbstrct_blg::NBTILES, gbstrct_blg::ss_prog, TAU, gbstrct_blg::TAU, V, gbstrct_blg::V, and gbstrct_blg::Vblksiz.

1528 {
1529  int core;
1530  int INFO;
1531  /* CHANGE HERE to SWITCH FROM local_cores_num to cores_num
1532  * or do it at the startup by putting core_in_all.es_num=cores_num*/
1533  int cores_num = core_in_all.locores_num;
1534  magmaDoubleComplex *A = core_in_all.A;
1535  magmaDoubleComplex *V = core_in_all.V;
1536  magmaDoubleComplex *TAU = core_in_all.TAU;
1537  int N = core_in_all.N;
1538  int NB = core_in_all.NB;
1539  int NBTILES = core_in_all.NBTILES;
1540  int LDA= core_in_all.LDA;
1541  int BAND= core_in_all.BAND;
1542  int grsiz= core_in_all.grsiz;
1543  int Vblksiz = core_in_all.Vblksiz;
1544  volatile int *prog = core_in_all.ss_prog;
1545  //%===========================
1546  //% local variables
1547  //%===========================
1548  int sweepid, myid, shift, stt, st, ed, stind, edind;
1549  int blklastind, colpt, algtype;
1550  int stepercol,mylastid,grnb,grid;
1551  int i,j,m,k;
1552  int thgrsiz, thgrnb, thgrid, thed;
1553  int coreinit,loopfirsttask,coreid;
1554  int colblktile,maxrequiredcores,colpercore,mycoresnb;
1555  int fin;
1556 
1557 
1558  if(N<=0)
1559  return ;
1560 
1561 
1562  //printf("=================> my core id %d of %d \n",my_core_id, cores_num);
1563 
1564  if((BAND!=0) && (BAND!=6) && (BAND!=62) && (BAND!=63)){
1565  if(my_core_id==0)printf(" ===============================================================================\n");
1566  if(my_core_id==0)printf(" ATTENTION ========> BAND is required to be 0 6 62 63, program will exit\n");
1567  if(my_core_id==0)printf(" ===============================================================================\n");
1568  return;
1569  }
1570  /* Since V is stored in the V vector, there is overlap between
1571  * tasks, so shift is now 4 and the group size must always be a
1572  * multiple of 2. For example, if grs=1, task 2 of sweep 2 could
1573  * run with task 6 of sweep 1, but task 2 of sweep 2 would
1574  * overwrite the V of task 5 of sweep 1, which is still used by
1575  * task 6. Keeping the group size a multiple of 2 ensures that
1576  * task 2 of sweep 2 never runs with task 6 of sweep 1.
1577  * However, when storing V in A, shift could go back to 3.
1578  * */
1579 
1580  mycoresnb = cores_num;
1581  //grsiz = 2;
1582  shift = 5;
1583  if(grsiz==1)
1584  colblktile=1;
1585  else
1586  colblktile=grsiz/2;
1587 
1588  maxrequiredcores = NBTILES/colblktile;
1589  if(maxrequiredcores<1)maxrequiredcores=1;
1590  colpercore = colblktile*NB;
1591  if(mycoresnb > maxrequiredcores)
1592  {
1593  if(my_core_id==0)printf("==================================================================================\n");
1594  if(my_core_id==0)printf(" WARNING only %3d threads are required to run this test optimizing cache reuse\n",maxrequiredcores);
1595  if(my_core_id==0)printf("==================================================================================\n");
1596  mycoresnb = maxrequiredcores;
1597  }
1598  thgrsiz = N;//mycoresnb;
1599 
1600  if(my_core_id==0) printf(" Static bulgechasing version v9_9col threads %4d N %5d NB %5d grs %4d thgrsiz %4d BAND %4d\n",cores_num, N, NB, grsiz,thgrsiz,BAND);
1601 
1602 
1603 
1604 
1605 
1606  i = shift/grsiz;
1607  stepercol = i*grsiz == shift ? i:i+1;
1608 
1609  i = (N-1)/thgrsiz;
1610  thgrnb = i*thgrsiz == (N-1) ? i:i+1;
1611 
1612  for (thgrid = 1; thgrid<=thgrnb; thgrid++){
1613  stt = (thgrid-1)*thgrsiz+1;
1614  thed = min( (stt + thgrsiz -1), (N-1));
1615  for (i = stt; i <= N-1; i++){
1616  ed=min(i,thed);
1617  if(stt>ed)break;
1618  for (m = 1; m <=stepercol; m++){
1619  st=stt;
1620  for (sweepid = st; sweepid <=ed; sweepid++){
1621 
1622  for (k = 1; k <=grsiz; k++){
1623  myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k;
1624  if(myid%2 ==0){
1625  colpt = (myid/2)*NB+1+sweepid-1;
1626  stind = colpt-NB+1;
1627  edind = min(colpt,N);
1628  blklastind = colpt;
1629  if(stind>=edind){
1630  printf("ERROR---------> st>=ed %d %d \n\n",stind, edind);
1631  exit(-10);
1632  }
1633  }else{
1634  colpt = ((myid+1)/2)*NB + 1 +sweepid -1 ;
1635  stind = colpt-NB+1;
1636  edind = min(colpt,N);
1637  if( (stind>=edind-1) && (edind==N) )
1638  blklastind=N;
1639  else
1640  blklastind=0;
1641  if(stind>edind){
1642  printf("ERROR---------> st>=ed %d %d \n\n",stind, edind);
1643  exit(-10);
1644  }
1645  }
1646 
1647  coreid = (stind/colpercore)%mycoresnb;
1648 
1649 
1650  //printf(" current col %3d sweep %3d myid %3d coreid %7d my_core_id %3d ---------------------- st %2d ed %2d \n",i,sweepid, myid, coreid,my_core_id, stind, edind);
1651  //printf("MYID %2d prog %3d %3d %3d %3d %3d %3d %3d \n",my_core_id,prog[0],prog[1],prog[2],prog[3],prog[4],prog[5],prog[6]);
1652 
1653  if(my_core_id==coreid)
1654  {
1655  //printf("--> current col %3d sweep %3d myid %3d my_core_id %3d prog[myid-1] %3d prog[myid+shiftd] %3d\n",i,sweepid, myid,my_core_id,prog[myid-1], prog[myid+shift]);
1656  //__sync_synchronize();
1657  //fflush(stdout);
1658 
1659 
1660  fin=0;
1661  while(fin==0)
1662  {
1663  if(myid==1)
1664  {
1665  if( (prog[myid+shift-1]== (sweepid-1)) )
1666  {
1667 
1668  if(LOG) core_event_startblg(my_core_id);
1669  /*
1670  if(BAND==0)
1671  TRD_type1cHLsym_withQ(N, NB, A, LDA, V, TAU, stind, edind, sweepid, Vblksiz);
1672  else if(BAND==6)*/
1673  magma_ztrdtype1cbHLsym_withQ(N, NB, A, LDA, V, TAU, stind, edind, sweepid, Vblksiz);/*
1674  else if(BAND==62)
1675  TRD_hbcelr_v62sym_withQ(N, NB, A, LDA, V, TAU, stind, edind, myid, sweepid, Vblksiz);*/
1676  if(LOG) {
1677  core_event_endblg(my_core_id);
1678  core_log_eventblg(0x006680, my_core_id);
1679  }
1680 
1681  fin=1;
1682  prog[myid]= sweepid;
1683  if(blklastind >= (N-1))
1684  {
1685  for (j = 1; j <= shift; j++)
1686  prog[myid+j]=sweepid;
1687  }
1688  } // END progress condition
1689  }else{
1690  if( (prog[myid-1]==sweepid) && (prog[myid+shift-1]== (sweepid-1)) )
1691  {
1692 
1693  if(LOG) core_event_startblg(my_core_id);
1694  /*
1695  if(BAND==0){
1696  if(myid%2 == 0)
1697  TRD_type2cHLsym_withQ(N, NB, A, LDA, V, TAU, stind, edind, sweepid, Vblksiz);
1698  else
1699  TRD_type3cHLsym_withQ(N, NB, A, LDA, V, TAU, stind, edind, sweepid, Vblksiz);
1700  }else if(BAND==6){*/
1701  if(myid%2 == 0)
1702  magma_ztrdtype2cbHLsym_withQ(N, NB, A, LDA, V, TAU, stind, edind, sweepid, Vblksiz);
1703  else
1704  magma_ztrdtype3cbHLsym_withQ(N, NB, A, LDA, V, TAU, stind, edind, sweepid, Vblksiz);/*
1705  }else if(BAND==62){
1706  TRD_hbcelr_v62sym_withQ(N, NB, A, LDA, V, TAU, stind, edind, myid, sweepid, Vblksiz);
1707  }*/
1708  if(LOG) {
1709  core_event_endblg(my_core_id);
1710  core_log_eventblg(0xff0000, my_core_id);
1711  }
1712 
1713  fin=1;
1714  prog[myid]= sweepid;
1715  if(blklastind >= (N-1))
1716  {
1717  for (j = 1; j <= shift+mycoresnb; j++)
1718  prog[myid+j]=sweepid;
1719  }
1720  } // END progress condition
1721  } // END if myid==1
1722  } // END while loop
1723 
1724  } // END if my_core_id==coreid
1725 
1726  if(blklastind >= (N-1))
1727  {
1728  stt=stt+1;
1729  break;
1730  }
1731  } // END for k=1:grsiz
1732  } // END for sweepid=st:ed
1733  } // END for m=1:stepercol
1734  } // END for i=1:N-1
1735 //barrier(my_core_id, cores_num);
1736  } // END for thgrid=1:thgrnb
1737 
1738 
1739 
1740 
1741 // Temporary solution for the complex version when A(N,N-1) is complex: it avoids making it real outside of this function,
1742 // which would require multiplying the col/row of the eigenvectors or Q by the scalar that makes A(N,N-1) real.
1743 // barrier(my_core_id, cores_num);
1744 // if(my_core_id == 0)magma_ztrdtype1cbHLsym_withQ(N, NB, A, LDA, V, TAU, N, N, N-1, Vblksiz);
1745 
1746 } // END FUNCTION
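The spin loop in the listing above gates each bulge-chasing task on the shared progress array prog: task myid of sweep sweepid may start only when its predecessor in the same sweep is done (prog[myid-1] == sweepid) and the task shift positions ahead has finished the previous sweep (prog[myid+shift-1] == sweepid-1), which protects the shared V storage; the first task of a sweep (myid == 1) only needs the second condition. A minimal sketch of that readiness test (illustrative code, not a MAGMA function):

// Illustrative sketch (not MAGMA source): readiness test against the shared progress array.
#include <cstdio>

static bool task_ready(const volatile int *prog, int myid, int sweepid, int shift)
{
    if (myid == 1)                                        // first task of a sweep: only wait
        return prog[myid + shift - 1] == sweepid - 1;     // on the previous sweep
    // later tasks: predecessor of the same sweep is done, and the task `shift`
    // positions ahead has finished the previous sweep (so the V storage is free)
    return prog[myid - 1] == sweepid
        && prog[myid + shift - 1] == sweepid - 1;
}

int main()
{
    volatile int prog[16] = {0};
    prog[1] = 1;   // pretend task 1 of sweep 1 has completed
    std::printf("task 2 of sweep 1 ready? %d\n", task_ready(prog, 2, 1, 5));
    return 0;
}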

Here is the call graph for this function:

Here is the caller graph for this function:

void zcheck_eig_ ( char *  JOBZ,
magma_int_t *  MATYPE,
magma_int_t *  N,
magma_int_t *  NB,
magmaDoubleComplex *  A,
magma_int_t *  LDA,
double *  AD,
double *  AE,
double *  D1,
double *  EIG,
magmaDoubleComplex *  Z,
magma_int_t *  LDZ,
magmaDoubleComplex *  WORK,
double *  RWORK,
double *  RESU 
)

Here is the caller graph for this function:

Variable Documentation

volatile int barrier_in[MAX_THREADS_BLG]

Definition at line 81 of file zhetrd_bhe2trc_v3.cpp.

volatile int barrier_out[MAX_THREADS_BLG]

Definition at line 82 of file zhetrd_bhe2trc_v3.cpp.

struct gbstrct_blg core_in_all

Definition at line 85 of file zhetrd_bhe2trc_v3.cpp.

int log_eventsblg

Definition at line 90 of file zhetrd_bhe2trc_v3.cpp.

volatile int* ss_prog

Definition at line 83 of file zhetrd_bhe2trc_v3.cpp.
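
The per-thread flag arrays documented above (barrier_in, barrier_out) are the kind of state a simple spin barrier keeps, with thread 0 acting as coordinator. The following sketch is only an assumption-based illustration of that common pattern; it is not the barrier() implementation in zhetrd_bhe2trc_v3.cpp.

// Illustrative sketch only: a two-array spin barrier in the style suggested by
// the in/out flag arrays above. Hypothetical names (demo_*), single-threaded demo.
#include <cstdio>

#define MAX_THREADS_DEMO 64

static volatile int demo_in[MAX_THREADS_DEMO];
static volatile int demo_out[MAX_THREADS_DEMO];

static void demo_barrier(int my_core_id, int cores_num)
{
    if (my_core_id == 0) {
        for (int i = 1; i < cores_num; ++i)      // wait until every thread has checked in
            while (demo_in[i] == 0) { /* spin */ }
        for (int i = 1; i < cores_num; ++i) {    // reset check-in flags and release everyone
            demo_in[i]  = 0;
            demo_out[i] = 1;
        }
    } else {
        demo_in[my_core_id] = 1;                 // check in
        while (demo_out[my_core_id] == 0) { /* spin */ }
        demo_out[my_core_id] = 0;                // consume the release flag
    }
}

int main()
{
    demo_barrier(0, 1);   // trivial single-thread call just to show the interface
    std::printf("barrier passed\n");
    return 0;
}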