PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
pdsbrdt.c
Go to the documentation of this file.
1 
17 #include "common.h"
18 #include <lapacke.h>
19 
20 #undef COMPLEX
21 #define REAL
22 
23 #define DEP(m) &(DEP[m])
24 #define A(_m, _n) (double *)plasma_geteltaddr(&A, (_m), (_n), eltsize)
25 /***************************************************************************/
29  PLASMA_desc A, double *D, double *E, PLASMA_desc T,
30  PLASMA_sequence *sequence, PLASMA_request *request)
31 {
34 
35 #ifdef COMPLEX
36  static double zone = (double) 1.0;
37  static double dzero = (double) 0.0;
38  double ztmp;
39  double absztmp;
40 #endif
41 
42  double *C, *S;
43  int N, NB, INgrsiz, INthgrsiz, BAND;
44  int myid, grsiz, shift=3, stt, st, ed, stind, edind;
45  int blklastind, colpt, PCOL, ACOL, MCOL;
46  int stepercol, mylastid, grnb, grid;
47  int *DEP,*MAXID;
48  int i, j, m;
49  int thgrsiz, thgrnb, thgrid, thed;
50  size_t eltsize = plasma_element_size(A.dtyp);
51 
52  plasma = plasma_context_self();
53  if (sequence->status != PLASMA_SUCCESS)
54  return;
55 
56  QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
57 
58  N = A.m;
59  NB = A.mb;
60 
61  /* Quick return */
62  if (N == 0){
63  return;
64  }
65 
66  if (NB == 0) {
67  memset(D, 0, N*sizeof(double));
68  memset(E, 0, (N-1)*sizeof(double));
69 #ifdef COMPLEX
70  for (i=0; i<N; i++)
71  D[i] = fabs(*A(i,i));
72 #else
73  for (i=0; i<N; i++)
74  D[i] = *A(i,i);
75 #endif
76  return;
77  }
78 
79  /*
80  * Barrier is used because the bulge have to wait until
81  * the reduction to band has been finish.
82  * otherwise, I can remove this BARRIER when I integrate
83  * the function dependencies link inside the reduction to
84  * band. Keep in min the case when NB=1, where no bulge-chasing.
85  */
86  /***************************************************************/
87  QUARK_Barrier(plasma->quark);
88  /***************************************************************/
89 
90  /*
91  * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
92  * Make diagonal and superdiagonal elements real, storing them in
93  * D and E. if PlasmaLower, first transform lower bidiagonal form
94  * to upper bidiagonal by applying plane rotations/ Householder
95  * from the left, overwriting superdiagonal elements then make
96  * elements real of the resulting upper Bidiagonal. if PlasmaUpper
97  * then make its elements real. For Q, PT: ZSCAL should be done
98  * in case of WANTQ.
99  */
100  if (NB == 1){
101  memset(D, 0, N *sizeof(double));
102  memset(E, 0, (N-1)*sizeof(double));
103 #ifdef COMPLEX
104  if(uplo==PlasmaLower){
105  for (i=0; i<N; i++)
106  {
107  D[i] = ( *A(i, i) ); /* diag value */
108  if( i < (N-1)) { /* lower off-diag value */
109  ztmp = *A((i+1),i);
110  absztmp = fabs(ztmp);
111  *A((i+1),i) = absztmp;
112  E[i] = absztmp;
113  if(absztmp != dzero)
114  ztmp = (double) (ztmp / absztmp);
115  else
116  ztmp = zone;
117  if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
118  /* for Q: ZSCAL should be done in case of WANTQ */
119  }
120  }
121  } else { /* PlasmaUpper */
122  for (i=0; i<N; i++)
123  {
124  D[i] = ( *A(i,i) ); /* diag value*/
125  if(i<(N-1)) { /* lower off-diag value */
126  ztmp = *A(i, (i+1));
127  absztmp = fabs(ztmp);
128  *A(i,(i+1)) = absztmp;
129  E[i] = absztmp;
130  if(absztmp != dzero)
131  ztmp = (double) (ztmp / absztmp);
132  else
133  ztmp = zone;
134  if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
135  /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
136  }
137  }
138  } /* end PlasmaUpper*/
139 #else
140  if( uplo == PlasmaLower ){
141  for (i=0; i < N-1; i++) {
142  D[i] = *A(i, i);
143  E[i] = *A(i+1, i);
144  }
145  D[i] = *A(i, i);
146  } else {
147  for (i=0; i < N-1; i++) {
148  D[i] = *A(i, i );
149  E[i] = *A(i, i+1);
150  }
151  D[i] = *A(i, i);
152  }
153 #endif
154  return;
155  }
156 
157  /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */
158  if( N <= 0 ) /* this will be removed we don t need it. */
159  {
160  double *work, *TTau;
161  int info, ldwork = N*N;
162  work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble);
163  TTau = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble);
164 
165  info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
166  A(0,0), A.lm, D, E, TTau, work, ldwork);
167  plasma_shared_free(plasma, (void*) work);
168  plasma_shared_free(plasma, (void*) TTau);
169 
170  if( info == 0 )
171  sequence->status = PLASMA_SUCCESS;
172  else
173  plasma_sequence_flush(plasma->quark, sequence, request, info);
174  return;
175  }
176 
177  /* General case NB > 1 && N > NB */
178  DEP = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger );
179  MAXID = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger );
180  C = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble);
181  S = (double *) plasma_shared_alloc(plasma, N, PlasmaRealDouble);
182  memset(MAXID,0,(N+1)*sizeof(int));
183 
184  /***************************************************************************
185  * START BULGE CHASING CODE
186  **************************************************************************/
187  /*
188  * Initialisation of local parameter. those parameter should be
189  * input or tuned parameter.
190  */
191  INgrsiz = 1;
192  if( NB > 160 ) {
193  INgrsiz = 2;
194  }
195  else if( NB > 100 ) {
196  if( N < 5000 )
197  INgrsiz = 2;
198  else
199  INgrsiz = 4;
200  } else {
201  INgrsiz = 6;
202  }
203  INthgrsiz = N;
204  BAND = 0;
205 
206  grsiz = INgrsiz;
207  thgrsiz = INthgrsiz;
208  if( grsiz == 0 ) grsiz = 6;
209  if( thgrsiz == 0 ) thgrsiz = N;
210 
211  i = shift/grsiz;
212  stepercol = i*grsiz == shift ? i:i+1;
213 
214  i = (N-2)/thgrsiz;
215  thgrnb = i*thgrsiz == (N-2) ? i:i+1;
216 
217  for (thgrid = 1; thgrid<=thgrnb; thgrid++){
218  stt = (thgrid-1)*thgrsiz+1;
219  thed = min( (stt + thgrsiz -1), (N-2));
220  for (i = stt; i <= N-2; i++){
221  ed=min(i,thed);
222  if(stt>ed)break;
223  for (m = 1; m <=stepercol; m++){
224  st=stt;
225  for (j = st; j <=ed; j++){
226  /* PCOL: dependency on the ID of the master of the group of the previous column. (Previous Column:PCOL). */
227  /* ACOL: dependency on the ID of the master of the previous group of my column. (Acctual Column:ACOL). (it is 0(NULL) for myid=1) */
228  /* MCOL: OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
229  myid = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
230  mylastid = myid+grsiz-1;
231  PCOL = mylastid+shift-1; /* to know the dependent ID of the previous column. need to know the master of its group */
232  MAXID[j] = myid;
233  PCOL = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
234  grnb = PCOL/grsiz;
235  grid = grnb*grsiz == PCOL ? grnb:grnb+1;
236  PCOL = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */
237  ACOL = myid-grsiz;
238  if(myid==1)ACOL=0;
239  MCOL = myid;
240 
242  plasma->quark, &task_flags,
243  uplo, N, NB,
244  &A, C, S, i, j, m, grsiz, BAND,
245  DEP(PCOL), DEP(ACOL), DEP(MCOL) );
246 
247  if(mylastid%2 ==0){
248  blklastind = (mylastid/2)*NB+1+j-1;
249  }else{
250  colpt = ((mylastid+1)/2)*NB + 1 +j -1 ;
251  stind = colpt-NB+1;
252  edind = min(colpt,N);
253  if( (stind>=edind-1) && (edind==N) )
254  blklastind=N;
255  else
256  blklastind=0;
257  }
258  if(blklastind >= (N-1)) stt=stt+1;
259  } /* END for j=st:ed */
260  } /* END for m=1:stepercol */
261  } /* END for i=1:MINMN-2 */
262  } /* END for thgrid=1:thgrnb */
263 
264  /*
265  * Barrier used only for now, to be sure that everything
266  * is done before copying the D and E and free workspace.
267  * this will be removed later when D and E are directly filled
268  * during the bulge process.
269  */
270  QUARK_Barrier(plasma->quark);
271 
272  plasma_shared_free(plasma, (void*) DEP);
273  plasma_shared_free(plasma, (void*) MAXID);
274  plasma_shared_free(plasma, (void*) C);
275  plasma_shared_free(plasma, (void*) S);
276 
277  /*
278  * STORE THE RESULTING diagonal/off-diagonal in D AND E
279  */
280  memset(D, 0, N *sizeof(double));
281  memset(E, 0, (N-1)*sizeof(double));
282  /* Make diagonal and superdiagonal elements real,
283  * storing them in D and E
284  */
285  /* In complex case, the off diagonal element are
286  * not necessary real. we have to make off-diagonal
287  * elements real and copy them to E.
288  * When using HouseHolder elimination,
289  * the ZLARFG give us a real as output so, all the
290  * diagonal/off-diagonal element except the last one are already
291  * real and thus we need only to take the abs of the last
292  * one.
293  * */
294 #ifdef COMPLEX
295  if(uplo==PlasmaLower){
296  for (i=0; i < N-1 ; i++)
297  {
298  D[i] = ( *A(i,i) );
299  /*
300  * Alternative for Householder case, all off-diag
301  * are real except the last off-diag, where we
302  * have to take the abs
303  */
304  if(i<(N-2))
305  E[i] = (*A(i+1, i));
306  else
307  E[i] = fabs( *A(i+1, i));
308  }
309  D[i] = ( *A(i, i) );
310  } else { /* PlasmaUpper */
311  for (i=0; i<N-1; i++)
312  {
313  D[i] = ( *A(i,i) );
314  /*
315  * Alternative for Householder case, all off-diag
316  * are real except the last off-diag, where we
317  * have to take the abs
318  */
319  if( i < (N-2) )
320  E[i] = (*A(i, (i+1)));
321  else
322  E[i] = fabs(*A(i, (i+1)));
323  }
324  D[i] = ( *A(i, i) );
325  } /* end PlasmaUpper */
326 #else
327  if( uplo == PlasmaLower ){
328  for (i=0; i < N-1; i++) {
329  D[i] = *A(i, i);
330  E[i] = *A(i+1, i);
331  }
332  D[i] = *A(i, i);
333  } else {
334  for (i=0; i < N-1; i++) {
335  D[i] = *A(i, i );
336  E[i] = *A(i, i+1);
337  }
338  D[i] = *A(i, i);
339  }
340 #endif
341 
342 } /* END FUNCTION */