MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
slaex3_m.cpp File Reference
#include "common_magma.h"
#include <cblas.h>


Macros

#define Q(ix, iy)   (q + (ix) + ldq * (iy))
 
#define dQ2(id)   (dwork[id])
 
#define dS(id, ii)   (dwork[id] + n2*n2_loc +(ii)* (n2*nb))
 
#define dQ(id, ii)   (dwork[id] + n2*n2_loc + 2 * (n2*nb) +(ii)* (n2_loc*nb))
 

Functions

magma_int_t magma_get_slaex3_m_k ()
 
magma_int_t magma_get_slaex3_m_nb ()
 
void magma_svrange (magma_int_t k, float *d, magma_int_t *il, magma_int_t *iu, float vl, float vu)
 
void magma_sirange (magma_int_t k, magma_int_t *indxq, magma_int_t *iil, magma_int_t *iiu, magma_int_t il, magma_int_t iu)
 
magma_int_t magma_slaex3_m (magma_int_t nrgpu, magma_int_t k, magma_int_t n, magma_int_t n1, float *d, float *q, magma_int_t ldq, float rho, float *dlamda, float *q2, magma_int_t *indx, magma_int_t *ctot, float *w, float *s, magma_int_t *indxq, float **dwork, magma_queue_t stream[MagmaMaxGPUs][2], char range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *info)
 

Macro Definition Documentation

#define dQ(id, ii)    (dwork[id] + n2*n2_loc + 2 * (n2*nb) +(ii)* (n2_loc*nb))

Definition at line 23 of file slaex3_m.cpp.

#define dQ2(id)    (dwork[id])

Definition at line 21 of file slaex3_m.cpp.

#define dS(id, ii)    (dwork[id] + n2*n2_loc +(ii)* (n2*nb))

Definition at line 22 of file slaex3_m.cpp.

#define Q(ix, iy)    (q + (ix) + ldq * (iy))

Definition at line 19 of file slaex3_m.cpp.
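
These four macros carve up memory: Q indexes the host eigenvector matrix q with leading dimension ldq, while dQ2, dS and dQ address disjoint pieces of each per-GPU workspace dwork[id] — one Q2 block followed by two S panels and two Q panels, double buffered so the next panel can be uploaded while the current GEMM runs (see lines 545-627 of the listing below). The sketch below is only an illustration of the resulting offsets; n2, n2_loc and nb are hypothetical values standing in for the variables of the same name in magma_slaex3_m, and the expressions are the ones the macros expand to.

    /* Offset sketch for one per-GPU buffer; sizes are hypothetical, the
       expressions match what dQ2/dS/dQ expand to in slaex3_m.cpp.          */
    #include <stdio.h>

    int main(void)
    {
        long n2 = 4096, n2_loc = 2048, nb = 1024;        /* assumed sizes           */

        long off_q2 = 0;                                 /* dQ2(id)                 */
        long off_s0 = n2*n2_loc;                         /* dS(id,0)                */
        long off_s1 = n2*n2_loc + n2*nb;                 /* dS(id,1)                */
        long off_q0 = n2*n2_loc + 2*(n2*nb);             /* dQ(id,0)                */
        long off_q1 = n2*n2_loc + 2*(n2*nb) + n2_loc*nb; /* dQ(id,1)                */
        long total  = off_q1 + n2_loc*nb;                /* floats one buffer holds */

        printf("Q2@%ld  S@%ld/%ld  Q@%ld/%ld  total %ld floats\n",
               off_q2, off_s0, off_s1, off_q0, off_q1, total);
        return 0;
    }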

Function Documentation

magma_int_t magma_get_slaex3_m_k ( )

Definition at line 26 of file slaex3_m.cpp.

26 { return 512; }

magma_int_t magma_get_slaex3_m_nb ( )

Definition at line 27 of file slaex3_m.cpp.

27 { return 1024; }
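
The two routines above are tuning hooks: magma_get_slaex3_m_k() is the crossover size of the trailing problem below which the back-transformation stays on the CPU (lines 253 and 509 of the listing), and magma_get_slaex3_m_nb() is the column block size used to pipeline panels to the GPUs (lines 251 and 545). The fragment below is a minimal stand-alone sketch of that dispatch, with stand-in functions and hypothetical problem sizes; it is not MAGMA code.

    #include <stdio.h>

    static int crossover(void)  { return 512;  }   /* stand-in for magma_get_slaex3_m_k()  */
    static int block_size(void) { return 1024; }   /* stand-in for magma_get_slaex3_m_nb() */

    int main(void)
    {
        int n1 = 2000, rk = 3000;                  /* hypothetical problem sizes           */
        int nb = block_size();

        if (n1 < crossover()) {
            puts("small trailing problem: multiply on the CPU with BLAS");
        } else {
            for (int i = 0; i < rk; i += nb) {     /* panel loop, as at line 545           */
                int ib = (rk - i < nb) ? rk - i : nb;
                printf("stream a panel of %d columns to the GPUs\n", ib);
            }
        }
        return 0;
    }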

void magma_sirange ( magma_int_t  k,
magma_int_t *  indxq,
magma_int_t *  iil,
magma_int_t *  iiu,
magma_int_t  il,
magma_int_t  iu 
)

Definition at line 42 of file slaex3.cpp.

43  {
44  magma_int_t i;
45 
46  *iil = 1;
47  *iiu = 0;
48  for (i = il; i<=iu; ++i)
49  if (indxq[i-1]<=k){
50  *iil = indxq[i-1];
51  break;
52  }
53  for (i = iu; i>=il; --i)
54  if (indxq[i-1]<=k){
55  *iiu = indxq[i-1];
56  break;
57  }
58  return;
59  }
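
A self-contained replay of the loop above with made-up data: with k = 3 non-deflated eigenvalues and the 1-based permutation indxq below, asking for eigenvalues il = 2 through iu = 5 of the merged problem gives the bounds iil = 1, iiu = 3 of the non-deflated eigenvectors.

    #include <stdio.h>

    int main(void)
    {
        int k = 3;
        int indxq[] = {4, 1, 2, 3, 5};   /* hypothetical 1-based permutation */
        int il = 2, iu = 5;
        int iil = 1, iiu = 0, i;

        for (i = il; i <= iu; ++i)              /* first requested index that is not deflated */
            if (indxq[i-1] <= k) { iil = indxq[i-1]; break; }
        for (i = iu; i >= il; --i)              /* last requested index that is not deflated  */
            if (indxq[i-1] <= k) { iiu = indxq[i-1]; break; }

        printf("iil = %d, iiu = %d\n", iil, iiu);   /* prints iil = 1, iiu = 3 */
        return 0;
    }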

magma_int_t magma_slaex3_m ( magma_int_t  nrgpu,
magma_int_t  k,
magma_int_t  n,
magma_int_t  n1,
float *  d,
float *  q,
magma_int_t  ldq,
float  rho,
float *  dlamda,
float *  q2,
magma_int_t *  indx,
magma_int_t *  ctot,
float *  w,
float *  s,
magma_int_t *  indxq,
float **  dwork,
magma_queue_t  stream[MagmaMaxGPUs][2],
char  range,
float  vl,
float  vu,
magma_int_t  il,
magma_int_t  iu,
magma_int_t *  info 
)

Definition at line 35 of file slaex3_m.cpp.

References __func__, blasf77_scopy(), blasf77_sgemm(), cblas_snrm2(), cpu_gpu_sdiff(), dQ, dQ2, dS, get_current_time(), GetTimerValue(), lapackf77_lsame, lapackf77_slacpy(), lapackf77_slaed4, lapackf77_slamc3, lapackf77_slamrg, lapackf77_slaset(), MAGMA_ERR_ILLEGAL_VALUE, magma_free_pinned, magma_get_slaex3_m_k(), magma_get_slaex3_m_nb(), magma_queue_sync, magma_setdevice(), magma_sgemm(), magma_sgetmatrix, magma_sirange(), magma_slaex3(), magma_smalloc_pinned(), magma_ssetmatrix_async, MAGMA_SUCCESS, magma_svrange(), magma_xerbla(), magmablasSetKernelStream(), MagmaMaxGPUs, MagmaNoTrans, max, min, and Q.

43 {
44 /*
45  Purpose
46  =======
47  SLAEX3 finds the roots of the secular equation, as defined by the
48  values in D, W, and RHO, between 1 and K. It makes the
49  appropriate calls to SLAED4 and then updates the eigenvectors by
50  multiplying the matrix of eigenvectors of the pair of eigensystems
51  being combined by the matrix of eigenvectors of the K-by-K system
52  which is solved here.
53 
54  It is used in the last step when only a part of the eigenvectors
55  is required.
 56  It computes only the required part of the eigenvectors and the rest
57  is not used.
58 
59  This code makes very mild assumptions about floating point
60  arithmetic. It will work on machines with a guard digit in
61  add/subtract, or on those binary machines without guard digits
62  which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
63  It could conceivably fail on hexadecimal or decimal machines
64  without guard digits, but we know of none.
65 
66  Arguments
67  =========
68  K (input) INTEGER
69  The number of terms in the rational function to be solved by
70  SLAED4. K >= 0.
71 
72  N (input) INTEGER
73  The number of rows and columns in the Q matrix.
74  N >= K (deflation may result in N>K).
75 
76  N1 (input) INTEGER
77  The location of the last eigenvalue in the leading submatrix.
78  min(1,N) <= N1 <= N/2.
79 
80  D (output) DOUBLE PRECISION array, dimension (N)
81  D(I) contains the updated eigenvalues for
82  1 <= I <= K.
83 
84  Q (output) DOUBLE PRECISION array, dimension (LDQ,N)
85  Initially the first K columns are used as workspace.
86  On output the columns ??? to ??? contain
87  the updated eigenvectors.
88 
89  LDQ (input) INTEGER
90  The leading dimension of the array Q. LDQ >= max(1,N).
91 
92  RHO (input) DOUBLE PRECISION
93  The value of the parameter in the rank one update equation.
94  RHO >= 0 required.
95 
96  DLAMDA (input/output) DOUBLE PRECISION array, dimension (K)
97  The first K elements of this array contain the old roots
98  of the deflated updating problem. These are the poles
99  of the secular equation. May be changed on output by
100  having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
101  Cray-2, or Cray C-90, as described above.
102 
103  Q2 (input) DOUBLE PRECISION array, dimension (LDQ2, N)
104  The first K columns of this matrix contain the non-deflated
105  eigenvectors for the split problem.
106 
107  INDX (input) INTEGER array, dimension (N)
108  The permutation used to arrange the columns of the deflated
109  Q matrix into three groups (see SLAED2).
110  The rows of the eigenvectors found by SLAED4 must be likewise
111  permuted before the matrix multiply can take place.
112 
113  CTOT (input) INTEGER array, dimension (4)
114  A count of the total number of the various types of columns
115  in Q, as described in INDX. The fourth column type is any
116  column which has been deflated.
117 
118  W (input/output) DOUBLE PRECISION array, dimension (K)
119  The first K elements of this array contain the components
120  of the deflation-adjusted updating vector. Destroyed on
121  output.
122 
123  S (workspace) DOUBLE PRECISION array, dimension (N1 + 1)*K
124  Will contain the eigenvectors of the repaired matrix which
125  will be multiplied by the previously accumulated eigenvectors
126  to update the system.
127 
128  INDXQ (output) INTEGER array, dimension (N)
129  On exit, the permutation which will reintegrate the
130  subproblems back into sorted order,
131  i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.
132 
 133  DWORK (device workspaces) DOUBLE PRECISION array of arrays,
134  dimension NRGPU.
135  if NRGPU = 1 the dimension of the first workspace
136  should be (3*N*N/2+3*N)
137  otherwise the NRGPU workspaces should have the size
138  ceil((N-N1) * (N-N1) / floor(nrgpu/2)) +
139  NB * ((N-N1) + (N-N1) / floor(nrgpu/2))
140 
141  STREAM (device stream) magma_queue_t array,
142  dimension (MagmaMaxGPUs,2)
143 
144  INFO (output) INTEGER
145  = 0: successful exit.
146  < 0: if INFO = -i, the i-th argument had an illegal value.
147  > 0: if INFO = 1, an eigenvalue did not converge
148 
149  Further Details
150  ===============
151  Based on contributions by
152  Jeff Rutter, Computer Science Division, University of California
153  at Berkeley, USA
154  Modified by Francoise Tisseur, University of Tennessee.
155 
156  ===================================================================== */
157 
158  if (nrgpu==1){
159  magma_setdevice(0);
160  magma_slaex3(k, n, n1, d, q, ldq, rho,
161  dlamda, q2, indx, ctot, w, s, indxq,
162  *dwork, range, vl, vu, il, iu, info );
163  return MAGMA_SUCCESS;
164  }
165  float d_one = 1.;
166  float d_zero = 0.;
167  magma_int_t ione = 1;
168  magma_int_t ineg_one = -1;
169  char range_[] = {range, 0};
170 
171  magma_int_t iil, iiu, rk;
172  magma_int_t n1_loc, n2_loc, ib, nb, ib2, igpu;
173  magma_int_t ni_loc[MagmaMaxGPUs];
174 
175  magma_int_t i,ind,iq2,j,n12,n2,n23,tmp,lq2;
176  float temp;
177  magma_int_t alleig, valeig, indeig;
178 
179  alleig = lapackf77_lsame(range_, "A");
180  valeig = lapackf77_lsame(range_, "V");
181  indeig = lapackf77_lsame(range_, "I");
182 
183  *info = 0;
184 
185  if(k < 0)
186  *info=-1;
187  else if(n < k)
188  *info=-2;
189  else if(ldq < max(1,n))
190  *info=-6;
191  else if (! (alleig || valeig || indeig))
192  *info = -15;
193  else {
194  if (valeig) {
195  if (n > 0 && vu <= vl)
196  *info = -17;
197  }
198  else if (indeig) {
199  if (il < 1 || il > max(1,n))
200  *info = -18;
201  else if (iu < min(n,il) || iu > n)
202  *info = -19;
203  }
204  }
205 
206 
207  if(*info != 0){
 208  magma_xerbla(__func__, -(*info));
 209  return MAGMA_ERR_ILLEGAL_VALUE;
 210  }
211 
212  // Quick return if possible
213  if(k == 0)
214  return MAGMA_SUCCESS;
215  /*
216  Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
217  be computed with high relative accuracy (barring over/underflow).
218  This is a problem on machines without a guard digit in
219  add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
220  The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
221  which on any of these machines zeros out the bottommost
222  bit of DLAMDA(I) if it is 1; this makes the subsequent
223  subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
224  occurs. On binary machines with a guard digit (almost all
225  machines) it does not change DLAMDA(I) at all. On hexadecimal
226  and decimal machines with a guard digit, it slightly
227  changes the bottommost bits of DLAMDA(I). It does not account
228  for hexadecimal or decimal machines without guard digits
229  (we know of none). We use a subroutine call to compute
230  2*DLAMBDA(I) to prevent optimizing compilers from eliminating
231  this code.*/
232 
233 //#define CHECK_CPU
234 #ifdef CHECK_CPU
235  float *hwS[2][MagmaMaxGPUs], *hwQ[2][MagmaMaxGPUs], *hwQ2[MagmaMaxGPUs];
236  #define hQ2(id) (hwQ2[id])
237  #define hS(id, ii) (hwS[ii][id])
238  #define hQ(id, ii) (hwQ[ii][id])
239 #endif
240  n2 = n - n1;
241 
242  n12 = ctot[0] + ctot[1];
243  n23 = ctot[1] + ctot[2];
244 
245  iq2 = n1 * n12;
246  lq2 = iq2 + n2 * n23;
247 
248  n1_loc = (n1-1) / (nrgpu/2) + 1;
249  n2_loc = (n2-1) / (nrgpu/2) + 1;
250 
251  nb = magma_get_slaex3_m_nb();
252 
253  if (n1 >= magma_get_slaex3_m_k()){
254 #ifdef CHECK_CPU
255  for (igpu = 0; igpu < nrgpu; ++igpu){
256  magma_smalloc_pinned( &(hwS[0][igpu]), n2*nb );
257  magma_smalloc_pinned( &(hwS[1][igpu]), n2*nb );
258  magma_smalloc_pinned( &(hwQ2[igpu]), n2*n2_loc );
259  magma_smalloc_pinned( &(hwQ[0][igpu]), n2_loc*nb );
260  magma_smalloc_pinned( &(hwQ[1][igpu]), n2_loc*nb );
261  }
262 #endif
263  for (igpu = 0; igpu < nrgpu-1; igpu += 2){
264  ni_loc[igpu] = min(n1_loc, n1 - igpu/2 * n1_loc);
265 #ifdef CHECK_CPU
266  lapackf77_slacpy("A", &ni_loc[igpu], &n12, q2+n1_loc*(igpu/2), &n1, hQ2(igpu), &n1_loc);
267 #endif
268  magma_setdevice(igpu);
269  magma_ssetmatrix_async( ni_loc[igpu], n12,
270  q2+n1_loc*(igpu/2), n1,
271  dQ2(igpu), n1_loc, stream[igpu][0] );
272  ni_loc[igpu+1] = min(n2_loc, n2 - igpu/2 * n2_loc);
273 #ifdef CHECK_CPU
274  lapackf77_slacpy("A", &ni_loc[igpu+1], &n23, q2+iq2+n2_loc*(igpu/2), &n2, hQ2(igpu+1), &n2_loc);
275 #endif
276  magma_setdevice(igpu+1);
277  magma_ssetmatrix_async( ni_loc[igpu+1], n23,
278  q2+iq2+n2_loc*(igpu/2), n2,
279  dQ2(igpu+1), n2_loc, stream[igpu+1][0] );
280  }
281  }
282 
283  //
284 
285 #ifdef _OPENMP
286  //openmp implementation
289 #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
290  magma_timestr_t start, end;
291  start = get_current_time();
292 #endif
293 
294 #pragma omp parallel private(i, j, tmp, temp)
295  {
296  magma_int_t id = omp_get_thread_num();
297  magma_int_t tot = omp_get_num_threads();
298 
299  magma_int_t ib = ( id * k) / tot; //start index of local loop
300  magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
301  magma_int_t ik = ie - ib; //number of local indices
302 
303  for(i = ib; i < ie; ++i)
304  dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];
305 
306  for(j = ib; j < ie; ++j){
307  magma_int_t tmpp=j+1;
308  magma_int_t iinfo = 0;
309  lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
310  // If the zero finder fails, the computation is terminated.
311  if(iinfo != 0){
312 #pragma omp critical (info)
313  *info=iinfo;
314  break;
315  }
316  }
317 
318 #pragma omp barrier
319 
320  if(*info == 0){
321 
322 #pragma omp single
323  {
324  //Prepare the INDXQ sorting permutation.
325  magma_int_t nk = n - k;
326  lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq);
327 
328  //compute the lower and upper bound of the non-deflated eigenvectors
329  if (valeig)
330  magma_svrange(k, d, &iil, &iiu, vl, vu);
331  else if (indeig)
332  magma_sirange(k, indxq, &iil, &iiu, il, iu);
333  else {
334  iil = 1;
335  iiu = k;
336  }
337  rk = iiu - iil + 1;
338  }
339 
340  if (k == 2){
341 #pragma omp single
342  {
343  for(j = 0; j < k; ++j){
344  w[0] = *Q(0,j);
345  w[1] = *Q(1,j);
346 
347  i = indx[0] - 1;
348  *Q(0,j) = w[i];
349  i = indx[1] - 1;
350  *Q(1,j) = w[i];
351  }
352  }
353 
354  }
355  else if(k != 1){
356 
357  // Compute updated W.
358  blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione);
359 
360  // Initialize W(I) = Q(I,I)
361  tmp = ldq + 1;
362  blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);
363 
364  for(j = 0; j < k; ++j){
365  magma_int_t i_tmp = min(j, ie);
366  for(i = ib; i < i_tmp; ++i)
367  w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
368  i_tmp = max(j+1, ib);
369  for(i = i_tmp; i < ie; ++i)
370  w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
371  }
372 
373  for(i = ib; i < ie; ++i)
374  w[i] = copysign( sqrt( -w[i] ), s[i]);
375 
376 #pragma omp barrier
377 
378  //reduce the number of used threads to have enough S workspace
379  tot = min(n1, omp_get_num_threads());
380 
381  if(id < tot){
382  ib = ( id * rk) / tot + iil - 1;
383  ie = ((id+1) * rk) / tot + iil - 1;
384  ik = ie - ib;
385  }
386  else{
387  ib = -1;
388  ie = -1;
389  ik = -1;
390  }
391 
392  // Compute eigenvectors of the modified rank-1 modification.
393  for(j = ib; j < ie; ++j){
394  for(i = 0; i < k; ++i)
395  s[id*k + i] = w[i] / *Q(i,j);
396  temp = cblas_snrm2( k, s+id*k, 1);
397  for(i = 0; i < k; ++i){
398  magma_int_t iii = indx[i] - 1;
399  *Q(i,j) = s[id*k + iii] / temp;
400  }
401  }
402  }
403  }
404  }
405  if (*info != 0)
406  return MAGMA_SUCCESS; //??????
407 
408 #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
409  end = get_current_time();
410  printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.);
411 #endif
412 
413 #else
414  // Non openmp implementation
417 #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
418  magma_timestr_t start, end;
419  start = get_current_time();
420 #endif
421 
422  for(i = 0; i < k; ++i)
423  dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];
424 
425  for(j = 0; j < k; ++j){
426  magma_int_t tmpp=j+1;
427  magma_int_t iinfo = 0;
428  lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
429  // If the zero finder fails, the computation is terminated.
430  if(iinfo != 0)
431  *info=iinfo;
432  }
433  if(*info != 0)
434  return MAGMA_SUCCESS;
435 
436  //Prepare the INDXQ sorting permutation.
437  magma_int_t nk = n - k;
438  lapackf77_slamrg( &k, &nk, d, &ione , &ineg_one, indxq);
439 
440  //compute the lower and upper bound of the non-deflated eigenvectors
441  if (valeig)
442  magma_svrange(k, d, &iil, &iiu, vl, vu);
443  else if (indeig)
444  magma_sirange(k, indxq, &iil, &iiu, il, iu);
445  else {
446  iil = 1;
447  iiu = k;
448  }
449  rk = iiu - iil + 1;
450 
451  if (k == 2){
452 
453  for(j = 0; j < k; ++j){
454  w[0] = *Q(0,j);
455  w[1] = *Q(1,j);
456 
457  i = indx[0] - 1;
458  *Q(0,j) = w[i];
459  i = indx[1] - 1;
460  *Q(1,j) = w[i];
461  }
462 
463  }
464  else if(k != 1){
465 
466  // Compute updated W.
467  blasf77_scopy( &k, w, &ione, s, &ione);
468 
469  // Initialize W(I) = Q(I,I)
470  tmp = ldq + 1;
471  blasf77_scopy( &k, q, &tmp, w, &ione);
472 
473  for(j = 0; j < k; ++j){
474  for(i = 0; i < j; ++i)
475  w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
476  for(i = j+1; i < k; ++i)
477  w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
478  }
479 
480  for(i = 0; i < k; ++i)
481  w[i] = copysign( sqrt( -w[i] ), s[i]);
482 
483  // Compute eigenvectors of the modified rank-1 modification.
484  for(j = iil-1; j < iiu; ++j){
485  for(i = 0; i < k; ++i)
486  s[i] = w[i] / *Q(i,j);
487  temp = cblas_snrm2( k, s, 1);
488  for(i = 0; i < k; ++i){
489  magma_int_t iii = indx[i] - 1;
490  *Q(i,j) = s[iii] / temp;
491  }
492  }
493  }
494 
495 #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
496  end = get_current_time();
497  printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.);
498 #endif
499 
500 #endif //_OPENMP
501 
502  // Compute the updated eigenvectors.
503 
504 #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
505  start = get_current_time();
506 #endif
507 
508  if(rk > 0){
509  if (n1 < magma_get_slaex3_m_k()){
510  // stay on the CPU
511  if( n23 != 0 ){
512  lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
513  blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &q2[iq2], &n2,
514  s, &n23, &d_zero, Q(n1,iil-1), &ldq );
515  }
516  else
517  lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);
518 
519  if( n12 != 0 ) {
520  lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
521  blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, q2, &n1,
522  s, &n12, &d_zero, Q(0,iil-1), &ldq);
523  }
524  else
525  lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
526  }
527  else {
528  //use the gpus
529  ib = min(nb, rk);
530  for (igpu = 0; igpu < nrgpu-1; igpu += 2){
531  if (n23 != 0) {
532  magma_setdevice(igpu+1);
533  magma_ssetmatrix_async( n23, ib,
534  Q(ctot[0],iil-1), ldq,
535  dS(igpu+1,0), n23, stream[igpu+1][0] );
536  }
537  if (n12 != 0) {
538  magma_setdevice(igpu);
539  magma_ssetmatrix_async( n12, ib,
540  Q(0,iil-1), ldq,
541  dS(igpu,0), n12, stream[igpu][0] );
542  }
543  }
544 
545  for (i = 0; i<rk; i+=nb){
546  ib = min(nb, rk - i);
547  ind = (i/nb)%2;
548  if (i+nb<rk){
549  ib2 = min(nb, rk - i - nb);
550  for (igpu = 0; igpu < nrgpu-1; igpu += 2){
551  if (n23 != 0) {
552  magma_setdevice(igpu+1);
553  magma_ssetmatrix_async( n23, ib2,
554  Q(ctot[0],iil-1+i+nb), ldq,
555  dS(igpu+1,(ind+1)%2), n23, stream[igpu+1][(ind+1)%2] );
556  }
557  if (n12 != 0) {
558  magma_setdevice(igpu);
559  magma_ssetmatrix_async( n12, ib2,
560  Q(0,iil-1+i+nb), ldq,
561  dS(igpu,(ind+1)%2), n12, stream[igpu][(ind+1)%2] );
562  }
563  }
564  }
565 
566  // Ensure that the data is copied on gpu since we will overwrite it.
567  for (igpu = 0; igpu < nrgpu-1; igpu += 2){
568  if (n23 != 0) {
569 #ifdef CHECK_CPU
570  lapackf77_slacpy("A", &n23, &ib, Q(ctot[0],iil-1+i), &ldq, hS(igpu+1,ind), &n23);
571 #endif
572  magma_setdevice(igpu+1);
573  magma_queue_sync( stream[igpu+1][ind] );
574  }
575  if (n12 != 0) {
576 #ifdef CHECK_CPU
577  lapackf77_slacpy("A", &n12, &ib, Q(0,iil-1+i), &ldq, hS(igpu,ind), &n12);
578 #endif
579  magma_setdevice(igpu);
580  magma_queue_sync( stream[igpu][ind] );
581  }
582 
583  }
584  for (igpu = 0; igpu < nrgpu-1; igpu += 2){
585  if (n23 != 0) {
586 #ifdef CHECK_CPU
587  blasf77_sgemm("N", "N", &ni_loc[igpu+1], &ib, &n23, &d_one, hQ2(igpu+1), &n2_loc,
588  hS(igpu+1,ind), &n23, &d_zero, hQ(igpu+1, ind), &n2_loc);
589 #endif
590  magma_setdevice(igpu+1);
591  magmablasSetKernelStream(stream[igpu+1][ind]);
592  magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu+1], ib, n23, d_one, dQ2(igpu+1), n2_loc,
593  dS(igpu+1, ind), n23, d_zero, dQ(igpu+1, ind), n2_loc);
594 #ifdef CHECK_CPU
595  printf("norm Q %d: %f\n", igpu+1, cpu_gpu_sdiff(ni_loc[igpu+1], ib, hQ(igpu+1, ind), n2_loc, dQ(igpu+1, ind), n2_loc));
596 #endif
597  }
598  if (n12 != 0) {
599 #ifdef CHECK_CPU
600  blasf77_sgemm("N", "N", &ni_loc[igpu], &ib, &n12, &d_one, hQ2(igpu), &n1_loc,
601  hS(igpu,ind%2), &n12, &d_zero, hQ(igpu, ind%2), &n1_loc);
602 #endif
603  magma_setdevice(igpu);
604  magmablasSetKernelStream(stream[igpu][ind]);
605  magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu], ib, n12, d_one, dQ2(igpu), n1_loc,
606  dS(igpu, ind), n12, d_zero, dQ(igpu, ind), n1_loc);
607 #ifdef CHECK_CPU
608  printf("norm Q %d: %f\n", igpu, cpu_gpu_sdiff(ni_loc[igpu], ib, hQ(igpu, ind), n1_loc, dQ(igpu, ind), n1_loc));
609 #endif
610  }
611  }
612  for (igpu = 0; igpu < nrgpu-1; igpu += 2){
613  if (n23 != 0) {
614  magma_setdevice(igpu+1);
615  magma_sgetmatrix( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
616  Q(n1+n2_loc*(igpu/2),iil-1+i), ldq );
617 // magma_sgetmatrix_async( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
618 // Q(n1+n2_loc*(igpu/2),iil-1+i), ldq, stream[igpu+1][ind] );
619  }
620  if (n12 != 0) {
621  magma_setdevice(igpu);
622  magma_sgetmatrix( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
623  Q(n1_loc*(igpu/2),iil-1+i), ldq );
624 // magma_sgetmatrix_async( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
625 // Q(n1_loc*(igpu/2),iil-1+i), ldq, stream[igpu][ind] );
626  }
627  }
628  }
629  for (igpu = 0; igpu < nrgpu; ++igpu){
630 #ifdef CHECK_CPU
631  magma_free_pinned( hwS[1][igpu] );
632  magma_free_pinned( hwS[0][igpu] );
633  magma_free_pinned( hwQ2[igpu] );
634  magma_free_pinned( hwQ[1][igpu] );
635  magma_free_pinned( hwQ[0][igpu] );
636 #endif
637  magma_setdevice(igpu);
639  magma_queue_sync( stream[igpu][0] );
640  magma_queue_sync( stream[igpu][1] );
641  }
642  if( n23 == 0 )
643  lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);
644 
645  if( n12 == 0 )
646  lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
647  }
648  }
649 #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
650  end = get_current_time();
651  printf("gemms = %6.2f\n", GetTimerValue(start,end)/1000.);
652 #endif
653 
654  return MAGMA_SUCCESS;
655 } /*magma_slaed3_m*/
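
The DWORK requirement quoted in the Arguments section above is only given as a formula. The helper below is a hypothetical convenience (not part of MAGMA) that evaluates that formula for given N, N1, NRGPU and NB, falling back to the documented single-GPU size 3*N*N/2 + 3*N when NRGPU = 1.

    #include <stdio.h>

    /* Hypothetical helper: number of floats each of the NRGPU device
       workspaces must hold, per the Arguments section of magma_slaex3_m.   */
    static long slaex3_m_dwork_size(long n, long n1, long nrgpu, long nb)
    {
        if (nrgpu == 1)
            return 3*n*n/2 + 3*n;

        long n2   = n - n1;
        long half = nrgpu / 2;                     /* floor(nrgpu/2)                      */
        long q2   = (n2*n2 + half - 1) / half;     /* ceil((N-N1)*(N-N1) / floor(nrgpu/2))*/
        long s    = nb * (n2 + n2/half);           /* NB*((N-N1) + (N-N1)/floor(nrgpu/2)) */
        return q2 + s;
    }

    int main(void)
    {
        /* example: N = 8192, N1 = 4096, 4 GPUs, NB = 1024 */
        printf("%ld floats per GPU\n", slaex3_m_dwork_size(8192, 4096, 4, 1024));
        return 0;
    }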

void magma_svrange ( magma_int_t  k,
float *  d,
magma_int_t *  il,
magma_int_t *  iu,
float  vl,
float  vu 
)

Definition at line 25 of file slaex3.cpp.

26  {
27  magma_int_t i;
28 
29  *il=1;
30  *iu=k;
31  for (i = 0; i < k; ++i){
32  if (d[i] > vu){
33  *iu = i;
34  break;
35  }
36  else if (d[i] < vl)
37  ++*il;
38  }
39  return;
40  }
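
Another made-up replay, this time of the value-range scan above: entries below vl advance the lower bound and the first entry above vu cuts the upper bound, so for the ascending values in d[] below and vl = 1.0, vu = 3.0 the 1-based bounds come out as il = 2, iu = 3.

    #include <stdio.h>

    int main(void)
    {
        float d[] = {0.5f, 1.5f, 2.5f, 3.5f};    /* hypothetical ascending eigenvalues */
        int   k = 4, il = 1, iu = k, i;
        float vl = 1.0f, vu = 3.0f;

        for (i = 0; i < k; ++i) {
            if (d[i] > vu) { iu = i; break; }    /* first value above vu ends the range   */
            else if (d[i] < vl) ++il;            /* values below vl raise the lower bound */
        }
        printf("il = %d, iu = %d\n", il, iu);    /* prints il = 2, iu = 3 */
        return 0;
    }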