PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
core_zgetrf_rectil.c
Go to the documentation of this file.
1 
19 #include <math.h>
20 #include <cblas.h>
21 #include <lapacke.h>
22 #include "common.h"
23 
24 #define A(m, n) BLKADDR(A, PLASMA_Complex64_t, m, n)
25 
26 static void CORE_zbarrier_thread(const int thidx, const int thcnt);
27 static void CORE_zamax1_thread(const PLASMA_Complex64_t localamx,
28  const int thidx, const int thcnt,
29  int *thwinner, PLASMA_Complex64_t *diagvalue,
30  PLASMA_Complex64_t *globalamx,
31  const int pividx, int *ipiv);
32 
33 /***************************************************************************/
85 #define AMAX1BUF_SIZE (48 << 1)
86 
87 /* 48 threads should be enough for everybody */
88 static volatile PLASMA_Complex64_t CORE_zamax1buf[AMAX1BUF_SIZE];
89 static double sfmin;
90 
91 void
93  int i;
94  for (i = 0; i < AMAX1BUF_SIZE; ++i) CORE_zamax1buf[i] = -1.0;
95  sfmin = LAPACKE_dlamch_work('S');
96 }
97 
98 static void
99 CORE_zamax1_thread(PLASMA_Complex64_t localamx, int thidx, int thcnt, int *thwinner,
100  PLASMA_Complex64_t *diagvalue, PLASMA_Complex64_t *globalamx, int pividx, int *ipiv) {
101  if (thidx == 0) {
102  int i, j = 0;
103  PLASMA_Complex64_t curval = localamx, tmp;
104  double curamx = cabs(localamx);
105 
106  /* make sure everybody filled in their value */
107  for (i = 1; i < thcnt; ++i) {
108  while (CORE_zamax1buf[i << 1] == -1.0) { /* wait for thread i to store its value */
109  }
110  }
111 
112  /* better not fuse the loop above and below to make sure data is sync'd */
113 
114  for (i = 1; i < thcnt; ++i) {
115  tmp = CORE_zamax1buf[ (i << 1) + 1];
116  if (cabs(tmp) > curamx) {
117  curamx = cabs(tmp);
118  curval = tmp;
119  j = i;
120  }
121  }
122 
123  if (0 == j)
124  ipiv[0] = pividx;
125 
126  /* make sure everybody knows the amax value */
127  for (i = 1; i < thcnt; ++i)
128  CORE_zamax1buf[ (i << 1) + 1] = curval;
129 
130  CORE_zamax1buf[0] = -j - 2.0; /* set the index of the winning thread */
131  CORE_zamax1buf[1] = *diagvalue; /* set the index of the winning thread */
132 
133  *thwinner = j;
134  *globalamx = curval;
135 
136  for (i = 1; i < thcnt; ++i)
137  CORE_zamax1buf[i << 1] = -3.0;
138 
139  /* make sure everybody read the max value */
140  for (i = 1; i < thcnt; ++i) {
141  while (CORE_zamax1buf[i << 1] != -1.0) {
142  }
143  }
144 
145  CORE_zamax1buf[0] = -1.0;
146  } else {
147  CORE_zamax1buf[(thidx << 1) + 1] = localamx;
148  CORE_zamax1buf[thidx << 1] = -2.0; /* announce to thread 0 that local amax was stored */
149  while (CORE_zamax1buf[0] == -1.0) { /* wait for thread 0 to finish calculating the global amax */
150  }
151  while (CORE_zamax1buf[thidx << 1] != -3.0) { /* wait for thread 0 to store amax */
152  }
153  *thwinner = -CORE_zamax1buf[0] - 2.0;
154  *diagvalue = CORE_zamax1buf[1];
155  *globalamx = CORE_zamax1buf[(thidx << 1) + 1]; /* read the amax from the location adjacent to the one in the above loop */
156  CORE_zamax1buf[thidx << 1] = -1.0; /* signal thread 0 that this thread is done reading */
157 
158  if (thidx == *thwinner)
159  ipiv[0] = pividx;
160 
161  while (CORE_zamax1buf[0] != -1.0) { /* wait for thread 0 to finish */
162  }
163  }
164 }
165 
166 static void
167 CORE_zbarrier_thread(int thidx, int thcnt) {
168  int idum1, idum2;
169  PLASMA_Complex64_t ddum1 = 0.;
170  PLASMA_Complex64_t ddum2 = 0.;
171  /* it's probably faster to implement a dedicated barrier */
172  CORE_zamax1_thread( 1.0, thidx, thcnt, &idum1, &ddum1, &ddum2, 0, &idum2 );
173 }
174 
175 static void
176 CORE_zgetrf_rectil_update(const PLASMA_desc A, int *IPIV,
177  const int column, const int n1, const int n2,
178  const int thidx, const int thcnt,
179  const int ft, const int lt)
180 {
181  int ld, lm, tmpM;
182  int ip, j, it, i, ldft;
183  PLASMA_Complex64_t zone = 1.0;
184  PLASMA_Complex64_t mzone = -1.0;
185  PLASMA_Complex64_t *Atop, *Atop2, *U, *L;
186  int offset = A.i;
187 
188  ldft = BLKLDD(A, 0);
189  Atop = A(0, 0) + column * ldft;
190  Atop2 = Atop + n1 * ldft;
191 
192  if (thidx == 0)
193  {
194  /* Swap to the right */
195  int *lipiv = IPIV+column;
196  int idxMax = column+n1;
197  for (j = column; j < idxMax; ++j, ++lipiv) {
198  ip = (*lipiv) - offset - 1;
199  if ( ip != j )
200  {
201  it = ip / A.mb;
202  i = ip % A.mb;
203  ld = BLKLDD(A, it);
204  cblas_zswap(n2, Atop2 + j, ldft,
205  A(it, 0) + (column+n1)*ld + i, ld );
206  }
207  }
208 
209  /* Trsm on the upper part */
210  U = Atop2 + column;
213  n1, n2, CBLAS_SADDR(zone),
214  Atop + column, ldft,
215  U, ldft );
216 
217  /* Signal to other threads that they can start update */
218  CORE_zbarrier_thread( thidx, thcnt );
219 
220  /* First tile */
221  L = Atop + column + n1;
222  tmpM = ldft - column - n1;
223 
224  /* Apply the GEMM */
226  tmpM, n2, n1,
227  CBLAS_SADDR(mzone), L, ldft,
228  U, ldft,
229  CBLAS_SADDR(zone), U + n1, ldft );
230 
231  }
232  else
233  {
234  ld = BLKLDD( A, ft );
235  L = A( ft, 0 ) + column * ld;
236  lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;
237 
238  U = Atop2 + column;
239 
240  /* Wait for pivoting and triangular solve to be finished
241  * before to really start the update */
242  CORE_zbarrier_thread( thidx, thcnt );
243 
244  /* First tile */
245  /* Apply the GEMM */
247  lm, n2, n1,
248  CBLAS_SADDR(mzone), L, ld,
249  U, ldft,
250  CBLAS_SADDR(zone), L + n1*ld, ld );
251  }
252 
253  /* Update the other blocks */
254  for( it = ft+1; it < lt; it++)
255  {
256  ld = BLKLDD( A, it );
257  L = A( it, 0 ) + column * ld;
258  lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
259 
260  /* Apply the GEMM */
262  lm, n2, n1,
263  CBLAS_SADDR(mzone), L, ld,
264  U, ldft,
265  CBLAS_SADDR(zone), L + n1*ld, ld );
266  }
267 }
268 
269 static void
270 CORE_zgetrf_rectil_rec(const PLASMA_desc A, int *IPIV, int *info,
271  PLASMA_Complex64_t *pivot,
272  const int thidx, const int thcnt,
273  const int column, const int width,
274  const int ft, const int lt)
275 {
276  int ld, jp, n1, n2, lm, tmpM, piv_sf;
277  int ip, j, it, i, ldft;
278  int max_i, max_it, thwin;
279  PLASMA_Complex64_t zone = 1.0;
280  PLASMA_Complex64_t mzone = -1.0;
281  PLASMA_Complex64_t tmp1;
282  PLASMA_Complex64_t tmp2;
283  PLASMA_Complex64_t pivval;
284  PLASMA_Complex64_t *Atop, *Atop2, *U, *L;
285  double abstmp1;
286  int offset = A.i;
287 
288  ldft = BLKLDD(A, 0);
289  Atop = A(0, 0) + column * ldft;
290 
291 #if 0
292  CORE_zbarrier_thread( thidx, thcnt );
293  if (thidx == 0)
294  {
295  fprintf(stderr, "\n ------ column = %d / width = %d -------\n", column, width);
296  int i, j;
297  for(j=0;j<4;j++){
298  for(i=0;i<4;i++){
299  fprintf(stderr, "%e ", ((PLASMA_Complex64_t*)A(0, 0))[j*4+i]);
300  }
301  for(i=0;i<4;i++){
302  fprintf(stderr, "%e ", ((PLASMA_Complex64_t*)A(1, 0))[j*4+i]);
303  }
304  fprintf(stderr, "\n");
305  }
306  }
307  CORE_zbarrier_thread( thidx, thcnt );
308 #endif
309 
310  if ( width > 1 ) {
311  /* Assumption: N = min( M, N ); */
312  n1 = width / 2;
313  n2 = width - n1;
314 
315  Atop2 = Atop + n1 * ldft;
316 
317  CORE_zgetrf_rectil_rec( A, IPIV, info, pivot,
318  thidx, thcnt, column, n1, ft, lt );
319 
320  if ( *info != 0 )
321  return;
322 
323  if (thidx == 0)
324  {
325  /* Swap to the right */
326  int *lipiv = IPIV+column;
327  int idxMax = column+n1;
328  for (j = column; j < idxMax; ++j, ++lipiv) {
329  ip = (*lipiv) - offset - 1;
330  if ( ip != j )
331  {
332  it = ip / A.mb;
333  i = ip % A.mb;
334  ld = BLKLDD(A, it);
335  cblas_zswap(n2, Atop2 + j, ldft,
336  A(it, 0) + (column+n1)*ld + i, ld );
337  }
338  }
339  /* Trsm on the upper part */
340  U = Atop2 + column;
343  n1, n2, CBLAS_SADDR(zone),
344  Atop + column, ldft,
345  U, ldft );
346 
347  /* SIgnal to other threads that they can start update */
348  CORE_zbarrier_thread( thidx, thcnt );
349  pivval = *pivot;
350  if ( pivval == 0.0 ) {
351  *info = column+n1;
352  return;
353  } else {
354  if ( cabs(pivval) >= sfmin ) {
355  piv_sf = 1;
356  pivval = 1.0 / pivval;
357  } else {
358  piv_sf = 0;
359  }
360  }
361 
362  /* First tile */
363  {
364  L = Atop + column + n1;
365  tmpM = ldft - column - n1;
366 
367  /* Scale last column of L */
368  if ( piv_sf ) {
369  cblas_zscal( tmpM, CBLAS_SADDR(pivval), L+(n1-1)*ldft, 1 );
370  } else {
371  int i;
372  Atop2 = L+(n1-1)*ldft;
373  for( i=0; i < tmpM; i++, Atop2++)
374  *Atop2 = *Atop2 / pivval;
375  }
376 
377  /* Apply the GEMM */
379  tmpM, n2, n1,
380  CBLAS_SADDR(mzone), L, ldft,
381  U, ldft,
382  CBLAS_SADDR(zone), U + n1, ldft );
383 
384  /* Search Max in first column of U+n1 */
385  tmp2 = U[n1];
386  max_it = ft;
387  max_i = cblas_izamax( tmpM, U+n1, 1 ) + n1;
388  tmp1 = U[max_i];
389  abstmp1 = cabs(tmp1);
390  max_i += column;
391  }
392  }
393  else
394  {
395  pivval = *pivot;
396  if ( pivval == 0.0 ) {
397  *info = column+n1;
398  return;
399  } else {
400  if ( cabs(pivval) >= sfmin ) {
401  piv_sf = 1;
402  pivval = 1.0 / pivval;
403  } else {
404  piv_sf = 0;
405  }
406  }
407 
408  ld = BLKLDD( A, ft );
409  L = A( ft, 0 ) + column * ld;
410  lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;
411 
412  U = Atop2 + column;
413 
414  /* First tile */
415  /* Scale last column of L */
416  if ( piv_sf ) {
417  cblas_zscal( lm, CBLAS_SADDR(pivval), L+(n1-1)*ld, 1 );
418  } else {
419  int i;
420  Atop2 = L+(n1-1)*ld;
421  for( i=0; i < lm; i++, Atop2++)
422  *Atop2 = *Atop2 / pivval;
423  }
424 
425  /* Wait for pivoting and triangular solve to be finished
426  * before to really start the update */
427  CORE_zbarrier_thread( thidx, thcnt );
428 
429  /* Apply the GEMM */
431  lm, n2, n1,
432  CBLAS_SADDR(mzone), L, ld,
433  U, ldft,
434  CBLAS_SADDR(zone), L + n1*ld, ld );
435 
436  /* Search Max in first column of L+n1*ld */
437  max_it = ft;
438  max_i = cblas_izamax( lm, L+n1*ld, 1 );
439  tmp1 = L[n1*ld+max_i];
440  abstmp1 = cabs(tmp1);
441  }
442 
443  /* Update the other blocks */
444  for( it = ft+1; it < lt; it++)
445  {
446  ld = BLKLDD( A, it );
447  L = A( it, 0 ) + column * ld;
448  lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
449 
450  /* Scale last column of L */
451  if ( piv_sf ) {
452  cblas_zscal( lm, CBLAS_SADDR(pivval), L+(n1-1)*ld, 1 );
453  } else {
454  int i;
455  Atop2 = L+(n1-1)*ld;
456  for( i=0; i < lm; i++, Atop2++)
457  *Atop2 = *Atop2 / pivval;
458  }
459 
460  /* Apply the GEMM */
462  lm, n2, n1,
463  CBLAS_SADDR(mzone), L, ld,
464  U, ldft,
465  CBLAS_SADDR(zone), L + n1*ld, ld );
466 
467  /* Search the max on the first column of L+n1*ld */
468  jp = cblas_izamax( lm, L+n1*ld, 1 );
469  if ( cabs( L[n1*ld+jp] ) > abstmp1 ) {
470  tmp1 = L[n1*ld+jp];
471  abstmp1 = cabs(tmp1);
472  max_i = jp;
473  max_it = it;
474  }
475  }
476 
477  jp = offset + max_it*A.mb + max_i;
478  CORE_zamax1_thread( tmp1, thidx, thcnt, &thwin,
479  &tmp2, pivot, jp + 1, IPIV + column + n1 );
480 
481  if ( thidx == 0 ) {
482  U[n1] = *pivot; /* all threads have the pivot element: no need for synchronization */
483  }
484  if (thwin == thidx) { /* the thread that owns the best pivot */
485  if ( jp-offset != column+n1 ) /* if there is a need to exchange the pivot */
486  {
487  ld = BLKLDD(A, max_it);
488  Atop2 = A( max_it, 0 ) + (column + n1 )* ld + max_i;
489  *Atop2 = tmp2;
490  }
491  }
492 
493  CORE_zgetrf_rectil_rec( A, IPIV, info, pivot,
494  thidx, thcnt, column+n1, n2, ft, lt );
495  if ( *info != 0 )
496  return;
497 
498  if ( thidx == 0 )
499  {
500  /* Swap to the left */
501  int *lipiv = IPIV+column+n1;
502  int idxMax = column+width;
503  for (j = column+n1; j < idxMax; ++j, ++lipiv) {
504  ip = (*lipiv) - offset - 1;
505  if ( ip != j )
506  {
507  it = ip / A.mb;
508  i = ip % A.mb;
509  ld = BLKLDD(A, it);
510  cblas_zswap(n1, Atop + j, ldft,
511  A(it, 0) + column*ld + i, ld );
512  }
513  }
514  }
515 
516  } else if ( width == 1 ) {
517 
518  /* Search maximum for column 0 */
519  if ( column == 0 )
520  {
521  if ( thidx == 0 )
522  tmp2 = Atop[column];
523 
524  /* First tmp1 */
525  ld = BLKLDD(A, ft);
526  Atop2 = A( ft, 0 );
527  lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;
528  max_it = ft;
529  max_i = cblas_izamax( lm, Atop2, 1 );
530  tmp1 = Atop2[max_i];
531  abstmp1 = cabs(tmp1);
532 
533  /* Update */
534  for( it = ft+1; it < lt; it++)
535  {
536  Atop2= A( it, 0 );
537  lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
538  jp = cblas_izamax( lm, Atop2, 1 );
539  if ( cabs(Atop2[jp]) > abstmp1 ) {
540  tmp1 = Atop2[jp];
541  abstmp1 = cabs(tmp1);
542  max_i = jp;
543  max_it = it;
544  }
545  }
546 
547  jp = offset + max_it*A.mb + max_i;
548  CORE_zamax1_thread( tmp1, thidx, thcnt, &thwin,
549  &tmp2, pivot, jp + 1, IPIV + column );
550 
551  if ( thidx == 0 ) {
552  Atop[0] = *pivot; /* all threads have the pivot element: no need for synchronization */
553  }
554  if (thwin == thidx) { /* the thread that owns the best pivot */
555  if ( jp-offset != 0 ) /* if there is a need to exchange the pivot */
556  {
557  Atop2 = A( max_it, 0 ) + max_i;
558  *Atop2 = tmp2;
559  }
560  }
561  }
562 
563  CORE_zbarrier_thread( thidx, thcnt );
564 
565  /* If it is the last column, we just scale */
566  if ( column == (min(A.m, A.n))-1 ) {
567 
568  pivval = *pivot;
569  if ( pivval != 0.0 ) {
570  if ( thidx == 0 ) {
571  if ( cabs(pivval) >= sfmin ) {
572  pivval = 1.0 / pivval;
573 
574  /*
575  * We guess than we never enter the function with m == A.mt-1
576  * because it means that there is only one thread
577  */
578  lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;
579  cblas_zscal( lm - column - 1, CBLAS_SADDR(pivval), Atop+column+1, 1 );
580 
581  for( it = ft+1; it < lt; it++)
582  {
583  ld = BLKLDD(A, it);
584  Atop2 = A( it, 0 ) + column * ld;
585  lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
586  cblas_zscal( lm, CBLAS_SADDR(pivval), Atop2, 1 );
587  }
588  } else {
589  /*
590  * We guess than we never enter the function with m == A.mt-1
591  * because it means that there is only one thread
592  */
593  int i;
594  Atop2 = Atop + column + 1;
595  lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;
596 
597  for( i=0; i < lm-column-1; i++, Atop2++)
598  *Atop2 = *Atop2 / pivval;
599 
600  for( it = ft+1; it < lt; it++)
601  {
602  ld = BLKLDD(A, it);
603  Atop2 = A( it, 0 ) + column * ld;
604  lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
605 
606  for( i=0; i < lm; i++, Atop2++)
607  *Atop2 = *Atop2 / pivval;
608  }
609  }
610  }
611  else
612  {
613  if ( cabs(pivval) >= sfmin ) {
614  pivval = 1.0 / pivval;
615 
616  for( it = ft; it < lt; it++)
617  {
618  ld = BLKLDD(A, it);
619  Atop2 = A( it, 0 ) + column * ld;
620  lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
621  cblas_zscal( lm, CBLAS_SADDR(pivval), Atop2, 1 );
622  }
623  } else {
624  /*
625  * We guess than we never enter the function with m == A.mt-1
626  * because it means that there is only one thread
627  */
628  int i;
629  for( it = ft; it < lt; it++)
630  {
631  ld = BLKLDD(A, it);
632  Atop2 = A( it, 0 ) + column * ld;
633  lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
634 
635  for( i=0; i < lm; i++, Atop2++)
636  *Atop2 = *Atop2 / pivval;
637  }
638  }
639  }
640  }
641  else {
642  *info = column + 1;
643  return;
644  }
645  }
646  }
647 }
648 
649 #if defined(PLASMA_HAVE_WEAK)
650 #pragma weak CORE_zgetrf_rectil = PCORE_zgetrf_rectil
651 #define CORE_zgetrf_rectil PCORE_zgetrf_rectil
652 #endif
653 int CORE_zgetrf_rectil(const PLASMA_desc A, int *IPIV, int *info)
654 {
655  int ft, lt;
656  int thidx = info[1];
657  int thcnt = min( info[2], A.mt );
658  int minMN = min( A.m, A.n );
659  PLASMA_Complex64_t pivot;
660 
661  if ( A.nt > 1 ) {
662  coreblas_error(1, "Illegal value of A.nt");
663  return -1;
664  }
665 
666  if ( thidx >= thcnt )
667  return 0;
668 
669  int q = A.mt / thcnt;
670  int r = A.mt % thcnt;
671 
672  if (thidx < r) {
673  q++;
674  ft = thidx * q;
675  lt = ft + q;
676  } else {
677  ft = r * (q + 1) + (thidx - r) * q;
678  lt = ft + q;
679  lt = min( lt, A.mt );
680  }
681 
682  info[0] = 0;
683  CORE_zgetrf_rectil_rec( A, IPIV, info, &pivot,
684  thidx, thcnt, 0, minMN, ft, lt);
685 
686  if ( A.n > minMN ) {
687  CORE_zgetrf_rectil_update( A, IPIV,
688  0, minMN, A.n-minMN,
689  thidx, thcnt,
690  ft, lt);
691  }
692 
693  return info[0];
694 }
695 
696 /***************************************************************************/
700  PLASMA_desc A, PLASMA_Complex64_t *Amn, int size,
701  int *IPIV,
702  PLASMA_sequence *sequence, PLASMA_request *request,
703  PLASMA_bool check_info, int iinfo,
704  int nbthread)
705 {
707  QUARK_Insert_Task(quark, CORE_zgetrf_rectil_quark, task_flags,
708  sizeof(PLASMA_desc), &A, VALUE,
709  sizeof(PLASMA_Complex64_t)*size, Amn, INOUT,
710  sizeof(int)*A.n, IPIV, OUTPUT,
711  sizeof(PLASMA_sequence*), &sequence, VALUE,
712  sizeof(PLASMA_request*), &request, VALUE,
713  sizeof(PLASMA_bool), &check_info, VALUE,
714  sizeof(int), &iinfo, VALUE,
715  sizeof(int), &nbthread, VALUE,
716  0);
717 }
718 
719 /***************************************************************************/
722 #if defined(PLASMA_HAVE_WEAK)
723 #pragma weak CORE_zgetrf_rectil_quark = PCORE_zgetrf_rectil_quark
724 #define CORE_zgetrf_rectil_quark PCORE_zgetrf_rectil_quark
725 #endif
727 {
728  PLASMA_desc A;
729  PLASMA_Complex64_t *Amn;
730  int *IPIV;
731  PLASMA_sequence *sequence;
732  PLASMA_request *request;
733  PLASMA_bool check_info;
734  int iinfo;
735 
736  int info[3];
737  int maxthreads;
738 
739  quark_unpack_args_8(quark, A, Amn, IPIV, sequence, request,
740  check_info, iinfo, maxthreads );
741 
742  info[1] = QUARK_Get_RankInTask(quark);
743  info[2] = maxthreads;
744 
745  CORE_zgetrf_rectil( A, IPIV, info );
746  if (info[1] == 0 && info[0] != PLASMA_SUCCESS && check_info)
747  plasma_sequence_flush(quark, sequence, request, iinfo + info[0] );
748 }