24 #define A(m, n) BLKADDR(A, PLASMA_Complex64_t, m, n)
26 static void CORE_zbarrier_thread(
const int thidx,
const int thcnt);
28 const int thidx,
const int thcnt,
31 const int pividx,
int *ipiv);
85 #define AMAX1BUF_SIZE (48 << 1)
95 sfmin = LAPACKE_dlamch_work(
'S');
104 double curamx =
cabs(localamx);
107 for (i = 1; i < thcnt; ++i) {
108 while (CORE_zamax1buf[i << 1] == -1.0) {
114 for (i = 1; i < thcnt; ++i) {
115 tmp = CORE_zamax1buf[ (i << 1) + 1];
116 if (
cabs(tmp) > curamx) {
127 for (i = 1; i < thcnt; ++i)
128 CORE_zamax1buf[ (i << 1) + 1] = curval;
130 CORE_zamax1buf[0] = -j - 2.0;
131 CORE_zamax1buf[1] = *diagvalue;
136 for (i = 1; i < thcnt; ++i)
137 CORE_zamax1buf[i << 1] = -3.0;
140 for (i = 1; i < thcnt; ++i) {
141 while (CORE_zamax1buf[i << 1] != -1.0) {
145 CORE_zamax1buf[0] = -1.0;
147 CORE_zamax1buf[(thidx << 1) + 1] = localamx;
148 CORE_zamax1buf[thidx << 1] = -2.0;
149 while (CORE_zamax1buf[0] == -1.0) {
151 while (CORE_zamax1buf[thidx << 1] != -3.0) {
153 *thwinner = -CORE_zamax1buf[0] - 2.0;
154 *diagvalue = CORE_zamax1buf[1];
155 *globalamx = CORE_zamax1buf[(thidx << 1) + 1];
156 CORE_zamax1buf[thidx << 1] = -1.0;
158 if (thidx == *thwinner)
161 while (CORE_zamax1buf[0] != -1.0) {
167 CORE_zbarrier_thread(
int thidx,
int thcnt) {
172 CORE_zamax1_thread( 1.0, thidx, thcnt, &idum1, &ddum1, &ddum2, 0, &idum2 );
177 const int column,
const int n1,
const int n2,
178 const int thidx,
const int thcnt,
179 const int ft,
const int lt)
182 int ip, j, it, i, ldft;
189 Atop =
A(0, 0) + column * ldft;
190 Atop2 = Atop + n1 * ldft;
195 int *lipiv = IPIV+column;
196 int idxMax = column+n1;
197 for (j = column; j < idxMax; ++j, ++lipiv) {
198 ip = (*lipiv) - offset - 1;
205 A(it, 0) + (column+n1)*ld + i, ld );
218 CORE_zbarrier_thread( thidx, thcnt );
221 L = Atop + column + n1;
222 tmpM = ldft - column - n1;
235 L =
A( ft, 0 ) + column * ld;
236 lm = ft == A.
mt-1 ? A.
m - ft * A.
mb : A.
mb;
242 CORE_zbarrier_thread( thidx, thcnt );
254 for( it = ft+1; it < lt; it++)
257 L =
A( it, 0 ) + column * ld;
258 lm = it == A.
mt-1 ? A.
m - it * A.
mb : A.
mb;
270 CORE_zgetrf_rectil_rec(
const PLASMA_desc A,
int *IPIV,
int *info,
272 const int thidx,
const int thcnt,
273 const int column,
const int width,
274 const int ft,
const int lt)
276 int ld, jp, n1, n2, lm, tmpM, piv_sf;
277 int ip, j, it, i, ldft;
278 int max_i, max_it, thwin;
289 Atop =
A(0, 0) + column * ldft;
292 CORE_zbarrier_thread( thidx, thcnt );
295 fprintf(stderr,
"\n ------ column = %d / width = %d -------\n", column, width);
304 fprintf(stderr,
"\n");
307 CORE_zbarrier_thread( thidx, thcnt );
315 Atop2 = Atop + n1 * ldft;
317 CORE_zgetrf_rectil_rec( A, IPIV, info, pivot,
318 thidx, thcnt, column, n1, ft, lt );
326 int *lipiv = IPIV+column;
327 int idxMax = column+n1;
328 for (j = column; j < idxMax; ++j, ++lipiv) {
329 ip = (*lipiv) - offset - 1;
336 A(it, 0) + (column+n1)*ld + i, ld );
348 CORE_zbarrier_thread( thidx, thcnt );
350 if ( pivval == 0.0 ) {
354 if (
cabs(pivval) >= sfmin ) {
356 pivval = 1.0 / pivval;
364 L = Atop + column + n1;
365 tmpM = ldft - column - n1;
372 Atop2 = L+(n1-1)*ldft;
373 for( i=0; i < tmpM; i++, Atop2++)
374 *Atop2 = *Atop2 / pivval;
389 abstmp1 =
cabs(tmp1);
396 if ( pivval == 0.0 ) {
400 if (
cabs(pivval) >= sfmin ) {
402 pivval = 1.0 / pivval;
409 L =
A( ft, 0 ) + column * ld;
410 lm = ft == A.
mt-1 ? A.
m - ft * A.
mb : A.
mb;
421 for( i=0; i < lm; i++, Atop2++)
422 *Atop2 = *Atop2 / pivval;
427 CORE_zbarrier_thread( thidx, thcnt );
439 tmp1 = L[n1*ld+max_i];
440 abstmp1 =
cabs(tmp1);
444 for( it = ft+1; it < lt; it++)
447 L =
A( it, 0 ) + column * ld;
448 lm = it == A.
mt-1 ? A.
m - it * A.
mb : A.
mb;
456 for( i=0; i < lm; i++, Atop2++)
457 *Atop2 = *Atop2 / pivval;
469 if (
cabs( L[n1*ld+jp] ) > abstmp1 ) {
471 abstmp1 =
cabs(tmp1);
477 jp = offset + max_it*A.
mb + max_i;
478 CORE_zamax1_thread( tmp1, thidx, thcnt, &thwin,
479 &tmp2, pivot, jp + 1, IPIV + column + n1 );
484 if (thwin == thidx) {
485 if ( jp-offset != column+n1 )
488 Atop2 =
A( max_it, 0 ) + (column + n1 )* ld + max_i;
493 CORE_zgetrf_rectil_rec( A, IPIV, info, pivot,
494 thidx, thcnt, column+n1, n2, ft, lt );
501 int *lipiv = IPIV+column+n1;
502 int idxMax = column+width;
503 for (j = column+n1; j < idxMax; ++j, ++lipiv) {
504 ip = (*lipiv) - offset - 1;
511 A(it, 0) + column*ld + i, ld );
516 }
else if ( width == 1 ) {
527 lm = ft == A.
mt-1 ? A.
m - ft * A.
mb : A.
mb;
531 abstmp1 =
cabs(tmp1);
534 for( it = ft+1; it < lt; it++)
537 lm = it == A.
mt-1 ? A.
m - it * A.
mb : A.
mb;
539 if (
cabs(Atop2[jp]) > abstmp1 ) {
541 abstmp1 =
cabs(tmp1);
547 jp = offset + max_it*A.
mb + max_i;
548 CORE_zamax1_thread( tmp1, thidx, thcnt, &thwin,
549 &tmp2, pivot, jp + 1, IPIV + column );
554 if (thwin == thidx) {
555 if ( jp-offset != 0 )
557 Atop2 =
A( max_it, 0 ) + max_i;
563 CORE_zbarrier_thread( thidx, thcnt );
566 if ( column == (
min(A.
m, A.
n))-1 ) {
569 if ( pivval != 0.0 ) {
571 if (
cabs(pivval) >= sfmin ) {
572 pivval = 1.0 / pivval;
578 lm = ft == A.
mt-1 ? A.
m - ft * A.
mb : A.
mb;
581 for( it = ft+1; it < lt; it++)
584 Atop2 =
A( it, 0 ) + column * ld;
585 lm = it == A.
mt-1 ? A.
m - it * A.
mb : A.
mb;
594 Atop2 = Atop + column + 1;
595 lm = ft == A.
mt-1 ? A.
m - ft * A.
mb : A.
mb;
597 for( i=0; i < lm-column-1; i++, Atop2++)
598 *Atop2 = *Atop2 / pivval;
600 for( it = ft+1; it < lt; it++)
603 Atop2 =
A( it, 0 ) + column * ld;
604 lm = it == A.
mt-1 ? A.
m - it * A.
mb : A.
mb;
606 for( i=0; i < lm; i++, Atop2++)
607 *Atop2 = *Atop2 / pivval;
613 if (
cabs(pivval) >= sfmin ) {
614 pivval = 1.0 / pivval;
616 for( it = ft; it < lt; it++)
619 Atop2 =
A( it, 0 ) + column * ld;
620 lm = it == A.
mt-1 ? A.
m - it * A.
mb : A.
mb;
629 for( it = ft; it < lt; it++)
632 Atop2 =
A( it, 0 ) + column * ld;
633 lm = it == A.
mt-1 ? A.
m - it * A.
mb : A.
mb;
635 for( i=0; i < lm; i++, Atop2++)
636 *Atop2 = *Atop2 / pivval;
649 #if defined(PLASMA_HAVE_WEAK)
650 #pragma weak CORE_zgetrf_rectil = PCORE_zgetrf_rectil
651 #define CORE_zgetrf_rectil PCORE_zgetrf_rectil
657 int thcnt =
min( info[2], A.
mt );
658 int minMN =
min( A.
m, A.
n );
666 if ( thidx >= thcnt )
669 int q = A.
mt / thcnt;
670 int r = A.
mt % thcnt;
677 ft = r * (q + 1) + (thidx - r) * q;
679 lt =
min( lt, A.
mt );
683 CORE_zgetrf_rectil_rec( A, IPIV, info, &pivot,
684 thidx, thcnt, 0, minMN, ft, lt);
687 CORE_zgetrf_rectil_update( A, IPIV,
714 sizeof(
int), &iinfo,
VALUE,
715 sizeof(
int), &nbthread,
VALUE,
722 #if defined(PLASMA_HAVE_WEAK)
723 #pragma weak CORE_zgetrf_rectil_quark = PCORE_zgetrf_rectil_quark
724 #define CORE_zgetrf_rectil_quark PCORE_zgetrf_rectil_quark
740 check_info, iinfo, maxthreads );
743 info[2] = maxthreads;