001: /* ///////////////////////////// P /// L /// A /// S /// M /// A /////////////////////////////// */
002: /* ///                    PLASMA auxiliary routines (version 2.1.0)                          ///
003:  * ///                    Author: Jakub Kurzak, Hatem Ltaief                                 ///
004:  * ///                    Release Date: November, 15th 2009                                  ///
005:  * ///                    PLASMA is a software package provided by Univ. of Tennessee,       ///
006:  * ///                    Univ. of California Berkeley and Univ. of Colorado Denver          /// */
007: /* ///////////////////////////////////////////////////////////////////////////////////////////// */
008: #include "common.h"
009: 
010: /* ///////////////////////////////////////////////////////////////////////////////////////////// */
011: //  Parallel tile QR factorization
/*
 * Tile accessors. A(m,n) / T(m,n) evaluate to a float* pointing at element
 * offset  bsiz*(m) + bsiz*lmt*(n)  inside the descriptor's .mat buffer, i.e.
 * consecutive m are bsiz elements apart and consecutive n are bsiz*lmt
 * elements apart.
 * Both macros pick up the identifiers A and T from the scope where they are
 * expanded, so they can only be used where descriptors with those names exist.
 * NOTE(review): presumably bsiz is the element count of one tile and lmt the
 * number of tile rows in storage -- confirm against the PLASMA_desc definition.
 */
012: #define A(m,n) &((float*)A.mat)[A.bsiz*(m)+A.bsiz*A.lmt*(n)]
013: #define T(m,n) &((float*)T.mat)[T.bsiz*(m)+T.bsiz*T.lmt*(n)]
/*
 * plasma_psgeqrf - worker routine of the parallel tile QR factorization,
 * operating on float data.
 *
 * plasma: context handle; passed to plasma_private_alloc/plasma_private_free.
 *         NOTE(review): plasma_unpack_args_2, PLASMA_RANK, PLASMA_SIZE and the
 *         ss_* helpers are defined elsewhere and presumably also read this
 *         context -- confirm in common.h.
 *
 * The routine obtains two descriptors (A and T) through plasma_unpack_args_2
 * and then walks a sequence of (k, m, n) triples. Each triple triggers exactly
 * one kernel call:
 *   n == k, m == k : CORE_sgeqrt on A(k,k) with T(k,k)
 *   n == k, m != k : CORE_stsqrt on A(k,k), A(m,k) with T(m,k)
 *   n != k, m == k : CORE_sormqr on A(k,n) using A(k,k), T(k,k)
 *   n != k, m != k : CORE_sssmqr on A(k,n), A(m,n) using A(m,k), T(m,k)
 * Every kernel call is preceded by ss_cond_wait and, except for CORE_sormqr,
 * followed by ss_cond_set on the tile it handled.
 * NOTE(review): presumably ss_cond_wait(m, n, v) blocks until the progress
 * entry of tile (m,n) equals v and ss_cond_set(m, n, v) publishes v -- confirm.
 *
 * Returns nothing. NOTE(review): the kernels presumably update the tiles of A
 * and T in place -- confirm against the CORE_* documentation.
 */
014: void plasma_psgeqrf(plasma_context_t *plasma)
015: {
016:     PLASMA_desc A;
017:     PLASMA_desc T;
018: 
         /* (k, m, n) is the triple being processed; next_* is the triple that
            follows it in this caller's sequence. */
019:     int k, m, n;
020:     int next_k;
021:     int next_m;
022:     int next_n;
023:     float *work, *tau;
024: 
025:     plasma_unpack_args_2(A, T);
         /* Scratch buffers private to this call: work is requested with size
            T.mb*T.nb, tau with size A.nb; the last argument is the descriptor's
            dtyp (presumably the element type -- confirm).
            NOTE(review): both results are used below without a NULL check;
            whether plasma_private_alloc can fail is not visible here. */
026:     work = (float *)plasma_private_alloc(plasma, T.mb*T.nb, T.dtyp);
027:     tau = (float *)plasma_private_alloc(plasma, A.nb, A.dtyp);
         /* NOTE(review): presumably creates an A.mt x A.nt progress table with
            every entry at -1; the waits on k-1 with k == 0 rely on that value. */
028:     ss_init(A.mt, A.nt, -1);
029: 
         /* Starting triple: begin at k = 0 with n = PLASMA_RANK. Whenever n lies
            past the last tile column (n >= A.nt), move on to the next k and
            re-base n so that counting restarts at column k. m starts at k. */
030:     k = 0;
031:     n = PLASMA_RANK;
032:     while (n >= A.nt) {
033:         k++;
034:         n = n-A.nt+k;
035:     }
036:     m = k;
037: 
         /* Main loop: one kernel call per iteration, until k reaches
            min(A.mt, A.nt) or n falls outside the tile columns. */
038:     while (k < min(A.mt, A.nt) && n < A.nt) {
             /* Compute the successor triple first. m advances by one; once it
                runs past the last tile row, n jumps ahead by PLASMA_SIZE (with
                the same wrap-around into the next k as above) and m restarts
                at the new k. Consequently all tiles m = k..A.mt-1 of one
                (k, n) pair are handled consecutively by the same caller. */
039:         next_n = n;
040:         next_m = m;
041:         next_k = k;
042: 
043:         next_m++;
044:         if (next_m == A.mt) {
045:             next_n += PLASMA_SIZE;
046:             while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
047:                 next_k++;
048:                 next_n = next_n-A.nt+next_k;
049:             }
050:             next_m = next_k;
051:         }
052: 
             /* In all kernel calls below the tile extents are A.nb, except for
                the last tile row / column, where the remainder A.m - i*A.nb
                resp. A.n - j*A.nb is passed instead. */
053:         if (n == k) {
                 /* Column n == k: the panel of step k. */
054:             if (m == k) {
                     /* Diagonal tile: wait until (k,k) is at step k-1, run
                        CORE_sgeqrt on it, then mark (k,k) as step k. */
055:                 ss_cond_wait(k, k, k-1);
056:                 CORE_sgeqrt(
057:                     k == A.mt-1 ? A.m-k*A.nb : A.nb,
058:                     k == A.nt-1 ? A.n-k*A.nb : A.nb,
059:                     T.mb,
060:                     A(k, k), A.nb,
061:                     T(k, k), T.mb,
062:                     tau, work);
063:                 ss_cond_set(k, k, k);
064:             }
065:             else {
                     /* Tile below the diagonal: wait until (m,k) is at step
                        k-1, run CORE_stsqrt on the pair A(k,k) / A(m,k) with
                        T(m,k), then mark (m,k) as step k. */
066:                 ss_cond_wait(m, k, k-1);
067:                 CORE_stsqrt(
068:                     m == A.mt-1 ? A.m-m*A.nb : A.nb,
069:                     k == A.nt-1 ? A.n-k*A.nb : A.nb,
070:                     T.mb,
071:                     A(k, k), A.nb,
072:                     A(m, k), A.nb,
073:                     T(m, k), T.mb,
074:                     tau, work);
075:                 ss_cond_set(m, k, k);
076:             }
077:         }
078:         else {
                 /* Column n != k: update of a trailing column for step k. */
079:             if (m == k) {
                     /* Row k: needs (k,k) at step k and (k,n) at step k-1, then
                        CORE_sormqr is applied to A(k,n) using A(k,k), T(k,k).
                        No ss_cond_set is issued in this branch.
                        NOTE(review): presumably unnecessary because the
                        following tiles of this (k, n) pair run in the same
                        caller, in program order -- confirm. */
080:                 ss_cond_wait(k, k, k);
081:                 ss_cond_wait(k, n, k-1);
082:                 CORE_sormqr(
083:                     PlasmaLeft, PlasmaTrans,
084:                     k == A.mt-1 ? A.m-k*A.nb : A.nb,
085:                     n == A.nt-1 ? A.n-n*A.nb : A.nb,
086:                     T.mb,
087:                     k == A.mt-1 ? A.m-k*A.nb : A.nb,
088:                     A(k, k), A.nb,
089:                     T(k, k), T.mb,
090:                     A(k, n), A.nb,
091:                     work, T.nb);
092:             }
093:             else {
                     /* Row m != k: needs (m,k) at step k and (m,n) at step k-1,
                        then CORE_sssmqr is applied to the pair A(k,n) / A(m,n)
                        using A(m,k), T(m,k); afterwards (m,n) is marked as
                        step k. */
094:                 ss_cond_wait(m, k, k);
095:                 ss_cond_wait(m, n, k-1);
096:                 CORE_sssmqr(
097:                     PlasmaLeft, PlasmaTrans,
098:                     A.nb,
099:                     m == A.mt-1 ? A.m-m*A.nb : A.nb,
100:                     n == A.nt-1 ? A.n-n*A.nb : A.nb,
101:                     T.mb,
102:                     A.nb,
103:                     A(k, n), A.nb,
104:                     A(m, n), A.nb,
105:                     A(m, k), A.nb,
106:                     T(m, k), T.mb,
107:                     work, T.mb);
108:                 ss_cond_set(m, n, k);
109:             }
110:         }
             /* Step to the successor triple computed at the top of the loop. */
111:         n = next_n;
112:         m = next_m;
113:         k = next_k;
114:     }
         /* Release the scratch buffers, then call ss_finalize(). */
115:     plasma_private_free(plasma, work);
116:     plasma_private_free(plasma, tau);
117:     ss_finalize();
118: }
119: