01: /* ///////////////////////////// P /// L /// A /// S /// M /// A /////////////////////////////// */
02: /* ///                    PLASMA auxiliary routines (version 2.1.0)                          ///
03:  * ///                    Author: Hatem Ltaief, Jakub Kurzak                                 ///
04:  * ///                    Release Date: November, 15th 2009                                  ///
05:  * ///                    PLASMA is a software package provided by Univ. of Tennessee,       ///
06:  * ///                    Univ. of California Berkeley and Univ. of Colorado Denver          /// */
07: /* ///////////////////////////////////////////////////////////////////////////////////////////// */
08: #include "common.h"
09: 
10: /* ///////////////////////////////////////////////////////////////////////////////////////////// */
11: //  Parallel application of Q using tile V - QR factorization
/*
 * Tile-address helpers for the descriptors named A, B and T in the enclosing
 * function.  Each yields a float pointer to element
 *     bsiz*(m) + bsiz*lmt*(n)
 * of the descriptor's `mat` storage, viewed as an array of float.
 * Presumably this is the first element of tile (m, n), with `bsiz` the number
 * of elements per tile and `lmt` the number of tile rows -- confirm against
 * the PLASMA_desc definition.
 *
 * The whole expansion is parenthesized so the result behaves as a single
 * primary expression: postfix operators applied to it (e.g. A(m,n)[i]) act on
 * the resulting pointer instead of binding inside the address-of.
 */
#define A(m,n) (&((float*)A.mat)[A.bsiz*(m)+A.bsiz*A.lmt*(n)])
#define B(m,n) (&((float*)B.mat)[B.bsiz*(m)+B.bsiz*B.lmt*(n)])
#define T(m,n) (&((float*)T.mat)[T.bsiz*(m)+T.bsiz*T.lmt*(n)])
15: void plasma_psormqr(plasma_context_t *plasma)
16: {
17:     PLASMA_desc A;
18:     PLASMA_desc B;
19:     PLASMA_desc T;
20: 
21:     int k, m, n;
22:     int next_k;
23:     int next_m;
24:     int next_n;
25:     float *work;
26: 
27:     plasma_unpack_args_3(A, B, T);
28:     work = (float *)plasma_private_alloc(plasma, T.mb*T.nb, T.dtyp);
29:     ss_init(B.mt, B.nt, -1);
30: 
31:     k = 0;
32:     n = PLASMA_RANK;
33:     while (n >= B.nt) {
34:         k++;
35:         n = n-B.nt;
36:     }
37:     m = k;
38: 
39:     while (k < min(A.mt, A.nt) && n < B.nt) {
40:         next_n = n;
41:         next_m = m;
42:         next_k = k;
43: 
44:         next_m++;
45:         if (next_m == A.mt) {
46:             next_n += PLASMA_SIZE;
47:             while (next_n >= B.nt && next_k < min(A.mt, A.nt)) {
48:                 next_k++;
49:                 next_n = next_n-B.nt;
50:             }
51:             next_m = next_k;
52:         }
53: 
54:         if (m == k) {
55:             ss_cond_wait(k, n, k-1);
56:             CORE_sormqr(
57:                 PlasmaLeft, PlasmaTrans,
58:                 k == A.mt-1 ? A.m-k*A.nb : A.nb,
59:                 n == B.nt-1 ? B.n-n*B.nb : B.nb,
60:                 T.mb,
61:                 k == min(A.mt, A.nt)-1 ? min(A.m, A.n)-k*A.nb : A.nb,
62:                 A(k, k), A.nb,
63:                 T(k, k), T.mb,
64:                 B(k, n), B.nb,
65:                 work, T.nb);
66:             ss_cond_set(k, n, k);
67:         }
68:         else {
69:             ss_cond_wait(m, n, k-1);
70:             CORE_sssmqr(
71:                 PlasmaLeft, PlasmaTrans,
72:                 A.nb,
73:                 m == A.mt-1 ? A.m-m*A.nb : A.nb,
74:                 n == B.nt-1 ? B.n-n*B.nb : B.nb,
75:                 T.mb,
76:                 k == A.nt-1 ? A.n-k*A.nb : A.nb,
77:                 B(k, n), B.nb,
78:                 B(m, n), B.nb,
79:                 A(m, k), A.nb,
80:                 T(m, k), T.mb,
81:                 work, T.mb);
82:             ss_cond_set(m, n, k);
83:         }
84:         n = next_n;
85:         m = next_m;
86:         k = next_k;
87:     }
88:     plasma_private_free(plasma, work);
89:     ss_finalize();
90: }
91: