PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
pcunmlq.c
Go to the documentation of this file.
1 
18 #include "common.h"
19 
/* Tile-address helpers: each macro expands to BLKADDR applied to the descriptor
 * of the same name (A, B or T), with element type PLASMA_Complex32_t and the
 * tile indices (m, n). They rely on locals named A, B and T being in scope. */
20 #define A(m,n) BLKADDR(A, PLASMA_Complex32_t, m, n)
21 #define B(m,n) BLKADDR(B, PLASMA_Complex32_t, m, n)
22 #define T(m,n) BLKADDR(T, PLASMA_Complex32_t, m, n)
23 /***************************************************************************/
/*
 * Worker that applies CORE_cunmlq / CORE_ctsmlq tile kernels to B using the
 * tiles of A and T, with work distributed by PLASMA_RANK / PLASMA_SIZE.
 *
 * NOTE(review): the function signature and the declarations of `side` and
 * `trans` are absent from this listing (the embedded numbering jumps from 23
 * to 27 and from 27 to 30); consult the original file for them.
 *
 * Visible behavior:
 *  - unpacks side, trans, A, B, T, sequence and request;
 *  - returns immediately when sequence->status is not PLASMA_SUCCESS;
 *  - returns without touching B unless side == PlasmaLeft and
 *    trans == PlasmaConjTrans;
 *  - otherwise walks a sequence of (k, m, n) tile triples, calling
 *    CORE_cunmlq when m == k and CORE_ctsmlq otherwise, with
 *    ss_cond_wait / ss_cond_set bracketing the tile updates.
 */
27 {
30  PLASMA_desc A;
31  PLASMA_desc B;
32  PLASMA_desc T;
33  PLASMA_sequence *sequence;
34  PLASMA_request *request;
35 
36  int k, m, n;
/* Triple that will be processed on the following loop iteration. */
37  int next_k;
38  int next_m;
39  int next_n;
40  int ldak, ldbk, ldbm;
41  int tempmm, tempnn, tempkm, tempkmin;
42  int minMT, minM;
43  int ib = PLASMA_IB;
44  PLASMA_Complex32_t *work;
45 
46  plasma_unpack_args_7(side, trans, A, B, T, sequence, request);
47  if (sequence->status != PLASMA_SUCCESS)
48  return;
49 
/* Only the PlasmaLeft / PlasmaConjTrans combination continues past the two
 * checks below. NOTE(review): listing lines 51 and 55 (inside the branches)
 * are missing from this extract, so what happens before each return cannot be
 * seen here. */
50  if (side != PlasmaLeft) {
52  return;
53  }
54  if (trans != PlasmaConjTrans) {
56  return;
57  }
58 
/* Scratch buffer sized ib*T.nb; released with plasma_private_free at the end.
 * NOTE(review): the allocation result is not checked before use. */
59  work = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
60  ss_init(B.mt, B.nt, min(A.mt, A.nt));
61 
/* minM is the smaller of A.m and A.n; minMT is the tile count that goes with
 * it (A.nt when A.m > A.n, A.mt otherwise). */
62  if (A.m > A.n) {
63  minM = A.n;
64  minMT = A.nt;
65  } else {
66  minM = A.m;
67  minMT = A.mt;
68  }
69 
/* Starting triple: k begins at the last step, n at PLASMA_RANK. Each time n
 * overshoots the B.nt tile columns it is wrapped back by B.nt and k moves one
 * step down; k may therefore already be negative, in which case the main loop
 * does not run. */
70  k = minMT-1;
71  n = PLASMA_RANK;
72  while (n >= B.nt) {
73  k--;
74  n = n-B.nt;
75  }
76  m = B.mt-1;
77 
78  while (k >= 0 && n < B.nt) {
/* Compute the successor triple first: m counts down from B.mt-1 to k; once it
 * would pass below k, n advances by PLASMA_SIZE (wrapping by B.nt and
 * decrementing k on every wrap) and m restarts at B.mt-1. */
79  next_n = n;
80  next_m = m;
81  next_k = k;
82 
83  next_m--;
84  if (next_m == k-1) {
85  next_n += PLASMA_SIZE;
86  while (next_n >= B.nt && next_k >= 0) {
87  next_k--;
88  next_n = next_n-B.nt;
89  }
90  next_m = B.mt-1;
91  }
92 
/* Tile extents: the last tile in a dimension receives the remainder, every
 * other tile the full block size. */
93  tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
94  tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb;
95  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
96  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
97 
98  ldak = BLKLDD(A, k);
99  ldbk = BLKLDD(B, k);
100  ldbm = BLKLDD(B, m);
101 
/* m == k: single-tile kernel on B(k, n) with A(k, k) and T(k, k), then record
 * value k for tile (k, n). */
102  if (m == k) {
103  CORE_cunmlq(
104  side, trans,
105  tempkm, tempnn, tempkmin, ib,
106  A(k, k), ldak,
107  T(k, k), T.mb,
108  B(k, n), ldbk,
109  work, T.nb);
110  ss_cond_set(k, n, k);
111  }
112  else {
/* m != k: presumably blocks until tile (m, n) carries the value k+1 (confirm
 * against the ss_cond_wait definition), then runs the two-tile kernel on
 * B(k, n) and B(m, n) with A(k, m) and T(k, m) and records value k for
 * tile (m, n). */
113  ss_cond_wait(m, n, k+1);
114  CORE_ctsmlq(
115  side, trans,
116  A.mb, tempnn, tempmm, tempnn, tempkmin, ib,
117  B(k, n), ldbk,
118  B(m, n), ldbm,
119  A(k, m), ldak,
120  T(k, m), T.mb,
121  work, ib);
122  ss_cond_set(m, n, k);
123  }
124  m = next_m;
125  n = next_n;
126  k = next_k;
127  }
128  plasma_private_free(plasma, work);
129  ss_finalize();
130 }
131 
132 /***************************************************************************/
/*
 * Task-submitting variant: loops over tile indices of B and, for each tile,
 * issues one call that receives plasma->quark and &task_flags as its first
 * arguments.
 *
 * NOTE(review): this listing is missing the start of the signature (only the
 * sequence/request parameters are visible), the declarations at listing lines
 * 139-140, and the line naming the called routine at the head of every call
 * (listing lines 173, 186, 212, 224, 249, 262, 283, 296). Only the argument
 * lists of those calls are visible below.
 *
 * All four side / trans combinations are handled:
 *  - PlasmaLeft  / PlasmaNoTrans:   k ascending,  single-tile call then m > k;
 *  - PlasmaLeft  / otherwise:       k descending, m > k calls then single-tile;
 *  - other side  / PlasmaNoTrans:   k descending, n > k calls then single-tile;
 *  - other side  / otherwise:       k ascending,  single-tile call then n > k.
 */
137  PLASMA_sequence *sequence, PLASMA_request *request)
138 {
141 
142  int k, m, n;
143  int ldak, ldbk, ldbm;
144  int tempmm, tempnn, tempkn, tempkm, tempkmin;
145  int ib, minMT, minM;
146 
147  plasma = plasma_context_self();
/* Nothing is submitted when the sequence status is not PLASMA_SUCCESS. */
148  if (sequence->status != PLASMA_SUCCESS)
149  return;
150  QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
151 
152  ib = PLASMA_IB;
/* minM is the smaller of A.m and A.n; minMT is the tile count that goes with
 * it (A.nt when A.m > A.n, A.mt otherwise). */
153  if (A.m > A.n) {
154  minM = A.n;
155  minMT = A.nt;
156  } else {
157  minM = A.m;
158  minMT = A.mt;
159  }
160 
161  if (side == PlasmaLeft ) {
162  if (trans == PlasmaNoTrans) {
163  /*
164  * PlasmaLeft / PlasmaNoTrans
165  */
166  for (k = 0; k < minMT; k++) {
/* Last tile in a dimension gets the remainder, others the full block size. */
167  tempkm = k == B.mt -1 ? B.m -k*B.mb : B.mb;
168  tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
169  ldak = BLKLDD(A, k);
170  ldbk = BLKLDD(B, k);
/* One call per tile column n of block row k, using A(k, k) and T(k, k). */
171  for (n = 0; n < B.nt; n++) {
172  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
174  plasma->quark, &task_flags,
175  side, trans,
176  tempkm, tempnn, tempkmin, ib, T.nb,
177  A(k, k), ldak,
178  T(k, k), T.mb,
179  B(k, n), ldbk);
180  }
/* Then pair block row k with every block row m below it. */
181  for (m = k+1; m < B.mt; m++) {
182  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
183  ldbm = BLKLDD(B, m);
184  for (n = 0; n < B.nt; n++) {
185  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
187  plasma->quark, &task_flags,
188  side, trans,
189  B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb,
190  B(k, n), ldbk,
191  B(m, n), ldbm,
192  A(k, m), ldak,
193  T(k, m), T.mb);
194  }
195  }
196  }
197  }
198  else {
199  /*
200  * PlasmaLeft / PlasmaConjTrans
201  */
/* Reverse of the branch above: k descends, the paired-row calls (m from the
 * last block row down to k+1) come first and the single-tile call last. */
202  for (k = minMT-1; k >= 0; k--) {
203  tempkm = k == B.mt -1 ? B.m -k*B.mb : B.mb;
204  tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
205  ldak = BLKLDD(A, k);
206  ldbk = BLKLDD(B, k);
207  for (m = B.mt-1; m > k; m--) {
208  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
209  ldbm = BLKLDD(B, m);
210  for (n = 0; n < B.nt; n++) {
211  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
213  plasma->quark, &task_flags,
214  side, trans,
215  B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb,
216  B(k, n), ldbk,
217  B(m, n), ldbm,
218  A(k, m), ldak,
219  T(k, m), T.mb);
220  }
221  }
222  for (n = 0; n < B.nt; n++) {
223  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
225  plasma->quark, &task_flags,
226  side, trans,
227  tempkm, tempnn, tempkmin, ib, T.nb,
228  A(k, k), ldak,
229  T(k, k), T.mb,
230  B(k, n), ldbk);
231  }
232  }
233  }
234  }
235  else {
236  if (trans == PlasmaNoTrans) {
237  /*
238  * PlasmaRight / PlasmaNoTrans
239  */
/* Column-oriented counterpart: k descends, block column k is paired with each
 * block column n > k (from the last one down), then handled on its own. */
240  for (k = minMT-1; k >= 0; k--) {
241  tempkn = k == B.nt -1 ? B.n -k*B.nb : B.nb;
242  tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
243  ldak = BLKLDD(A, k);
244  for (n = B.nt-1; n > k; n--) {
245  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
246  for (m = 0; m < B.mt; m++) {
247  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
248  ldbm = BLKLDD(B, m);
250  plasma->quark, &task_flags,
251  side, trans,
252  tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb,
253  B(m, k), ldbm,
254  B(m, n), ldbm,
255  A(k, n), ldak,
256  T(k, n), T.mb);
257  }
258  }
259  for (m = 0; m < B.mt; m++) {
260  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
261  ldbm = BLKLDD(B, m);
263  plasma->quark, &task_flags,
264  side, trans,
265  tempmm, tempkn, tempkmin, ib, T.nb,
266  A(k, k), ldak,
267  T(k, k), T.mb,
268  B(m, k), ldbm);
269  }
270  }
271  }
272  else {
273  /*
274  * PlasmaRight / PlasmaConjTrans
275  */
/* k ascends: single-column call on block column k first, then the pairing of
 * block column k with every block column n > k. */
276  for (k = 0; k < minMT; k++) {
277  tempkn = k == B.nt -1 ? B.n -k*B.nb : B.nb;
/* NOTE(review): this branch computes tempkmin from A.mb, whereas the other
 * three branches use A.nb — confirm whether the difference is intended. */
278  tempkmin = k == minMT-1 ? minM-k*A.mb : A.mb;
279  ldak = BLKLDD(A, k);
280  for (m = 0; m < B.mt; m++) {
281  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
282  ldbm = BLKLDD(B, m);
284  plasma->quark, &task_flags,
285  side, trans,
286  tempmm, tempkn, tempkmin, ib, T.nb,
287  A(k, k), ldak,
288  T(k, k), T.mb,
289  B(m, k), ldbm);
290  }
291  for (n = k+1; n < B.nt; n++) {
292  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
293  for (m = 0; m < B.mt; m++) {
294  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
295  ldbm = BLKLDD(B, m);
297  plasma->quark, &task_flags,
298  side, trans,
299  tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb,
300  B(m, k), ldbm,
301  B(m, n), ldbm,
302  A(k, n), ldak,
303  T(k, n), T.mb);
304  }
305  }
306  }
307  }
308  }
309 }