15 #include "common_magma.h"
/*
 * Pointer-arithmetic helpers used by the routine below.
 *
 * None of these macros is self-contained: besides their arguments they read
 * the names a, lda, c, ldc, dw, lddc, n_l, lddac, lddar, nb and lddwork, so
 * those identifiers must be in scope wherever a macro is expanded.
 */

/* Address of entry (i, j) in the array at `a`; successive j are lda elements apart. */
17 #define A(i, j) ( a+(j)*lda + (i))
/* Address of entry (i, j) in the array at `c`; successive j are ldc elements apart. */
18 #define C(i, j) ( c+(j)*ldc + (i))
/*
 * Layout of the per-GPU buffer dw[gpui], as implied by the offsets below
 * (all offsets are in elements of dw[gpui]'s pointee type):
 *
 *   [0, n_l*lddc)                      dC   : entry (i, j) at (i) + (j)*lddc
 *   next 2 * lddar*lddac elements      dA_* : two slots selected by `ind`
 *   next 2 * (nb+1)*nb elements        dt   : two slots selected by `ind`
 *   following                          dwork: slots of lddwork*nb elements
 *
 * The visible call sites pass ind_c and (ind_c+1)%2 as `ind`, i.e. the two
 * slots are used alternately.
 *
 * NOTE(review): the debug printf in the routine reports a size of
 * n_l*lddc + 2*lddac*lddar + (nb+1+lddwork)*nb elements, which accounts for
 * only one dt slot and one dwork slot, whereas these macros address two of
 * each -- confirm against the actual allocation size.
 */
20 #define dC(gpui, i, j) (dw[gpui]+(j)*lddc + (i))
/* Entry (i, j) of dA slot `ind`, with successive j lddac elements apart. */
21 #define dA_c(gpui, ind, i, j) (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
/* Same slot base as dA_c, but successive j are lddar elements apart. */
22 #define dA_r(gpui, ind, i, j) (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
/* Start of dt slot `ind`; each slot spans (nb+1)*nb elements. */
23 #define dt(gpui, ind) (dw[gpui] + n_l*lddc + 2*lddac*lddar + (ind)*(nb+1)*nb)
/* Start of dwork slot `ind`; each slot spans lddwork*nb elements. */
24 #define dwork(gpui, ind) (dw[gpui] + n_l*lddc + 2*lddac*lddar + 2*(nb+1)*nb + (ind)*lddwork*nb)
126 char side_[2] = {
side, 0};
127 char trans_[2] = {
trans, 0};
136 cuDoubleComplex t[4160];
149 lquery = (lwork == -1);
167 }
else if (k < 0 || k > nq) {
169 }
else if (lda <
max(1,nq)) {
171 }
else if (ldc <
max(1,m)) {
173 }
else if (lwork <
max(1,nw) && ! lquery) {
182 lwkopt =
max(1,nw) * nb;
195 if (m == 0 || n == 0 || k == 0) {
207 for (igpu = 0; igpu < nrgpu; ++igpu){
210 printf(
"%d: size: %ld\n", igpu, (n_l*lddc + 2*lddac*lddar + (nb+1+lddwork)*nb)*
sizeof(cuDoubleComplex));
223 c, &ldc, work, &lwork, &iinfo);
231 for (igpu = 0; igpu < nrgpu; ++igpu){
233 kb =
min(n_l, n-igpu*n_l);
236 dC(igpu, 0, 0), lddc, stream[igpu][0] );
244 i1 = (k - 1) / nb * nb;
250 for (igpu = 0; igpu < nrgpu; ++igpu){
254 dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] );
258 for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
270 for (igpu = 0; igpu < nrgpu; ++igpu){
274 dt(igpu, ind_c), ib, stream[igpu][ind_c] );
280 kb =
min(nb, k - i - i3);
281 if (kb > 0 && i+i3 >= 0){
282 for (igpu = 0; igpu < nrgpu; ++igpu){
286 dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] );
290 for (igpu = 0; igpu < nrgpu; ++igpu){
299 dA_c(igpu, ind_c, i, 0), lddac,
dt(igpu, ind_c), ib,
300 dC(igpu, i, 0), lddc,
301 dwork(igpu, ind_c), lddwork);
308 for (igpu = 0; igpu < nrgpu; ++igpu){
312 kb =
min(n_l, n-igpu*n_l);
314 dC(igpu, 0, 0), lddc,
315 C(0, igpu*n_l), ldc, stream[igpu][0] );
320 fprintf(stderr,
"The case (side == right) is not implemented\n");
370 for (igpu = 0; igpu < nrgpu; ++igpu){