13 #include "common_magma.h"
/* A(i, j): address w + (i) + (j)*lda, i.e. entry (i, j) of a block that
   starts at w and advances lda elements per unit of j.
   Relies on `w` and `lda` being in scope at the point of use. */
15 #define A(i, j) (w + (j)*lda + (i))
/* B(i, j): same addressing with stride ldb, but the block starts nb*lda
   elements past w (i.e. after nb strides of lda), so A and B share the
   single buffer w. Relies on `w`, `nb`, `lda` and `ldb` being in scope. */
16 #define B(i, j) (w+nb*lda + (j)*ldb + (i))
/* dA(i, j): address da + (i) + (j)*ldda.
   NOTE(review): presumably the device-side matrix matching A (the "d"
   prefix) -- confirm against the enclosing routine's parameters. */
18 #define dA(i, j) (da + (j)*ldda + (i))
/* dB(i, j): address db + (i) + (j)*lddb.
   NOTE(review): presumably the device-side matrix matching B -- confirm
   against the enclosing routine's parameters. */
19 #define dB(i, j) (db + (j)*lddb + (i))
88 char uplo_[2] = {
uplo, 0};
103 if (itype<1 || itype>3){
109 }
else if (ldda <
max(1,n)) {
111 }
else if (lddb <
max(1,n)) {
133 static cudaStream_t stream[3];
148 B(0, 0), nb, stream[2] );
151 A(0, 0), nb, stream[1] );
153 for(k = 0; k<n; k+=nb){
166 dA(k, k), ldda, stream[0] );
172 dB(k+kb, k+kb), lddb,
173 B(0, 0), nb, stream[2] );
177 c_one,
dB(k,k), lddb,
184 c_neg_half,
dA(k,k), ldda,
186 c_one,
dA(k, k+kb), ldda);
190 c_neg_one,
dA(k,k+kb), ldda,
192 d_one,
dA(k+kb,k+kb), ldda);
195 dA(k+kb, k+kb), ldda,
196 A(0, 0), lda, stream[1] );
200 c_neg_half,
dA(k,k), ldda,
202 c_one,
dA(k, k+kb), ldda);
206 c_one ,
dB(k+kb,k+kb), lddb,
223 B(0, 0), nb, stream[2] );
226 A(0, 0), nb, stream[1] );
228 for(k = 0; k<n; k+=nb){
241 dA(k, k), ldda, stream[0] );
247 dB(k+kb, k+kb), lddb,
248 B(0, 0), nb, stream[2] );
252 c_one,
dB(k,k), lddb,
259 c_neg_half,
dA(k,k), ldda,
261 c_one,
dA(k+kb, k), ldda);
265 c_neg_one,
dA(k+kb,k), ldda,
267 d_one,
dA(k+kb,k+kb), ldda);
270 dA(k+kb, k+kb), ldda,
271 A(0, 0), lda, stream[1] );
275 c_neg_half,
dA(k,k), ldda,
277 c_one,
dA(k+kb, k), ldda);
281 c_one,
dB(k+kb,k+kb), lddb,
297 for(k = 0; k<n; k+=nb){
302 B(0, 0), nb, stream[2] );
309 c_one ,
dB(0,0), lddb,
314 c_half,
dA(k,k), ldda,
316 c_one,
dA(0, k), ldda);
324 A(0, 0), lda, stream[0] );
330 c_one,
dA(0,k), ldda,
332 d_one,
dA(0,0), ldda);
336 c_half,
dA(k,k), ldda,
338 c_one,
dA(0, k), ldda);
342 c_one,
dB(k,k), lddb,
354 dA(k, k), ldda, stream[1] );
364 for(k = 0; k<n; k+=nb){
369 B(0, 0), nb, stream[2] );
376 c_one ,
dB(0,0), lddb,
381 c_half,
dA(k,k), ldda,
383 c_one,
dA(k, 0), ldda);
391 A(0, 0), lda, stream[0] );
397 c_one,
dA(k,0), ldda,
399 d_one,
dA(0,0), ldda);
403 c_half,
dA(k,k), ldda,
405 c_one,
dA(k, 0), ldda);
409 c_one,
dB(k,k), lddb,
420 dA(k, k), ldda, stream[1] );