14 #include "common_magma.h"
// Forward declaration of findVTpos; its definition is not in this part of the file.
// Call sites in magma_cbulge_applyQ pass N, NB and Vblksiz, then a starting
// column index and a starting row index for the current block, followed by the
// addresses of four local ints (vpos, taupos, tpos, blkid).
// After the call, vpos and tpos are used as offsets into dV and dT
// (via the dV(...) and dT(...) macros) when invoking magma_clarfb_gpu.
// NOTE(review): the exact meaning of `sweep`/`st` and of the TAUpos/myblkid
// results is not visible here; confirm against the definition.
29 void findVTpos(
int N,
int NB,
int Vblksiz,
int sweep,
int st,
int *Vpos,
int *TAUpos,
int *Tpos,
int *myblkid);
// Forward declaration of findVTsiz; its definition is not in this part of the file.
// Takes the sizes N, NB and Vblksiz by value and two int pointers, blkcnt and LDV.
// NOTE(review): presumably blkcnt and LDV are outputs (a block count and the
// leading dimension later passed as LDV to magma_clarfb_gpu); no call is
// visible in this view, so verify against the definition and the caller.
30 void findVTsiz(
int N,
int NB,
int Vblksiz,
int *blkcnt,
int *LDV);
// Addressing helpers. Each macro expands to a pointer, not a value.
// They use the identifiers dE, dV, dT, E, V, TAU, T and LDE from the scope
// where they are expanded (these are parameters of magma_cbulge_applyQ).
//
// dE(m,n): pointer to dE offset by m + LDE*n, i.e. row m, column n with
// leading dimension LDE.
43 #define dE(m,n) (dE+(m) + LDE*(n))
// dV(m) / dT(m): pointer to dV / dT offset by m elements.
44 #define dV(m) (dV+(m))
45 #define dT(m) (dT+(m))
// E(m,n): address of element E[m + LDE*n]; same indexing as dE(m,n) but on E.
46 #define E(m,n) &(E[(m) + LDE*(n)])
// V(m), TAU(m), T(m): address of the m-th element of V, TAU and T.
// NOTE: these expand to an unparenthesized `&(...)` expression, so take care
// when combining them with higher-precedence operators at the call site.
47 #define V(m) &(V[(m)])
48 #define TAU(m) &(TAU[(m)])
49 #define T(m) &(T[(m)])
50 extern "C" void magma_cbulge_applyQ(
magma_int_t WANTZ,
char SIDE,
magma_int_t NE,
magma_int_t N,
magma_int_t NB,
magma_int_t Vblksiz, cuFloatComplex *
E,
magma_int_t LDE, cuFloatComplex *
V, cuFloatComplex *
TAU, cuFloatComplex *
T,
magma_int_t *INFO, cuFloatComplex *
dV, cuFloatComplex *
dT, cuFloatComplex *
dE,
magma_int_t copytype )
86 cuFloatComplex *
dwork;
89 LWORK = 2*N*
max(Vblksiz,64);
91 printf (
"!!!! magma_cbulge_applyQ magma_alloc failed for: dwork\n" );
116 printf(
" APPLY Q_v115 GPU with N %d NB %d Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",N,NB,Vblksiz,SIDE,versionL,versionR,WANTZ);
121 #if defined(USESTREAM)
122 static cudaStream_t stream[2];
130 for (bg = nbGblk; bg>0; bg--)
132 firstcolj = (bg-1)*Vblksiz + 1;
135 for (m = rownbm; m>0; m--)
139 colj = (bg-1)*Vblksiz;
140 fst = (rownbm -m)*NB+colj +1;
141 for (k=0; k<Vblksiz; k++)
143 colj = (bg-1)*Vblksiz + k;
144 st = (rownbm -m)*NB+colj +1;
145 ed =
min(st+NB-1,N-1);
147 if((st==ed)&&(colj!=N-2))
break;
151 colst = (bg-1)*Vblksiz;
152 findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
154 if((vlen>0)&&(vnb>0)){
157 magma_clarfb_gpu(
MagmaLeft,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, vlen, len, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(fst,colst), LDE, dwork, len);
159 magma_clarfb_gpu(
MagmaLeft,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, vlen, NE, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(fst,0), LDE, dwork, NE);
164 }
else if(versionL==114){
166 for (m = rownbm; m>0; m--)
168 ncolinvolvd =
min(N-1, m*NB);
169 avai_blksiz=
min(Vblksiz,ncolinvolvd);
171 for (n = nbgr; n>0; n--)
175 cur_blksiz =
min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
176 colst = (n-1)*avai_blksiz;
177 coled = colst + cur_blksiz -1;
178 fst = (rownbm -m)*NB+colst +1;
179 for (colj=colst; colj<=coled; colj++)
181 st = (rownbm -m)*NB+colj +1;
182 ed =
min(st+NB-1,N-1);
184 if((st==ed)&&(colj!=N-2))
break;
188 findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
190 if((vlen>0)&&(vnb>0))
191 magma_clarfb_gpu(
MagmaLeft,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, vlen, NE, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(fst,0), LDE, dwork, NE);
195 }
else if (SIDE==
'R'){
197 for (bg =1; bg<=nbGblk; bg++)
199 firstcolj = (bg-1)*Vblksiz + 1;
202 for (m = 1; m<=rownbm; m++)
207 colj = (bg-1)*Vblksiz;
208 fst = (rownbm -m)*NB+colj +1;
209 for (k=0; k<Vblksiz; k++)
211 colj = (bg-1)*Vblksiz + k;
212 st = (rownbm -m)*NB+colj +1;
213 ed =
min(st+NB-1,N-1);
215 if((st==ed)&&(colj!=N-2))
break;
219 colj = (bg-1)*Vblksiz;
220 findVTpos(N,NB,Vblksiz,colj,fst, &vpos, &taupos, &tpos, &blkid);
222 if((vlen>0)&&(vnb>0)){
223 #if defined(USESTREAM)
225 magma_clarfb_gpu(
MagmaRight,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, N1, vlen, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(0, fst), LDE, dwork, N1);
227 magma_clarfb_gpu(
MagmaRight,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, N2, vlen, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
229 magma_clarfb_gpu(
MagmaRight,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, NE, vlen, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(0, fst), LDE, dwork, NE);
234 }
else if(versionR==92){
236 for (m = 1; m<=rownbm; m++)
238 ncolinvolvd =
min(N-1, m*NB);
239 avai_blksiz=
min(Vblksiz,ncolinvolvd);
241 for (n = 1; n<=nbgr; n++)
245 cur_blksiz =
min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
246 colst = (n-1)*avai_blksiz;
247 coled = colst + cur_blksiz -1;
248 fst = (rownbm -m)*NB+colst +1;
249 for (colj=colst; colj<=coled; colj++)
251 st = (rownbm -m)*NB+colj +1;
252 ed =
min(st+NB-1,N-1);
254 if((st==ed)&&(colj!=N-2))
break;
258 findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
259 if((vlen>0)&&(vnb>0)){
260 #if defined(USESTREAM)
262 magma_clarfb_gpu(
MagmaRight,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, N1, vlen, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(0, fst), LDE, dwork, N1);
264 magma_clarfb_gpu(
MagmaRight,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, N2, vlen, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
266 magma_clarfb_gpu(
MagmaRight,
MagmaNoTrans,
MagmaForward,
MagmaColumnwise, NE, vlen, vnb,
dV(vpos), LDV,
dT(tpos), LDT,
dE(0, fst), LDE, dwork, NE);
273 printf(
"ERROR SIDE %d \n",SIDE);
276 #if defined(USESTREAM)