MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
cbulge_applyQ.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011 The University of Tennessee and The University
3  * of Tennessee Research Foundation. All rights
4  * reserved.
5  *
6  *
7  * @author Azzam Haidar
8  * @author Stan Tomov
9  *
10  * @generated c Thu May 10 22:27:08 2012
11  *
12  */
13 
14 #include "common_magma.h"
15 //#include "magma_cbulgeinc.h"
16 #include <cblas.h>
17 // === Define what BLAS to use ============================================
18 #define PRECISION_c
19 
20 // === End defining what BLAS to use ======================================
21 
22 
23 
24 #ifdef __cplusplus
25 extern "C" {
26 #endif
28  cuFloatComplex *A, magma_int_t lda);
29  void findVTpos(int N, int NB, int Vblksiz, int sweep, int st, int *Vpos, int *TAUpos, int *Tpos, int *myblkid);
30  void findVTsiz(int N, int NB, int Vblksiz, int *blkcnt, int *LDV);
32 
33 #ifdef __cplusplus
34 }
35 #endif
36 
37 
39 
40 
41 
43 #define dE(m,n) (dE+(m) + LDE*(n))
44 #define dV(m) (dV+(m))
45 #define dT(m) (dT+(m))
46 #define E(m,n) &(E[(m) + LDE*(n)])
47 #define V(m) &(V[(m)])
48 #define TAU(m) &(TAU[(m)])
49 #define T(m) &(T[(m)])
50 extern "C" void magma_cbulge_applyQ(magma_int_t WANTZ, char SIDE, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, cuFloatComplex *E, magma_int_t LDE, cuFloatComplex *V, cuFloatComplex *TAU, cuFloatComplex *T, magma_int_t *INFO, cuFloatComplex *dV, cuFloatComplex *dT, cuFloatComplex *dE, magma_int_t copytype )
51 {
52 
53  //%===========================
54  //% local variables
55  //%===========================
56  magma_int_t LDT,LDV,blklen,firstcolj;
57  magma_int_t bg, nbGblk,rownbm, k, m, n;
58  magma_int_t st,ed,fst,vlen,vnb,colj,len;
59  magma_int_t blkid, vpos,taupos,tpos;
60  cuFloatComplex *WORK;
61  magma_int_t LWORK;
62  magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
63  magma_int_t nbgr, colst, coled, versionL,versionR;
64  magma_int_t blkcnt=-1;
65 
66  *INFO=0;
67  versionL = 114;
68  versionR = 92;
69  LDT = Vblksiz;
70  LDV = NB+Vblksiz-1;
71  blklen = LDV*Vblksiz;
72  nbGblk = plasma_ceildiv((N-1),Vblksiz);
73  //WORK = (cuFloatComplex *) malloc (LWORK*sizeof(cuFloatComplex));
74 
75  /* find the size of the matrix T V*/
76  findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
77  /* Copy E & V & T to the GPU in dE and dV and dT
78  * depending on copytype:
79  * 1: mean copy only V
80  * 2: mean copy V and T
81  * 3: mean copy V, T and E
82  * */
83  if(copytype>0) magma_csetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV );
84  if(copytype>1) magma_csetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT );
85  if(copytype>2) magma_csetmatrix( N, NE, E, N, dE, N );
86  cuFloatComplex *dwork;
87  magma_int_t ldwork;
88  ldwork = NE;
89  LWORK = 2*N*max(Vblksiz,64);
90  if(MAGMA_SUCCESS != magma_cmalloc( &dwork, LWORK )) {
91  printf ("!!!! magma_cbulge_applyQ magma_alloc failed for: dwork\n" );
92  exit(-1);
93  }
94 
95  /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1
96  * Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */
97  /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n
98  * Also E is splitten by col meaning each apply consist in a block of col (vertical block) */
99 
100  /* WANTZ = 1 meaning E is IDENTITY so form Q using optimized update.
101  * So we use the reverse order from small q to large one,
102  * so from q_n to q_1 so Left update to Identity.
103  * Use versionL 113 because in 114 we need to update the whole matrix and not in icreasing order.
104  * WANTZ = 2 meaning E is a full matrix and need to be updated from Left or Right so use normal update
105  * */
106  if(WANTZ==1)
107  {
108  versionL=113;
109  SIDE='L';
110  //set the matrix to Identity here to avoid copying it from the CPU
111  magmablas_claset_identity(N, N, dE, N);
112  }
113 
114 
115 
116  printf(" APPLY Q_v115 GPU with N %d NB %d Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",N,NB,Vblksiz,SIDE,versionL,versionR,WANTZ);
117 
118 
119  magma_int_t N2=N/2;
120  magma_int_t N1=N-N2;
121 #if defined(USESTREAM)
122  static cudaStream_t stream[2];
123  magma_queue_create( &stream[0] );
124  magma_queue_create( &stream[1] );
125 #endif
126 
127 
128  if(SIDE=='L'){
129  if(versionL==113){
130  for (bg = nbGblk; bg>0; bg--)
131  {
132  firstcolj = (bg-1)*Vblksiz + 1;
133  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
134  if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
135  for (m = rownbm; m>0; m--)
136  {
137  vlen = 0;
138  vnb = 0;
139  colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop
140  fst = (rownbm -m)*NB+colj +1;
141  for (k=0; k<Vblksiz; k++)
142  {
143  colj = (bg-1)*Vblksiz + k;
144  st = (rownbm -m)*NB+colj +1;
145  ed = min(st+NB-1,N-1);
146  if(st>ed)break;
147  if((st==ed)&&(colj!=N-2))break;
148  vlen=ed-fst+1;
149  vnb=k+1;
150  }
151  colst = (bg-1)*Vblksiz;
152  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
153  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
154  if((vlen>0)&&(vnb>0)){
155  if(WANTZ==1){
156  len = N-colst;
157  magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len);
158  }else{
159  magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
160  }
161  }
162  }
163  }
164  }else if(versionL==114){
165  rownbm = plasma_ceildiv((N-1),NB);
166  for (m = rownbm; m>0; m--)
167  {
168  ncolinvolvd = min(N-1, m*NB);
169  avai_blksiz=min(Vblksiz,ncolinvolvd);
170  nbgr = plasma_ceildiv(ncolinvolvd,avai_blksiz);
171  for (n = nbgr; n>0; n--)
172  {
173  vlen = 0;
174  vnb = 0;
175  cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
176  colst = (n-1)*avai_blksiz;
177  coled = colst + cur_blksiz -1;
178  fst = (rownbm -m)*NB+colst +1;
179  for (colj=colst; colj<=coled; colj++)
180  {
181  st = (rownbm -m)*NB+colj +1;
182  ed = min(st+NB-1,N-1);
183  if(st>ed)break;
184  if((st==ed)&&(colj!=N-2))break;
185  vlen=ed-fst+1;
186  vnb=vnb+1;
187  }
188  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
189  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
190  if((vlen>0)&&(vnb>0))
191  magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
192  }
193  }
194  }
195  }else if (SIDE=='R'){
196  if(versionR==91){
197  for (bg =1; bg<=nbGblk; bg++)
198  {
199  firstcolj = (bg-1)*Vblksiz + 1;
200  rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
201  if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
202  for (m = 1; m<=rownbm; m++)
203  {
204  vlen = 0;
205  vnb = 0;
206  // for k=0;I compute the fst and then can remove it from the loop
207  colj = (bg-1)*Vblksiz;
208  fst = (rownbm -m)*NB+colj +1;
209  for (k=0; k<Vblksiz; k++)
210  {
211  colj = (bg-1)*Vblksiz + k;
212  st = (rownbm -m)*NB+colj +1;
213  ed = min(st+NB-1,N-1);
214  if(st>ed)break;
215  if((st==ed)&&(colj!=N-2))break;
216  vlen=ed-fst+1;
217  vnb=k+1;
218  }
219  colj = (bg-1)*Vblksiz;
220  findVTpos(N,NB,Vblksiz,colj,fst, &vpos, &taupos, &tpos, &blkid);
221  //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colj,vpos,taupos);
222  if((vlen>0)&&(vnb>0)){
223  #if defined(USESTREAM)
224  magmablasSetKernelStream(stream[0]);
225  magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
226  magmablasSetKernelStream(stream[1]);
227  magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
228  #else
229  magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
230  #endif
231  }
232  }
233  }
234  }else if(versionR==92){
235  rownbm = plasma_ceildiv((N-1),NB);
236  for (m = 1; m<=rownbm; m++)
237  {
238  ncolinvolvd = min(N-1, m*NB);
239  avai_blksiz=min(Vblksiz,ncolinvolvd);
240  nbgr = plasma_ceildiv(ncolinvolvd,avai_blksiz);
241  for (n = 1; n<=nbgr; n++)
242  {
243  vlen = 0;
244  vnb = 0;
245  cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
246  colst = (n-1)*avai_blksiz;
247  coled = colst + cur_blksiz -1;
248  fst = (rownbm -m)*NB+colst +1;
249  for (colj=colst; colj<=coled; colj++)
250  {
251  st = (rownbm -m)*NB+colj +1;
252  ed = min(st+NB-1,N-1);
253  if(st>ed)break;
254  if((st==ed)&&(colj!=N-2))break;
255  vlen=ed-fst+1;
256  vnb=vnb+1;
257  }
258  findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
259  if((vlen>0)&&(vnb>0)){
260  #if defined(USESTREAM)
261  magmablasSetKernelStream(stream[0]);
262  magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
263  magmablasSetKernelStream(stream[1]);
264  magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
265  #else
266  magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
267  #endif
268  }
269  }
270  }
271  }
272  }else{
273  printf("ERROR SIDE %d \n",SIDE);
274  }
275 
276 #if defined(USESTREAM)
278  magma_queue_destroy( stream[0] );
279  magma_queue_destroy( stream[1] );
280 #endif
281 
282 
283 }
284 #undef E
285 #undef V
286 #undef TAU
287 #undef T
288 #undef dE
289 #undef dV
290 #undef dT
291 
292 
293 
294