MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
cbulge_applyQ.cpp File Reference
#include "common_magma.h"
#include <cblas.h>
Include dependency graph for cbulge_applyQ.cpp:

Go to the source code of this file.

Macros

#define PRECISION_c
#define dE(m, n)   (dE+(m) + LDE*(n))
#define dV(m)   (dV+(m))
#define dT(m)   (dT+(m))
#define E(m, n)   &(E[(m) + LDE*(n)])
#define V(m)   &(V[(m)])
#define TAU(m)   &(TAU[(m)])
#define T(m)   &(T[(m)])

Functions

void magmablas_claset_identity (magma_int_t m, magma_int_t n, cuFloatComplex *A, magma_int_t lda)
void findVTpos (int N, int NB, int Vblksiz, int sweep, int st, int *Vpos, int *TAUpos, int *Tpos, int *myblkid)
void findVTsiz (int N, int NB, int Vblksiz, int *blkcnt, int *LDV)
magma_int_t plasma_ceildiv (magma_int_t a, magma_int_t b)
void magma_cbulge_applyQ (magma_int_t WANTZ, char SIDE, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, cuFloatComplex *E, magma_int_t LDE, cuFloatComplex *V, cuFloatComplex *TAU, cuFloatComplex *T, magma_int_t *INFO, cuFloatComplex *dV, cuFloatComplex *dT, cuFloatComplex *dE, magma_int_t copytype)

Macro Definition Documentation

#define dE (   m,
 
)    (dE+(m) + LDE*(n))

Definition at line 43 of file cbulge_applyQ.cpp.

#define dT (   m)    (dT+(m))

Definition at line 45 of file cbulge_applyQ.cpp.

#define dV (   m)    (dV+(m))

Definition at line 44 of file cbulge_applyQ.cpp.

#define E (   m,
 
)    &(E[(m) + LDE*(n)])

Definition at line 46 of file cbulge_applyQ.cpp.

#define PRECISION_c

Definition at line 18 of file cbulge_applyQ.cpp.

#define T (   m)    &(T[(m)])

Definition at line 49 of file cbulge_applyQ.cpp.

#define TAU (   m)    &(TAU[(m)])

Definition at line 48 of file cbulge_applyQ.cpp.

#define V (   m)    &(V[(m)])

Definition at line 47 of file cbulge_applyQ.cpp.


Function Documentation

void findVTpos ( int  N,
int  NB,
int  Vblksiz,
int  sweep,
int  st,
int *  Vpos,
int *  TAUpos,
int *  Tpos,
int *  myblkid 
)

Definition at line 68 of file bulge_auxiliary.cpp.

{
magma_int_t prevcolblknb, prevblkcnt, prevcolblkid;
magma_int_t curcolblknb, nbprevcolblk, mastersweep;
magma_int_t blkid, locj, LDV;
prevcolblknb = 0;
prevblkcnt = 0;
curcolblknb = 0;
nbprevcolblk = sweep/Vblksiz;
for (prevcolblkid = 0; prevcolblkid < nbprevcolblk; prevcolblkid++)
{
mastersweep = prevcolblkid * Vblksiz;
prevcolblknb = plasma_ceildiv((N-(mastersweep+2)),NB);
prevblkcnt = prevblkcnt + prevcolblknb;
}
curcolblknb = plasma_ceildiv((st-sweep),NB);
blkid = prevblkcnt + curcolblknb -1;
locj = sweep%Vblksiz;
LDV = NB + Vblksiz -1;
*myblkid= blkid;
*Vpos = blkid*Vblksiz*LDV + locj*LDV + locj;
*TAUpos = blkid*Vblksiz + locj;
*Tpos = blkid*Vblksiz*Vblksiz + locj*Vblksiz + locj;
//printf("voici blkid %d locj %d vpos %d tpos %d \n",blkid,locj,*Vpos,*Tpos);
}
void findVTsiz ( int  N,
int  NB,
int  Vblksiz,
int *  blkcnt,
int *  LDV 
)

Definition at line 97 of file bulge_auxiliary.cpp.

{
magma_int_t colblk, nbcolblk;
magma_int_t curcolblknb, mastersweep;
*blkcnt = 0;
nbcolblk = plasma_ceildiv((N-1),Vblksiz);
for (colblk = 0; colblk<nbcolblk; colblk++)
{
mastersweep = colblk * Vblksiz;
curcolblknb = plasma_ceildiv((N-(mastersweep+2)),NB);
*blkcnt = *blkcnt + curcolblknb;
//printf("voici nbcolblk %d master sweep %d blkcnt %d \n",nbcolblk, mastersweep,*blkcnt);
}
*blkcnt = *blkcnt +1;
*LDV= NB+Vblksiz-1;
}
void magma_cbulge_applyQ ( magma_int_t  WANTZ,
char  SIDE,
magma_int_t  NE,
magma_int_t  N,
magma_int_t  NB,
magma_int_t  Vblksiz,
cuFloatComplex *  E,
magma_int_t  LDE,
cuFloatComplex *  V,
cuFloatComplex *  TAU,
cuFloatComplex *  T,
magma_int_t INFO,
cuFloatComplex *  dV,
cuFloatComplex *  dT,
cuFloatComplex *  dE,
magma_int_t  copytype 
)

Definition at line 50 of file cbulge_applyQ.cpp.

References dE, dT, dV, dwork, findVTpos(), findVTsiz(), magma_clarfb_gpu(), magma_cmalloc(), magma_csetmatrix(), magma_queue_create(), magma_queue_destroy(), MAGMA_SUCCESS, magmablas_claset_identity(), magmablasSetKernelStream(), MagmaColumnwise, MagmaForward, MagmaLeft, MagmaNoTrans, MagmaRight, max, min, and plasma_ceildiv().

{
//%===========================
//% local variables
//%===========================
magma_int_t LDT,LDV,blklen,firstcolj;
magma_int_t bg, nbGblk,rownbm, k, m, n;
magma_int_t st,ed,fst,vlen,vnb,colj,len;
magma_int_t blkid, vpos,taupos,tpos;
cuFloatComplex *WORK;
magma_int_t LWORK;
magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
magma_int_t nbgr, colst, coled, versionL,versionR;
magma_int_t blkcnt=-1;
*INFO=0;
versionL = 114;
versionR = 92;
LDT = Vblksiz;
LDV = NB+Vblksiz-1;
blklen = LDV*Vblksiz;
nbGblk = plasma_ceildiv((N-1),Vblksiz);
//WORK = (cuFloatComplex *) malloc (LWORK*sizeof(cuFloatComplex));
/* find the size of the matrix T V*/
findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
/* Copy E & V & T to the GPU in dE and dV and dT
* depending on copytype:
* 1: mean copy only V
* 2: mean copy V and T
* 3: mean copy V, T and E
* */
if(copytype>0) magma_csetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV );
if(copytype>1) magma_csetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT );
if(copytype>2) magma_csetmatrix( N, NE, E, N, dE, N );
cuFloatComplex *dwork;
magma_int_t ldwork;
ldwork = NE;
LWORK = 2*N*max(Vblksiz,64);
if(MAGMA_SUCCESS != magma_cmalloc( &dwork, LWORK )) {
printf ("!!!! magma_cbulge_applyQ magma_alloc failed for: dwork\n" );
exit(-1);
}
/* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1
* Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */
/* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n
* Also E is splitten by col meaning each apply consist in a block of col (vertical block) */
/* WANTZ = 1 meaning E is IDENTITY so form Q using optimized update.
* So we use the reverse order from small q to large one,
* so from q_n to q_1 so Left update to Identity.
* Use versionL 113 because in 114 we need to update the whole matrix and not in icreasing order.
* WANTZ = 2 meaning E is a full matrix and need to be updated from Left or Right so use normal update
* */
if(WANTZ==1)
{
versionL=113;
SIDE='L';
//set the matrix to Identity here to avoid copying it from the CPU
}
printf(" APPLY Q_v115 GPU with N %d NB %d Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",N,NB,Vblksiz,SIDE,versionL,versionR,WANTZ);
magma_int_t N2=N/2;
magma_int_t N1=N-N2;
#if defined(USESTREAM)
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
#endif
if(SIDE=='L'){
if(versionL==113){
for (bg = nbGblk; bg>0; bg--)
{
firstcolj = (bg-1)*Vblksiz + 1;
rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
for (m = rownbm; m>0; m--)
{
vlen = 0;
vnb = 0;
colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop
fst = (rownbm -m)*NB+colj +1;
for (k=0; k<Vblksiz; k++)
{
colj = (bg-1)*Vblksiz + k;
st = (rownbm -m)*NB+colj +1;
ed = min(st+NB-1,N-1);
if(st>ed)break;
if((st==ed)&&(colj!=N-2))break;
vlen=ed-fst+1;
vnb=k+1;
}
colst = (bg-1)*Vblksiz;
findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
//printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
if((vlen>0)&&(vnb>0)){
if(WANTZ==1){
len = N-colst;
magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len);
}else{
magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
}
}
}
}
}else if(versionL==114){
rownbm = plasma_ceildiv((N-1),NB);
for (m = rownbm; m>0; m--)
{
ncolinvolvd = min(N-1, m*NB);
avai_blksiz=min(Vblksiz,ncolinvolvd);
nbgr = plasma_ceildiv(ncolinvolvd,avai_blksiz);
for (n = nbgr; n>0; n--)
{
vlen = 0;
vnb = 0;
cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
colst = (n-1)*avai_blksiz;
coled = colst + cur_blksiz -1;
fst = (rownbm -m)*NB+colst +1;
for (colj=colst; colj<=coled; colj++)
{
st = (rownbm -m)*NB+colj +1;
ed = min(st+NB-1,N-1);
if(st>ed)break;
if((st==ed)&&(colj!=N-2))break;
vlen=ed-fst+1;
vnb=vnb+1;
}
findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
//printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colst+1,vpos+1,taupos+1);
if((vlen>0)&&(vnb>0))
magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
}
}
}
}else if (SIDE=='R'){
if(versionR==91){
for (bg =1; bg<=nbGblk; bg++)
{
firstcolj = (bg-1)*Vblksiz + 1;
rownbm = plasma_ceildiv((N-(firstcolj+1)),NB);
if(bg==nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)),NB); // last blk has size=1 used for complex to handle A(N,N-1)
for (m = 1; m<=rownbm; m++)
{
vlen = 0;
vnb = 0;
// for k=0;I compute the fst and then can remove it from the loop
colj = (bg-1)*Vblksiz;
fst = (rownbm -m)*NB+colj +1;
for (k=0; k<Vblksiz; k++)
{
colj = (bg-1)*Vblksiz + k;
st = (rownbm -m)*NB+colj +1;
ed = min(st+NB-1,N-1);
if(st>ed)break;
if((st==ed)&&(colj!=N-2))break;
vlen=ed-fst+1;
vnb=k+1;
}
colj = (bg-1)*Vblksiz;
findVTpos(N,NB,Vblksiz,colj,fst, &vpos, &taupos, &tpos, &blkid);
//printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",bg,m,vlen, vnb,colj,vpos,taupos);
if((vlen>0)&&(vnb>0)){
#if defined(USESTREAM)
magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
#else
magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
#endif
}
}
}
}else if(versionR==92){
rownbm = plasma_ceildiv((N-1),NB);
for (m = 1; m<=rownbm; m++)
{
ncolinvolvd = min(N-1, m*NB);
avai_blksiz=min(Vblksiz,ncolinvolvd);
nbgr = plasma_ceildiv(ncolinvolvd,avai_blksiz);
for (n = 1; n<=nbgr; n++)
{
vlen = 0;
vnb = 0;
cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
colst = (n-1)*avai_blksiz;
coled = colst + cur_blksiz -1;
fst = (rownbm -m)*NB+colst +1;
for (colj=colst; colj<=coled; colj++)
{
st = (rownbm -m)*NB+colj +1;
ed = min(st+NB-1,N-1);
if(st>ed)break;
if((st==ed)&&(colj!=N-2))break;
vlen=ed-fst+1;
vnb=vnb+1;
}
findVTpos(N,NB,Vblksiz,colst,fst, &vpos, &taupos, &tpos, &blkid);
if((vlen>0)&&(vnb>0)){
#if defined(USESTREAM)
magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
#else
magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
#endif
}
}
}
}
}else{
printf("ERROR SIDE %d \n",SIDE);
}
#if defined(USESTREAM)
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
#endif
}

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_claset_identity ( magma_int_t  m,
magma_int_t  n,
cuFloatComplex *  A,
magma_int_t  lda 
)

Here is the caller graph for this function:

magma_int_t plasma_ceildiv ( magma_int_t  a,
magma_int_t  b 
)

Definition at line 36 of file bulge_auxiliary.cpp.

{
r = (r-(magma_int_t)r)==0? (magma_int_t)r:(magma_int_t)r+1;
return (magma_int_t) r;
}