MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
magma_ds.h File Reference
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

magma_int_t magma_dsgesv_gpu (char trans, magma_int_t N, magma_int_t NRHS, double *dA, magma_int_t ldda, magma_int_t *IPIV, magma_int_t *dIPIV, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info)
magma_int_t magma_dsgetrs_gpu (char trans, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, magma_int_t *ipiv, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, float *dSX, magma_int_t *info)
magma_int_t magma_dsposv_gpu (char uplo, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info)
magma_int_t magma_dsgeqrsv_gpu (magma_int_t M, magma_int_t N, magma_int_t NRHS, double *dA, magma_int_t ldda, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info)

Function Documentation

magma_int_t magma_dsgeqrsv_gpu ( magma_int_t  M,
magma_int_t  N,
magma_int_t  NRHS,
double *  dA,
magma_int_t  ldda,
double *  dB,
magma_int_t  lddb,
double *  dX,
magma_int_t  lddx,
magma_int_t iter,
magma_int_t info 
)

Definition at line 17 of file dsgeqrsv_gpu.cpp.

References __func__, BWDMAX, dT, ITERMAX, lapackf77_dlamch, lapackf77_dlange(), MAGMA_D_NEG_ONE, MAGMA_D_ONE, magma_dgemm(), magma_dgemv(), magma_dgeqrf_gpu(), magma_dgeqrs_gpu(), magma_dgetmatrix(), magma_dmalloc(), MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_free(), magma_get_dgeqrf_nb(), magma_get_sgeqrf_nb(), magma_idamax(), magma_sgeqrf_gpu(), magma_sgeqrs_gpu(), magma_smalloc(), MAGMA_SUCCESS, magma_xerbla(), magmablas_dlacpy(), magmablas_dlag2s(), magmablas_dlange(), magmablas_dsaxpycp(), magmablas_slag2d(), MagmaNoTrans, MagmaUpperLower, max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
DSGEQRSV solves the least squares problem
min || A*X - B ||,
where A is an M-by-N matrix and X and B are M-by-NRHS matrices.
DSGEQRSV first attempts to factorize the matrix in SINGLE PRECISION
and use this factorization within an iterative refinement procedure
to produce a solution with DOUBLE PRECISION norm-wise backward error
quality (see below). If the approach fails the method switches to a
DOUBLE PRECISION factorization and solve.
The iterative refinement is not going to be a winning strategy if
the ratio SINGLE PRECISION performance over DOUBLE PRECISION
performance is too small. A reasonable strategy should take the
number of right-hand sides and the size of the matrix into account.
This might be done with a call to ILAENV in the future. Up to now, we
always try iterative refinement.
The iterative refinement process is stopped if
ITER > ITERMAX
or for all the RHS we have:
RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
where
o ITER is the number of the current iteration in the iterative
refinement process
o RNRM is the infinity-norm of the residual
o XNRM is the infinity-norm of the solution
o ANRM is the infinity-operator-norm of the matrix A
o EPS is the machine epsilon returned by DLAMCH('Epsilon')
The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of right hand sides, i.e., the number of columns
of the matrix B. NRHS >= 0.
A (input or input/output) DOUBLE PRECISION array, dimension (ldda,N)
On entry, the M-by-N coefficient matrix A.
On exit, if iterative refinement has been successfully used
(info.EQ.0 and ITER.GE.0, see description below), A is
unchanged. If double precision factorization has been used
(info.EQ.0 and ITER.LT.0, see description below), then the
array A contains the QR factorization of A as returned by
function DGEQRF_GPU.
ldda (input) INTEGER
The leading dimension of the array A. ldda >= max(1,M).
B (input) DOUBLE PRECISION array, dimension (LDB,NRHS)
The M-by-NRHS right hand side matrix B.
LDB (input) INTEGER
The leading dimension of the array B. LDB >= max(1,M).
X (output) DOUBLE PRECISION array, dimension (LDX,NRHS)
If info = 0, the N-by-NRHS solution matrix X.
LDX (input) INTEGER
The leading dimension of the array X. LDX >= max(1,N).
WORK (workspace) DOUBLE PRECISION array, dimension (N*NRHS)
This array is used to hold the residual vectors.
SWORK (workspace) REAL array, dimension (M*(N+NRHS))
This array is used to store the single precision matrix and the
right-hand sides or solutions in single precision.
ITER (output) INTEGER
< 0: iterative refinement has failed, double precision
factorization has been performed
-1 : the routine fell back to full precision for
implementation- or machine-specific reasons
-2 : narrowing the precision induced an overflow,
the routine fell back to full precision
-3 : failure of SGETRF
-31: stop the iterative refinement after the 30th
iterations
> 0: iterative refinement has been successfully used.
Returns the number of iterations
info (output) INTEGER
= 0: successful exit
< 0: if info = -i, the i-th argument had an illegal value
TAU (output) REAL array, dimension (N)
On exit, TAU(i) contains the scalar factor of the elementary
reflector H(i), as returned by magma_sgeqrf_gpu.
LWORK (input) INTEGER
The dimension of the array H_WORK. LWORK >= (M+N+NB)*NB,
where NB can be obtained through magma_get_sgeqrf_nb(M).
H_WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
Higher performance is achieved if H_WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
D_WORK (workspace/output) REAL array on the GPU, dimension 2*N*NB,
where NB can be obtained through magma_get_sgeqrf_nb(M).
It starts with NB*NB blocks that store the triangular T
matrices, followed by the NB*NB blocks of the diagonal
inverses for the R matrix.
TAU_D (output) DOUBLE REAL array, dimension (N)
On exit, if the matrix had to be factored in double precision,
TAU(i) contains the scalar factor of the elementary
reflector H(i), as returned by magma_dgeqrf_gpu.
LWORK_D (input) INTEGER
The dimension of the array H_WORK_D. LWORK_D >= (M+N+NB)*NB,
where NB can be obtained through magma_get_dgeqrf_nb(M).
H_WORK_D (workspace/output) DOUBLE REAL array, dimension (MAX(1,LWORK_D))
This memory is unattached if the iterative refinement worked,
otherwise it is used as workspace to factor the matrix in
double precision. Higher performance is achieved if H_WORK_D is
in pinned memory, e.g. allocated using magma_malloc_host.
D_WORK_D (workspace/output) DOUBLE REAL array on the GPU, dimension 2*N*NB,
where NB can be obtained through magma_get_dgeqrf_nb(M).
This memory is unattached if the iterative refinement worked,
otherwise it is used as workspace to factor the matrix in
double precision. It starts with NB*NB blocks that store the
triangular T matrices, followed by the NB*NB blocks of the
diagonal inverses for the R matrix.
===================================================================== */
double c_neg_one = MAGMA_D_NEG_ONE;
double c_one = MAGMA_D_ONE;
magma_int_t ione = 1;
double *dworkd, *hworkd;
float *dworks, *hworks;
double *dR, *tau, *dT;
float *dSA, *dSX, *dST, *stau;
double Xnrmv, Rnrmv;
double Anrm, Xnrm, Rnrm, cte, eps;
magma_int_t i, j, iiter, nb, lhwork, minmn, size;
/*
Check The Parameters.
*/
*iter = 0 ;
*info = 0 ;
if ( N < 0 )
*info = -1;
else if(NRHS<0)
*info = -3;
else if( ldda < max(1,N))
*info = -5;
else if( lddb < max(1,N))
*info = -7;
else if( lddx < max(1,N))
*info = -9;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
if( N == 0 || NRHS == 0 )
return *info;
minmn= min(M, N);
/*
* Allocate temporary buffers
*/
/* dworks(dSA + dSX + dST) */
size = ldda*N + N*NRHS + ( 2*minmn + ((N+31)/32)*32 )*nb;
if (MAGMA_SUCCESS != magma_smalloc( &dworks, size )) {
fprintf(stderr, "Allocation of dworks failed (%d)\n", size);
return *info;
}
dSA = dworks;
dSX = dSA + ldda*N;
dST = dSX + N*NRHS;
/* dworkd(dR) = N*NRHS */
size = N*NRHS;
if (MAGMA_SUCCESS != magma_dmalloc( &dworkd, size )) {
magma_free( dworks );
fprintf(stderr, "Allocation of dworkd failed\n");
return *info;
}
dR = dworkd;
/* hworks(stau + workspace for cgeqrs) = min(M,N) + lhworks */
lhwork = nb*max((M-N+nb+2*(NRHS)), 1);
lhwork = max(lhwork, N*nb); /* We hope that magma nb is bigger than lapack nb to have enough memory in workspace */
size = minmn + lhwork;
hworks = (float*) malloc( size * sizeof(float) );
if( hworks == NULL ) {
magma_free( dworks );
magma_free( dworkd );
fprintf(stderr, "Allocation of hworks failed\n");
return *info;
}
stau = hworks;
hworks += minmn;
eps = lapackf77_dlamch("Epsilon");
Anrm = magmablas_dlange('I', M, N, dA, ldda, (double*)dworkd );
cte = Anrm * eps * pow((double)N, 0.5) * BWDMAX ;
/*
* Convert to single precision
*/
magmablas_dlag2s(N, NRHS, dB, lddb, dSX, N, info );
if( *info != 0 ) {
*iter = -2; goto L40;
}
magmablas_dlag2s(N, N, dA, ldda, dSA, ldda, info );
if(*info !=0){
*iter = -2; goto L40;
}
// In an ideal version these variables should come from user.
magma_sgeqrf_gpu(M, N, dSA, ldda, stau, dST, info);
if( *info != 0 ) {
*iter = -3; goto L40;
}
magma_sgeqrs_gpu(M, N, NRHS, dSA, ldda, stau, dST, dSX, N, hworks, lhwork, info);
// dX = dSX
magmablas_slag2d(N, NRHS, dSX, N, dX, lddx, info);
// dR = dB
magmablas_dlacpy(MagmaUpperLower, N, NRHS, dB, lddb, dR, N);
// dR = dB - dA * dX
if( NRHS == 1 )
c_neg_one, dA, ldda,
dX, 1,
c_one, dR, 1);
else
c_neg_one, dA, ldda,
dX, lddx,
c_one, dR, N );
for(i=0; i<NRHS; i++){
j = magma_idamax( N, dX+i*N, 1);
magma_dgetmatrix( 1, 1, dX+i*N+j-1, 1, &Xnrmv, 1 );
Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );
j = magma_idamax ( N, dR+i*N, 1 );
magma_dgetmatrix( 1, 1, dR+i*N+j-1, 1, &Rnrmv, 1 );
Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );
if( Rnrm > Xnrm *cte ) goto L10;
}
*iter = 0;
/* Free workspaces */
magma_free( dworks );
magma_free( dworkd );
free(stau);
return *info;
L10:
for(iiter=1; iiter<ITERMAX; ) {
*info = 0 ;
/* Convert R from double precision to single precision
and store the result in SX.
Solve the system SA*SX = SR.
-- These two Tasks are merged here. */
// make SWORK = WORK ... residuals...
magmablas_dlag2s( N, NRHS, dR, N, dSX, N, info );
magma_sgeqrs_gpu( M, N, NRHS, dSA, ldda, stau, dST, dSX, N, hworks, lhwork, info);
if( *info != 0 ){
*iter = -3; goto L40;
}
for(i=0; i<NRHS; i++) {
magmablas_dsaxpycp( dSX+i*N, dX+i*lddx, N, dB+i*lddb, dR+i*N );
}
/* unnecessary may be */
magmablas_dlacpy(MagmaUpperLower, N, NRHS, dB, lddb, dR, N);
if( NRHS == 1 )
c_neg_one, dA, ldda,
dX, 1,
c_one, dR, 1);
else
c_neg_one, dA, ldda,
dX, lddx,
c_one, dR, N);
/* Check whether the NRHS normwise backward errors satisfy the
stopping criterion. If yes, set ITER=IITER>0 and return. */
for(i=0;i<NRHS;i++)
{
j = magma_idamax( N, dX+i*N, 1);
magma_dgetmatrix( 1, 1, dX+i*N+j-1, 1, &Xnrmv, 1 );
Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );
j = magma_idamax ( N, dR+i*N, 1 );
magma_dgetmatrix( 1, 1, dR+i*N+j-1, 1, &Rnrmv, 1 );
Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );
if( Rnrm > Xnrm *cte ) goto L20;
}
/* If we are here, the NRHS normwise backward errors satisfy
the stopping criterion, we are good to exit. */
*iter = iiter ;
/* Free workspaces */
magma_free( dworks );
magma_free( dworkd );
free(stau);
return *info;
L20:
iiter++;
}
/* If we are at this place of the code, this is because we have
performed ITER=ITERMAX iterations and never satisified the
stopping criterion, set up the ITER flag accordingly and follow
up on double precision routine. */
*iter = -ITERMAX - 1 ;
L40:
magma_free( dworks );
/*
* Allocate temporary buffers
*/
/* dworkd(dT + tau) = min_mn + min_mn*nb*3 */
size = minmn * (3 * nb + 1);
if ( size > (N*NRHS) ) {
magma_free( dworkd );
if (MAGMA_SUCCESS != magma_dmalloc( &dworkd, size )) {
fprintf(stderr, "Allocation of dworkd2 failed\n");
return *info;
}
}
tau = dworkd;
dT = tau + minmn;
/* hworks(stau + workspace for cgeqrs) = min(M,N) + lhworks */
if ( (2*lhwork) > (minmn+lhwork) ) {
free(stau);
hworks = (float*) malloc( lhwork * sizeof(double) );
if( hworks == NULL ) {
magma_free( dworkd );
fprintf(stderr, "Allocation of hworkd2 failed\n");
return *info;
}
}
hworkd = (double*) hworks;
/* Single-precision iterative refinement failed to converge to a
satisfactory solution, so we resort to double precision. */
magma_dgeqrf_gpu(M, N, dA, ldda, tau, dT, info);
if ( *info == 0 ) {
magmablas_dlacpy(MagmaUpperLower, N, NRHS, dB, lddb, dX, lddx);
magma_dgeqrs_gpu(M, N, NRHS, dA, ldda, tau, dT, dX, lddx, hworkd, lhwork, info);
}
magma_free( dworkd );
free(hworkd);
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_dsgesv_gpu ( char  trans,
magma_int_t  N,
magma_int_t  NRHS,
double *  dA,
magma_int_t  ldda,
magma_int_t IPIV,
magma_int_t dIPIV,
double *  dB,
magma_int_t  lddb,
double *  dX,
magma_int_t  lddx,
double *  dworkd,
float *  dworks,
magma_int_t iter,
magma_int_t info 
)

Definition at line 17 of file dsgesv_gpu.cpp.

References __func__, BWDMAX, ITERMAX, lapackf77_dlamch, lapackf77_dlange(), MAGMA_D_NEG_ONE, MAGMA_D_ONE, magma_dgemm(), magma_dgemv(), magma_dgetmatrix(), magma_dgetrf_gpu(), magma_dgetrs_gpu(), magma_dsgetrs_gpu(), magma_idamax(), magma_setvector(), magma_sgetrf_gpu(), magma_xerbla(), magmablas_daxpycp(), magmablas_dlacpy(), magmablas_dlag2s(), magmablas_dlange(), MagmaNoTrans, MagmaUpperLower, max, and swp2pswp().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
DSGESV computes the solution to a real system of linear equations
A * X = B or A' * X = B
where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
DSGESV first attempts to factorize the matrix in SINGLE PRECISION
and use this factorization within an iterative refinement procedure
to produce a solution with DOUBLE PRECISION norm-wise backward error
quality (see below). If the approach fails the method switches to a
DOUBLE PRECISION factorization and solve.
The iterative refinement is not going to be a winning strategy if
the ratio SINGLE PRECISION performance over DOUBLE PRECISION
performance is too small. A reasonable strategy should take the
number of right-hand sides and the size of the matrix into account.
This might be done with a call to ILAENV in the future. Up to now, we
always try iterative refinement.
The iterative refinement process is stopped if
ITER > ITERMAX
or for all the RHS we have:
RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
where
o ITER is the number of the current iteration in the iterative
refinement process
o RNRM is the infinity-norm of the residual
o XNRM is the infinity-norm of the solution
o ANRM is the infinity-operator-norm of the matrix A
o EPS is the machine epsilon returned by DLAMCH('Epsilon')
The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.
Arguments
=========
TRANS (input) CHARACTER*1
Specifies the form of the system of equations:
= 'N': A * X = B (No transpose)
= 'T': A'* X = B (Transpose)
= 'C': A'* X = B (ugate transpose = Transpose)
N (input) INTEGER
The number of linear equations, i.e., the order of the
matrix A. N >= 0.
NRHS (input) INTEGER
The number of right hand sides, i.e., the number of columns
of the matrix B. NRHS >= 0.
dA (input or input/output) DOUBLE PRECISION array, dimension (ldda,N)
On entry, the N-by-N coefficient matrix A.
On exit, if iterative refinement has been successfully used
(info.EQ.0 and ITER.GE.0, see description below), A is
unchanged. If double precision factorization has been used
(info.EQ.0 and ITER.LT.0, see description below), then the
array A contains the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
ldda (input) INTEGER
The leading dimension of the array A. ldda >= max(1,N).
IPIV (output) INTEGER array, dimension (N)
The pivot indices that define the permutation matrix P;
row i of the matrix was interchanged with row IPIV(i).
Corresponds either to the single precision factorization
(if info.EQ.0 and ITER.GE.0) or the double precision
factorization (if info.EQ.0 and ITER.LT.0).
dIPIV (output) INTEGER array on the GPU, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was moved to row IPIV(i).
dB (input) DOUBLE PRECISION array, dimension (lddb,NRHS)
The N-by-NRHS right hand side matrix B.
lddb (input) INTEGER
The leading dimension of the array B. lddb >= max(1,N).
dX (output) DOUBLE PRECISION array, dimension (lddx,NRHS)
If info = 0, the N-by-NRHS solution matrix X.
lddx (input) INTEGER
The leading dimension of the array X. lddx >= max(1,N).
dworkd (workspace) DOUBLE PRECISION array, dimension (N*NRHS)
This array is used to hold the residual vectors.
dworks (workspace) SINGLE PRECISION array, dimension (N*(N+NRHS))
This array is used to store the single precision matrix and the
right-hand sides or solutions in single precision.
iter (output) INTEGER
< 0: iterative refinement has failed, double precision
factorization has been performed
-1 : the routine fell back to full precision for
implementation- or machine-specific reasons
-2 : narrowing the precision induced an overflow,
the routine fell back to full precision
-3 : failure of SGETRF
-31: stop the iterative refinement after the 30th
iterations
> 0: iterative refinement has been successfully used.
Returns the number of iterations
info (output) INTEGER
= 0: successful exit
< 0: if info = -i, the i-th argument had an illegal value
> 0: if info = i, U(i,i) computed in DOUBLE PRECISION is
exactly zero. The factorization has been completed,
but the factor U is exactly singular, so the solution
could not be computed.
===================================================================== */
double c_neg_one = MAGMA_D_NEG_ONE;
double c_one = MAGMA_D_ONE;
magma_int_t ione = 1;
double cte, eps;
double Xnrmv, Rnrmv;
float *dSA, *dSX;
double Anrm, Xnrm, Rnrm;
magma_int_t i, j, iiter;
/*
Check The Parameters.
*/
*info = *iter = 0 ;
if ( N <0)
*info = -1;
else if(NRHS<0)
*info =-2;
else if(ldda < max(1,N))
*info =-4;
else if( lddb < max(1,N))
*info =-8;
else if( lddx < max(1,N))
*info =-10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
if( N == 0 || NRHS == 0 )
return *info;
eps = lapackf77_dlamch("Epsilon");
Anrm = magmablas_dlange('I', N, N, dA, ldda, (double*)dworkd );
cte = Anrm * eps * pow((double)N,0.5) * BWDMAX;
dSX = dworks;
dSA = dworks+N*NRHS;
if(*info !=0){
*iter = -2 ;
printf("magmablas_dlag2s\n");
goto L40;
}
magmablas_dlag2s(N, N, dA, ldda, dSA, N, info ); // Merge with DLANGE /
if(*info !=0){
*iter = -2 ;
printf("magmablas_dlag2s\n");
goto L40;
}
magma_sgetrf_gpu(N, N, dSA, N, IPIV, info);
// Generate parallel pivots
{
int *newipiv = (int*)malloc(N * sizeof(int));
swp2pswp(trans, N, IPIV, newipiv);
magma_setvector( N, sizeof(int), newipiv, 1, dIPIV, 1 );
free(newipiv);
}
if(info[0] !=0){
*iter = -3 ;
goto L40;
}
magma_dsgetrs_gpu(trans, N, NRHS, dSA, N, dIPIV, dB, lddb, dX, lddx, dSX, info);
magmablas_dlacpy(MagmaUpperLower, N, NRHS, dB, lddb, dworkd, N);
/* TODO: update optimisation from gemv_MLU into classic gemv */
if ( NRHS == 1 )
magma_dgemv( trans, N, N, c_neg_one, dA, ldda, dX, 1, c_one, dworkd, 1);
else
magma_dgemm( trans, MagmaNoTrans, N, NRHS, N, c_neg_one, dA, ldda, dX, lddx, c_one, dworkd, N);
for(i=0;i<NRHS;i++)
{
j = magma_idamax( N, dX+i*lddx, 1) ;
magma_dgetmatrix( 1, 1, dX+i*lddx+j-1, 1, &Xnrmv, 1 );
Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );
j = magma_idamax ( N, dworkd+i*N, 1 );
magma_dgetmatrix( 1, 1, dworkd+i*N+j-1, 1, &Rnrmv, 1 );
Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );
if( Rnrm > (Xnrm*cte) ){
goto L10;
}
}
*iter = 0;
return *info;
L10:
;
for(iiter=1;iiter<ITERMAX;)
{
*info = 0 ;
/*
Convert R (in dworkd) from double precision to single precision
and store the result in SX.
Solve the system SA * X = dworkd and store the result in dworkd.
-- These two Tasks are merged here.
*/
magma_dsgetrs_gpu(trans, N, NRHS, dSA, N, dIPIV, dworkd, lddb, dworkd, N, dSX, info);
if(info[0] != 0){
*iter = -3 ;
goto L40;
}
/* Add the correction (dworkd) to dX and make dworkd = dB.
This saves going through dworkd a second time (if done with one more kernel). */
for(i=0;i<NRHS;i++){
magmablas_daxpycp(dworkd+i*N, dX+i*lddx, N, dB+i*lddb);
}
//magmablas_dlacpy(MagmaUpperLower, N, NRHS, dB, lddb, dworkd, N);
if( NRHS == 1 )
/* TODO: update optimisation from gemv_MLU into classic gemv */
magma_dgemv( trans, N, N, c_neg_one, dA, ldda, dX, 1, c_one, dworkd, 1);
else
magma_dgemm( trans, MagmaNoTrans, N, NRHS, N, c_neg_one, dA, ldda, dX, lddx, c_one, dworkd, N);
/*
Check whether the NRHS normwise backward errors satisfy the
stopping criterion. If yes, set ITER=IITER>0 and return.
*/
for(i=0;i<NRHS;i++)
{
j = magma_idamax( N, dX+i*lddx, 1) ;
magma_dgetmatrix( 1, 1, dX+i*lddx+j-1, 1, &Xnrmv, 1 );
Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );
j = magma_idamax ( N, dworkd+i*N, 1 );
magma_dgetmatrix( 1, 1, dworkd+i*N+j-1, 1, &Rnrmv, 1 );
Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );
if( Rnrm > Xnrm *cte ){
goto L20;
}
}
/*
If we are here, the NRHS normwise backward errors satisfy the
stopping criterion, we are good to exit.
*/
*iter = iiter ;
return *info;
L20:
iiter++ ;
}
/*
If we are at this place of the code, this is because we have
performed ITER=ITERMAX iterations and never satisified the
stopping criterion, set up the ITER flag accordingly and follow up
on double precision routine.
*/
*iter = -ITERMAX - 1 ;
L40:
/*
Single-precision iterative refinement failed to converge to a
satisfactory solution, so we resort to double precision.
*/
if( *info != 0 ){
return *info;
}
magma_dgetrf_gpu( N, N, dA, ldda, IPIV, info );
if( *info == 0 ){
magmablas_dlacpy( MagmaUpperLower, N, NRHS, dB, lddb, dX, lddx );
magma_dgetrs_gpu( trans, N, NRHS, dA, ldda, IPIV, dX, lddx, info );
}
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_dsgetrs_gpu ( char  trans,
magma_int_t  n,
magma_int_t  nrhs,
float *  dA,
magma_int_t  ldda,
magma_int_t ipiv,
double *  dB,
magma_int_t  lddb,
double *  dX,
magma_int_t  lddx,
float *  dSX,
magma_int_t info 
)

Definition at line 21 of file dsgetrs_gpu.cpp.

References __func__, lapackf77_lsame, MAGMA_S_ONE, magma_strsm, magma_xerbla(), magmablas_dlag2s(), magmablas_dslaswp(), magmablas_slag2d(), MagmaLeft, MagmaLower, MagmaNonUnit, MagmaNoTrans, MagmaTrans, MagmaUnit, MagmaUpper, and trans.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
DSGETRS solves a system of linear equations
A * X = B or A' * X = B
with a general N-by-N matrix A using the LU factorization computed
by MAGMA_SGETRF_GPU. B and X are in DOUBLE PRECISION, and A in in SINGLE PRECISION.
This routine is used in the mixed precision iterative solver
magma_dsgesv.
Arguments
=========
TRANS (input) CHARACTER*1
Specifies the form of the system of equations:
= 'N': A * X = B (No transpose)
= 'T': A'* X = B (Transpose)
= 'C': A'* X = B (ugate transpose = Transpose)
N (input) INTEGER
The order of the matrix A. N >= 0.
NRHS (input) INTEGER
The number of right hand sides, i.e., the number of columns
of the matrix B. NRHS >= 0.
dA (input) SINGLE PRECISION array on the GPU, dimension (LDDA,N)
The factors L and U from the factorization A = P*L*U
as computed by CGETRF_GPU.
LDDA (input) INTEGER
The leading dimension of the array dA. LDDA >= max(1,N).
IPIV (input) INTEGER array on the GPU, dimension (N)
The pivot indices from CGETRF_GPU; Row i of the
matrix was moved to row IPIV(i).
dB (input) DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS)
On entry, the right hand side matrix B.
LDDB (input) INTEGER
The leading dimension of the arrays X and B. LDDB >= max(1,N).
dX (output) DOUBLE PRECISION array on the GPU, dimension (LDDX, NRHS)
On exit, the solution matrix dX.
LDDX (input) INTEGER
The leading dimension of the array dX, LDDX >= max(1,N).
dSX (workspace) SINGLE PRECISION array on the GPU used as workspace,
dimension (N, NRHS)
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
float cone = MAGMA_S_ONE;
char trans_[2] = {trans, 0};
long int notran = lapackf77_lsame(trans_, "N");
*info = 0;
if ( (! notran) &&
(! lapackf77_lsame(trans_, "T")) &&
(! lapackf77_lsame(trans_, "C")) ) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (nrhs < 0) {
*info = -3;
} else if (ldda < n) {
*info = -5;
} else if (lddb < n) {
*info = -8;
} else if (lddx < n) {
*info = -10;
} else if (lddx != lddb) { /* TODO: remove it when dslaswp will have the correct interface */
*info = -10;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (n == 0 || nrhs == 0) {
return *info;
}
if (notran) {
inc = 1;
/* Get X by row applying interchanges to B and cast to single */
/*
* TODO: clean dslaswp interface to have interface closer to zlaswp
*/
//magmablas_dslaswp(nrhs, dB, lddb, dSX, lddbx, 1, n, ipiv);
magmablas_dslaswp(nrhs, dB, lddb, dSX, n, ipiv, inc);
/* Solve L*X = B, overwriting B with SX. */
n, nrhs, cone, dA, ldda, dSX, n);
/* Solve U*X = B, overwriting B with X. */
n, nrhs, cone, dA, ldda, dSX, n);
magmablas_slag2d(n, nrhs, dSX, n, dX, lddx, info );
} else {
inc = -1;
/* Cast the DOUBLE PRECISION RHS to SINGLE PRECISION */
magmablas_dlag2s(n, nrhs, dB, lddb, dSX, n, info );
/* Solve A' * X = B. */
n, nrhs, cone, dA, ldda, dSX, n);
n, nrhs, cone, dA, ldda, dSX, n);
magmablas_dslaswp(nrhs, dX, lddx, dSX, n, ipiv, inc);
}
return *info;
/* End of MAGMA_DSGETRS */
} /* magma_dsgetrs */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_dsposv_gpu ( char  uplo,
magma_int_t  n,
magma_int_t  nrhs,
double *  dA,
magma_int_t  ldda,
double *  dB,
magma_int_t  lddb,
double *  dX,
magma_int_t  lddx,
double *  dworkd,
float *  dworks,
magma_int_t iter,
magma_int_t info 
)

Definition at line 21 of file dsposv_gpu.cpp.

References __func__, BWDMAX, ITERMAX, lapackf77_dlamch, lapackf77_dlange(), MAGMA_D_NEG_ONE, MAGMA_D_ONE, magma_dgetmatrix(), magma_dpotrf_gpu(), magma_dpotrs_gpu(), magma_dsymm(), magma_dsymv, magma_idamax(), magma_spotrf_gpu(), magma_spotrs_gpu(), magma_xerbla(), magmablas_dlacpy(), magmablas_dlag2s(), magmablas_dlat2s(), magmablas_dsaxpycp(), magmablas_slag2d(), magmablas_slansy(), MagmaLeft, MagmaUpperLower, and max.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
DSPOSV computes the solution to a real system of linear equations
A * X = B,
where A is an N-by-N symmetric positive definite matrix and X and B
are N-by-NRHS matrices.
DSPOSV first attempts to factorize the matrix in real SINGLE PRECISION
and use this factorization within an iterative refinement procedure
to produce a solution with real DOUBLE PRECISION norm-wise backward error
quality (see below). If the approach fails the method switches to a
real DOUBLE PRECISION factorization and solve.
The iterative refinement is not going to be a winning strategy if
the ratio real SINGLE PRECISION performance over DOUBLE PRECISION
performance is too small. A reasonable strategy should take the
number of right-hand sides and the size of the matrix into account.
This might be done with a call to ILAENV in the future. Up to now, we
always try iterative refinement.
The iterative refinement process is stopped if
ITER > ITERMAX
or for all the RHS we have:
RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
where
o ITER is the number of the current iteration in the iterative
refinement process
o RNRM is the infinity-norm of the residual
o XNRM is the infinity-norm of the solution
o ANRM is the infinity-operator-norm of the matrix A
o EPS is the machine epsilon returned by DLAMCH('Epsilon')
The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.
Arguments
=========
UPLO (input) CHARACTER
= 'U': Upper triangle of A is stored;
= 'L': Lower triangle of A is stored.
N (input) INTEGER
The number of linear equations, i.e., the order of the
matrix A. N >= 0.
NRHS (input) INTEGER
The number of right hand sides, i.e., the number of columns
of the matrix B. NRHS >= 0.
A (input or input/output) DOUBLE PRECISION array, dimension (LDA,N)
On entry, the symmetric matrix A. If UPLO = 'U', the leading
N-by-N upper triangular part of A contains the upper
triangular part of the matrix A, and the strictly lower
triangular part of A is not referenced. If UPLO = 'L', the
leading N-by-N lower triangular part of A contains the lower
triangular part of the matrix A, and the strictly upper
triangular part of A is not referenced.
On exit, if iterative refinement has been successfully used
(INFO.EQ.0 and ITER.GE.0, see description below), then A is
unchanged, if double factorization has been used
(INFO.EQ.0 and ITER.LT.0, see description below), then the
array A contains the factor U or L from the Cholesky
factorization A = U**T*U or A = L*L**T.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
B (input) DOUBLE PRECISION array, dimension (LDB,NRHS)
The N-by-NRHS right hand side matrix B.
LDB (input) INTEGER
The leading dimension of the array B. LDB >= max(1,N).
X (output) DOUBLE PRECISION array, dimension (LDX,NRHS)
If INFO = 0, the N-by-NRHS solution matrix X.
LDX (input) INTEGER
The leading dimension of the array X. LDX >= max(1,N).
dworkd (workspace) DOUBLE PRECISION array, dimension (N*NRHS)
This array is used to hold the residual vectors.
dworks (workspace) SINGLE PRECISION array, dimension (N*(N+NRHS))
This array is used to use the real single precision matrix
and the right-hand sides or solutions in single precision.
ITER (output) INTEGER
< 0: iterative refinement has failed, double precision
factorization has been performed
-1 : the routine fell back to full precision for
implementation- or machine-specific reasons
-2 : narrowing the precision induced an overflow,
the routine fell back to full precision
-3 : failure of SPOTRF
-31: stop the iterative refinement after the 30th
iterations
> 0: iterative refinement has been successfully used.
Returns the number of iterations
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
> 0: if INFO = i, the leading minor of order i of (DOUBLE
PRECISION) A is not positive definite, so the
factorization could not be completed, and the solution
has not been computed.
===================================================================== */
double c_neg_one = MAGMA_D_NEG_ONE;
double c_one = MAGMA_D_ONE;
magma_int_t ione = 1;
float *dSA, *dSX;
double Xnrmv, Rnrmv;
double Xnrm, Rnrm, Anrm, cte, eps;
magma_int_t i, j, iiter;
*iter = 0 ;
*info = 0 ;
if ( n <0)
*info = -1;
else if( nrhs<0 )
*info =-2;
else if( ldda < max(1,n) )
*info =-4;
else if( lddb < max(1,n) )
*info =-7;
else if( lddx < max(1,n) )
*info =-9;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
if( n == 0 || nrhs == 0 )
return *info;
eps = lapackf77_dlamch("Epsilon");
//ANRM = magmablas_dlansy( 'I', uplo , N ,A, LDA, (double*)dworkd);
//cte = ANRM * EPS * pow((double)N,0.5) * BWDMAX ;
dSX = dworks;
dSA = dworks + n*nrhs;
magmablas_dlag2s(n, nrhs, dB, lddb, dSX, n, info );
if( *info !=0 ){
*iter = -2;
goto L40;
}
magmablas_dlat2s(uplo, n, dA, ldda, dSA, n, info );
if( *info !=0 ){
*iter = -2 ;
goto L40;
}
Anrm = magmablas_slansy( 'I', uplo, n, dSA, n, (float *)dworkd);
cte = Anrm * eps * pow((double)n,0.5) * BWDMAX ;
magma_spotrf_gpu(uplo, n, dSA, ldda, info);
if( *info !=0 ){
*iter = -3 ;
goto L40;
}
magma_spotrs_gpu(uplo, n, nrhs, dSA, ldda, dSX, lddb, info);
magmablas_slag2d(n, nrhs, dSX, n, dX, lddx, info);
magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dworkd, n);
if( nrhs == 1 )
magma_dsymv(uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dworkd, 1);
else
magma_dsymm(MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dworkd, n);
for(i=0; i<nrhs; i++){
j = magma_idamax( n, dX+i*n, 1) ;
magma_dgetmatrix( 1, 1, dX+i*n+j-1, 1, &Xnrmv, 1 );
Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );
j = magma_idamax ( n, dworkd+i*n, 1 );
magma_dgetmatrix( 1, 1, dworkd+i*n+j-1, 1, &Rnrmv, 1 );
Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );
if( Rnrm > Xnrm *cte ){
goto L10;
}
}
*iter =0;
return *info;
L10:
;
for(iiter=1;iiter<ITERMAX;){
*info = 0 ;
magmablas_dlag2s(n, nrhs, dworkd, n, dSX, n, info );
if(*info !=0){
*iter = -2 ;
goto L40;
}
magma_spotrs_gpu(uplo, n, nrhs, dSA, ldda, dSX, lddb, info);
for(i=0;i<nrhs;i++){
magmablas_dsaxpycp(dworks+i*n, dX+i*n, n, dB+i*n,dworkd+i*n) ;
}
if( nrhs == 1 )
magma_dsymv(uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dworkd, 1);
else
magma_dsymm(MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dworkd, n);
for(i=0; i<nrhs; i++){
j = magma_idamax( n, dX+i*n, 1) ;
magma_dgetmatrix( 1, 1, dX+i*n+j-1, 1, &Xnrmv, 1 );
Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );
j = magma_idamax ( n, dworkd+i*n, 1 );
magma_dgetmatrix( 1, 1, dworkd+i*n+j-1, 1, &Rnrmv, 1 );
Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );
if( Rnrm > Xnrm *cte ){
goto L20;
}
}
*iter = iiter;
return *info;
L20:
iiter++ ;
}
*iter = -ITERMAX - 1;
L40:
magma_dpotrf_gpu( uplo, n, dA, ldda, info );
if( *info == 0 ){
magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx );
magma_dpotrs_gpu( uplo, n, nrhs, dA, ldda, dX, lddx, info );
}
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function: