MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zhegvd_m.cpp File Reference
#include "common_magma.h"
Include dependency graph for zhegvd_m.cpp:

Go to the source code of this file.

Macros

#define ENABLE_TIMER

Functions

magma_int_t magma_zpotrf2_ooc (magma_int_t num_gpus, char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, magma_int_t *info)
magma_int_t magma_zhegst_m (magma_int_t nrgpu, magma_int_t itype, char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *b, magma_int_t ldb, magma_int_t *info)
magma_int_t magma_zheevd_m (magma_int_t nrgpu, char jobz, char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, double *w, cuDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
magma_int_t magma_ztrsm_m (magma_int_t nrgpu, char side, char uplo, char transa, char diag, magma_int_t m, magma_int_t n, cuDoubleComplex alpha, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *b, magma_int_t ldb)
magma_int_t magma_zhegvd_m (magma_int_t nrgpu, magma_int_t itype, char jobz, char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *b, magma_int_t ldb, double *w, cuDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)

Macro Definition Documentation

#define ENABLE_TIMER

Function Documentation

magma_int_t magma_zheevd_m ( magma_int_t  nrgpu,
char  jobz,
char  uplo,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
double *  w,
cuDoubleComplex *  work,
magma_int_t  lwork,
double *  rwork,
magma_int_t  lrwork,
magma_int_t iwork,
magma_int_t  liwork,
magma_int_t info 
)

Definition at line 53 of file zheevd_m.cpp.

References __func__, blasf77_dscal(), dwork, get_current_time(), GetTimerValue(), lapackf77_dlamch, lapackf77_dsterf, lapackf77_lsame, lapackf77_zlacpy, lapackf77_zlanhe, lapackf77_zlascl, magma_dmalloc(), magma_dsqrt, MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_zhetrd_nb(), MAGMA_SUCCESS, magma_xerbla(), MAGMA_Z_MAKE, MAGMA_Z_ONE, MAGMA_Z_REAL, MAGMA_Z_SET2REAL, magma_zhetrd_mgpu(), magma_zstedx(), magma_zstedx_m(), magma_zunmtr_m(), MagmaLeft, MagmaLowerStr, MagmaNoTrans, MagmaNoVectorsStr, MagmaUpperStr, MagmaVectorsStr, max, and uplo.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZHEEVD computes all eigenvalues and, optionally, eigenvectors of a
complex Hermitian matrix A. If eigenvectors are desired, it uses a
divide and conquer algorithm.
The divide and conquer algorithm makes very mild assumptions about
floating point arithmetic. It will work on machines with a guard
digit in add/subtract, or on those binary machines without guard
digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
Cray-2. It could conceivably fail on hexadecimal or decimal machines
without guard digits, but we know of none.
Arguments
=========
JOBZ (input) CHARACTER*1
= 'N': Compute eigenvalues only;
= 'V': Compute eigenvalues and eigenvectors.
UPLO (input) CHARACTER*1
= 'U': Upper triangle of A is stored;
= 'L': Lower triangle of A is stored.
N (input) INTEGER
The order of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA, N)
On entry, the Hermitian matrix A. If UPLO = 'U', the
leading N-by-N upper triangular part of A contains the
upper triangular part of the matrix A. If UPLO = 'L',
the leading N-by-N lower triangular part of A contains
the lower triangular part of the matrix A.
On exit, if JOBZ = 'V', then if INFO = 0, A contains the
orthonormal eigenvectors of the matrix A.
If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
or the upper triangle (if UPLO='U') of A, including the
diagonal, is destroyed.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
W (output) DOUBLE PRECISION array, dimension (N)
If INFO = 0, the eigenvalues in ascending order.
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The length of the array WORK.
If N <= 1, LWORK must be at least 1.
If JOBZ = 'N' and N > 1, LWORK must be at least N * (NB + 1).
If JOBZ = 'V' and N > 1, LWORK must be at least 2*N + N**2.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal sizes of the WORK, RWORK and
IWORK arrays, returns these values as the first entries of
the WORK, RWORK and IWORK arrays, and no error message
related to LWORK or LRWORK or LIWORK is issued by XERBLA.
RWORK (workspace/output) DOUBLE PRECISION array,
dimension (LRWORK)
On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
LRWORK (input) INTEGER
The dimension of the array RWORK.
If N <= 1, LRWORK must be at least 1.
If JOBZ = 'N' and N > 1, LRWORK must be at least N.
If JOBZ = 'V' and N > 1, LRWORK must be at least
1 + 5*N + 2*N**2.
If LRWORK = -1, then a workspace query is assumed; the
routine only calculates the optimal sizes of the WORK, RWORK
and IWORK arrays, returns these values as the first entries
of the WORK, RWORK and IWORK arrays, and no error message
related to LWORK or LRWORK or LIWORK is issued by XERBLA.
IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
LIWORK (input) INTEGER
The dimension of the array IWORK.
If N <= 1, LIWORK must be at least 1.
If JOBZ = 'N' and N > 1, LIWORK must be at least 1.
If JOBZ = 'V' and N > 1, LIWORK must be at least 3 + 5*N.
If LIWORK = -1, then a workspace query is assumed; the
routine only calculates the optimal sizes of the WORK, RWORK
and IWORK arrays, returns these values as the first entries
of the WORK, RWORK and IWORK arrays, and no error message
related to LWORK or LRWORK or LIWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
> 0: if INFO = i and JOBZ = 'N', then the algorithm failed
to converge; i off-diagonal elements of an intermediate
tridiagonal form did not converge to zero;
if INFO = i and JOBZ = 'V', then the algorithm failed
to compute an eigenvalue while working on the submatrix
lying in rows and columns INFO/(N+1) through
mod(INFO,N+1).
Further Details
===============
Based on contributions by
Jeff Rutter, Computer Science Division, University of California
at Berkeley, USA
Modified description of INFO. Sven, 16 Feb 05.
===================================================================== */
char uplo_[2] = {uplo, 0};
char jobz_[2] = {jobz, 0};
magma_int_t c__1 = 1;
magma_int_t c_n1 = -1;
magma_int_t c__0 = 0;
double c_b18 = 1.;
double d__1;
double eps;
double anrm;
double rmin, rmax;
double sigma;
magma_int_t iinfo, lwmin;
magma_int_t lower;
magma_int_t llrwk;
magma_int_t wantz;
magma_int_t indwk2, llwrk2;
magma_int_t iscale;
double safmin;
double bignum;
magma_int_t indtau;
magma_int_t indrwk, indwrk, liwmin;
magma_int_t lrwmin, llwork;
double smlnum;
magma_int_t lquery;
double* dwork;
lower = lapackf77_lsame(uplo_, MagmaLowerStr);
lquery = lwork == -1 || lrwork == -1 || liwork == -1;
*info = 0;
if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVectorsStr))) {
*info = -1;
} else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) {
*info = -2;
} else if (n < 0) {
*info = -3;
} else if (lda < max(1,n)) {
*info = -5;
}
if (wantz) {
lwmin = 2 * n + n * n;
lrwmin = 1 + 5 * n + 2 * n * n;
liwmin = 5 * n + 3;
} else {
lwmin = n * (nb + 1);
lrwmin = n;
liwmin = 1;
}
MAGMA_Z_SET2REAL(work[0],(double)lwmin);
rwork[0] = lrwmin;
iwork[0] = liwmin;
if ((lwork < lwmin) && !lquery) {
*info = -8;
} else if ((lrwork < lrwmin) && ! lquery) {
*info = -10;
} else if ((liwork < liwmin) && ! lquery) {
*info = -12;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (n == 0) {
return *info;
}
if (n == 1) {
w[0] = MAGMA_Z_REAL(a[0]);
if (wantz) {
a[0] = MAGMA_Z_ONE;
}
return *info;
}
/* Get machine constants. */
safmin = lapackf77_dlamch("Safe minimum");
eps = lapackf77_dlamch("Precision");
smlnum = safmin / eps;
bignum = 1. / smlnum;
rmin = magma_dsqrt(smlnum);
rmax = magma_dsqrt(bignum);
/* Scale matrix to allowable range, if necessary. */
anrm = lapackf77_zlanhe("M", uplo_, &n, a, &lda, rwork);
iscale = 0;
if (anrm > 0. && anrm < rmin) {
iscale = 1;
sigma = rmin / anrm;
} else if (anrm > rmax) {
iscale = 1;
sigma = rmax / anrm;
}
if (iscale == 1) {
lapackf77_zlascl(uplo_, &c__0, &c__0, &c_b18, &sigma, &n, &n, a,
&lda, info);
}
/* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */
inde = 0;
indtau = 0;
indwrk = indtau + n;
indrwk = inde + n;
indwk2 = indwrk + n * n;
llwork = lwork - indwrk;
llwrk2 = lwork - indwk2;
llrwk = lrwork - indrwk;
#define ENABLE_TIMER
#ifdef ENABLE_TIMER
magma_timestr_t start, end;
start = get_current_time();
#endif
//magma_zhetrd(uplo_[0], n, a, lda, w, &rwork[inde],
// &work[indtau], &work[indwrk], llwork, &iinfo);
magma_zhetrd_mgpu(nrgpu, 4, uplo_[0], n, a, lda, w, &rwork[inde],
&work[indtau], &work[indwrk], llwork, &iinfo);
#ifdef ENABLE_TIMER
printf("time zhetrd = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif
/* For eigenvalues only, call DSTERF. For eigenvectors, first call
ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
tridiagonal matrix, then call ZUNMTR to multiply it to the Householder
transformations represented as Householder vectors in A. */
if (! wantz) {
lapackf77_dsterf(&n, w, &rwork[inde], info);
} else {
//lapackf77_zstedc("I", &n, w, &rwork[inde], &work[indwrk], &n, &work[indwk2],
// &llwrk2, &rwork[indrwk], &llrwk, iwork, &liwork, info);
#ifdef ENABLE_TIMER
start = get_current_time();
#endif
#ifdef USE_SINGLE_GPU
if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) {
return *info;
}
/* double vl = 0;
double vu = 0;
magma_int_t il = 0;
magma_int_t iu = 0;
char range_[2] = {'A', 0};
magma_zstedx_(range_, &n, &vl, &vu, &il, &iu, w, &rwork[inde],
&work[indwrk], &n, &rwork[indrwk],
&llrwk, iwork, &liwork, dwork, info);
*/
magma_zstedx('A', n, 0, 0, 0, 0, w, &rwork[inde],
&work[indwrk], n, &rwork[indrwk],
llrwk, iwork, liwork, dwork, info);
magma_free( dwork );
#else
magma_zstedx_m(nrgpu, 'A', n, 0, 0, 0, 0, w, &rwork[inde],
&work[indwrk], n, &rwork[indrwk],
llrwk, iwork, liwork, info);
#endif
#ifdef ENABLE_TIMER
printf("time zstedc = %6.2f\n", GetTimerValue(start,end)/1000.);
start = get_current_time();
#endif
//magma_zunmtr(MagmaLeft, uplo, MagmaNoTrans, n, n, a, lda, &work[indtau],
magma_zunmtr_m(nrgpu, MagmaLeft, uplo, MagmaNoTrans, n, n, a, lda, &work[indtau],
&work[indwrk], n, &work[indwk2], llwrk2, &iinfo);
lapackf77_zlacpy("A", &n, &n, &work[indwrk], &n, a, &lda);
#ifdef ENABLE_TIMER
printf("time zunmtr + copy = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif
}
/* If matrix was scaled, then rescale eigenvalues appropriately. */
if (iscale == 1) {
if (*info == 0) {
imax = n;
} else {
imax = *info - 1;
}
d__1 = 1. / sigma;
blasf77_dscal(&imax, &d__1, w, &c__1);
}
work[0] = MAGMA_Z_MAKE((double) lwmin, 0.);
rwork[0] = (double) lrwmin;
iwork[0] = liwmin;
return *info;
} /* magma_zheevd_m */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_zhegst_m ( magma_int_t  nrgpu,
magma_int_t  itype,
char  uplo,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
cuDoubleComplex *  b,
magma_int_t  ldb,
magma_int_t info 
)

Definition at line 28 of file zhegst_m.cpp.

References __func__, A, B, dA, dB_c, dB_r, lapackf77_lsame, lapackf77_zhegs2, MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_zhegst_m_nb(), magma_getdevice(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_setdevice(), MAGMA_SUCCESS, magma_xerbla(), MAGMA_Z_HALF, MAGMA_Z_NEG_HALF, MAGMA_Z_NEG_ONE, MAGMA_Z_ONE, magma_zgemm(), magma_zgetmatrix(), magma_zgetmatrix_async(), magma_zhemm(), magma_zher2k(), magma_zmalloc(), magma_zsetmatrix_async(), magma_ztrsm(), magmablasSetKernelStream(), MagmaConjTrans, MagmaLeft, MagmaLower, MagmaNonUnit, MagmaNoTrans, MagmaRight, MagmaUpper, max, min, N_MAX_GPU, and uplo.

{
/*
-- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZHEGST reduces a complex Hermitian-definite generalized
eigenproblem to standard form.
If ITYPE = 1, the problem is A*x = lambda*B*x,
and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
B must have been previously factorized as U**H*U or L*L**H by ZPOTRF.
Arguments
=========
ITYPE (input) INTEGER
= 1: compute inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H);
= 2 or 3: compute U*A*U**H or L**H*A*L.
UPLO (input) CHARACTER*1
= 'U': Upper triangle of A is stored and B is factored as
U**H*U;
= 'L': Lower triangle of A is stored and B is factored as
L*L**H.
N (input) INTEGER
The order of the matrices A and B. N >= 0.
A (input/output) COMPLEX*16 array, dimension (LDA,N)
On entry, the Hermitian matrix A. If UPLO = 'U', the leading
N-by-N upper triangular part of A contains the upper
triangular part of the matrix A, and the strictly lower
triangular part of A is not referenced. If UPLO = 'L', the
leading N-by-N lower triangular part of A contains the lower
triangular part of the matrix A, and the strictly upper
triangular part of A is not referenced.
On exit, if INFO = 0, the transformed matrix, stored in the
same format as A.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
B (input) COMPLEX*16 array, dimension (LDB,N)
The triangular factor from the Cholesky factorization of B,
as returned by ZPOTRF.
LDB (input) INTEGER
The leading dimension of the array B. LDB >= max(1,N).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
=====================================================================*/
char uplo_[2] = {uplo, 0};
magma_int_t k, kb, j, jb, kb2;
magma_int_t ldda, dima, lddbr, lddbc;
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
cuDoubleComplex c_half = MAGMA_Z_HALF;
cuDoubleComplex c_neg_half = MAGMA_Z_NEG_HALF;
cuDoubleComplex* dw[N_MAX_GPU];
cudaStream_t stream [N_MAX_GPU][3];
magma_int_t igpu = 0;
int gpu_b;
magma_getdevice(&gpu_b);
double d_one = 1.0;
long int upper = lapackf77_lsame(uplo_, "U");
/* Test the input parameters. */
*info = 0;
if (itype<1 || itype>3){
*info = -1;
}else if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) {
*info = -2;
} else if (n < 0) {
*info = -3;
} else if (lda < max(1,n)) {
*info = -5;
}else if (ldb < max(1,n)) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return */
if ( n == 0 )
return *info;
magma_int_t nbl = (n-1)/nb+1; // number of blocks
if ( (itype==1 && upper) || (itype!=1 && !upper) ){
ldda = ((nbl-1)/nrgpu+1)*nb;
dima = n;
} else {
ldda = n;
dima = ((nbl-1)/nrgpu+1)*nb;
}
lddbr = 2 * nb;
lddbc = n;
for (igpu = 0; igpu < nrgpu; ++igpu){
if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (dima*ldda + lddbc*lddbr) )) {
return *info;
}
magma_queue_create( &stream[igpu][0] );
magma_queue_create( &stream[igpu][1] );
magma_queue_create( &stream[igpu][2] );
}
/* Use hybrid blocked code */
if (itype==1) {
if (upper) {
/* Compute inv(U')*A*inv(U) */
//copy A to mgpus
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
A(k, k), lda,
dA(igpu, k/nrgpu, k), ldda, stream[igpu][0] );
}
kb= min(n,nb);
igpu = 0;
// dB_r(0,0) is used to store B(k,k)
B(0, 0), ldb,
dB_r(igpu, 0, 0), lddbr, stream[igpu][1] );
for(k = 0; k<nbl; ++k){
kb= min(n-k*nb,nb);
kb2= min(n-(k+1)*nb,nb);
if(k+1<nbl){
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_zsetmatrix_async( kb, n-(k+1)*nb,
B(k, k+1), ldb,
dB_r(igpu, 0, k+1), lddbr, stream[igpu][0] );
}
}
igpu = k%nrgpu;
magma_queue_sync( stream[igpu][1] ); // Needed, otherwise conflicts reading B(k,k) between hegs2 and cudaMemcpy2D
magma_queue_sync( stream[igpu][2] );
if(k+1<nbl){
magmablasSetKernelStream(stream[igpu][1]);
// dB_r(0,0) stores B(k,k)
kb, n-(k+1)*nb,
c_one, dB_r(igpu, 0, 0), lddbr,
dA(igpu, k/nrgpu, k+1), ldda);
}
lapackf77_zhegs2( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info);
printf("hegs2%d\n", k);
if (k+1<nbl) {
A(k, k), lda,
dA(igpu, k/nrgpu, k), ldda, stream[igpu][0] );
magma_queue_sync( stream[igpu][1] );
magmablasSetKernelStream(stream[igpu][0]);
kb, n-(k+1)*nb,
c_neg_half, dA(igpu, k/nrgpu, k), ldda,
dB_r(igpu, 0, k+1), lddbr,
c_one, dA(igpu, k/nrgpu, k+1), ldda);
magma_queue_sync( stream[igpu][0] );
magma_zgetmatrix( kb, n-(k+1)*nb,
dA(igpu, k/nrgpu, k+1), ldda,
A(k, k+1), lda );
// send the partially updated panel of dA to each gpu in the second dB block
// to overlap hemm computation
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( kb, n-(k+1)*nb,
A(k, k+1), lda,
dB_r(igpu, 1, k+1), lddbr, stream[igpu][0] );
}
igpu = k%nrgpu;
magmablasSetKernelStream(stream[igpu][1]);
kb, n-(k+1)*nb,
c_neg_half, dA(igpu, k/nrgpu, k), ldda,
dB_r(igpu, 0, k+1), lddbr,
c_one, dA(igpu, k/nrgpu, k+1), ldda);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
}
for (j = k+1; j < nbl; ++j){
jb = min(nb, n-j*nb);
igpu = j%nrgpu;
magmablasSetKernelStream(stream[igpu][(j/nrgpu)%3]);
jb, nb,
c_neg_one, dB_r(igpu, 1, j), lddbr,
dB_r(igpu, 0, j), lddbr,
d_one, dA(igpu, j/nrgpu, j), ldda);
magma_queue_sync( stream[igpu][((j)/nrgpu)%3] ); // Needed for correctness. Why?
if (j == k+1){
magma_queue_sync( stream[igpu][(j/nrgpu)%3] );
dA(igpu, (k+1)/nrgpu, k+1), ldda,
A(k+1, k+1), lda, stream[igpu][2] );
// dB_r(0,0) is used to store B(k,k)
B(k+1, k+1), ldb,
dB_r(igpu, 0, 0), lddbr, stream[igpu][1] );
}
}
for (j = k+1; j < nbl-1; ++j){
igpu = j%nrgpu;
magmablasSetKernelStream(stream[igpu][0]);
magma_zgemm(MagmaConjTrans, MagmaNoTrans, nb, n-(j+1)*nb, nb, c_neg_one, dB_r(igpu, 0, j), lddbr,
dB_r(igpu, 1, j+1), lddbr, c_one, dA(igpu, j/nrgpu, j+1), ldda );
magma_zgemm(MagmaConjTrans, MagmaNoTrans, nb, n-(j+1)*nb, nb, c_neg_one, dB_r(igpu, 1, j), lddbr,
dB_r(igpu, 0, j+1), lddbr, c_one, dA(igpu, j/nrgpu, j+1), ldda );
}
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_queue_sync( stream[igpu][1] );
}
if (n > nb){
jb = min(nb, n-nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
nloc[igpu]=0;
B(1, 1), ldb,
dB_r(igpu, 1, 1), lddbr, stream[igpu][1] );
}
for (j = 1; j < nbl; ++j){
if ((j+1)*nb < n){
jb = min(nb, n-(j+1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( jb, n-(j+1)*nb,
B(j+1, j+1), ldb,
dB_r(igpu, (j+1)%2, j+1), lddbr, stream[igpu][(j+1)%2] );
}
}
jb = min(nb, n-j*nb);
nloc[(j-1)%nrgpu] += nb;
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_ztrsm(MagmaRight, uplo, MagmaNoTrans, MagmaNonUnit, nloc[igpu], jb, c_one, dB_r(igpu, j%2, j), lddbr,
dA(igpu, 0, j), ldda );
}
if ( j < nbl-1 ){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_zgemm(MagmaNoTrans, MagmaNoTrans, nloc[igpu], n-(j+1)*nb, nb, c_neg_one, dA(igpu, 0, j), ldda,
dB_r(igpu, j%2, j+1), lddbr, c_one, dA(igpu, 0, j+1), ldda );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][j%2] );
}
for (k = 0; k < j; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
dA(igpu, k/nrgpu, j), ldda,
A(k, j), lda, stream[igpu][2] );
}
}
}
} else {
/* Compute inv(L)*A*inv(L') */
//copy A to mgpus
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
magma_zsetmatrix_async( (n-k*nb), kb,
A(k, k), lda,
dA(igpu, k, k/nrgpu), ldda, stream[igpu][0] );
}
kb= min(n,nb);
igpu = 0;
// dB_c(0,0) is used to store B(k,k)
B(0, 0), ldb,
dB_c(igpu, 0, 0), lddbc, stream[igpu][1] );
for(k = 0; k<nbl; ++k){
kb= min(n-k*nb,nb);
kb2= min(n-(k+1)*nb,nb);
if(k+1<nbl){
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_zsetmatrix_async( (n-(k+1)*nb), kb,
B(k+1, k), ldb,
dB_c(igpu, k+1, 0), lddbc, stream[igpu][0] );
}
}
igpu = k%nrgpu;
magma_queue_sync( stream[igpu][1] ); // Needed, otherwise conflicts reading B(k,k) between hegs2 and cudaMemcpy2D
magma_queue_sync( stream[igpu][2] );
if(k+1<nbl){
magmablasSetKernelStream(stream[igpu][1]);
// dB_c(0,0) stores B(k,k)
n-(k+1)*nb, kb,
c_one, dB_c(igpu, 0, 0), lddbc,
dA(igpu, k+1, k/nrgpu), ldda);
}
lapackf77_zhegs2( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info);
if (k+1<nbl) {
A(k, k), lda,
dA(igpu, k , k/nrgpu), ldda, stream[igpu][0] );
magma_queue_sync( stream[igpu][1] );
magmablasSetKernelStream(stream[igpu][0]);
n-(k+1)*nb, kb,
c_neg_half, dA(igpu, k, k/nrgpu), ldda,
dB_c(igpu, k+1, 0), lddbc,
c_one, dA(igpu, k+1, k/nrgpu), ldda);
magma_queue_sync( stream[igpu][0] );
magma_zgetmatrix( n-(k+1)*nb, kb,
dA(igpu, k+1, k/nrgpu), ldda,
A(k+1, k), lda );
// send the partially updated panel of dA to each gpu in the second dB block
// to overlap hemm computation
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( (n-(k+1)*nb), kb,
A(k+1, k), lda,
dB_c(igpu, k+1, 1), lddbc, stream[igpu][0] );
}
igpu = k%nrgpu;
magmablasSetKernelStream(stream[igpu][1]);
n-(k+1)*nb, kb,
c_neg_half, dA(igpu, k, k/nrgpu), ldda,
dB_c(igpu, k+1, 0), lddbc,
c_one, dA(igpu, k+1, k/nrgpu), ldda);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
}
for (j = k+1; j < nbl; ++j){
jb = min(nb, n-j*nb);
igpu = j%nrgpu;
magmablasSetKernelStream(stream[igpu][(j/nrgpu)%3]);
jb, nb,
c_neg_one, dB_c(igpu, j, 1), lddbc,
dB_c(igpu, j, 0), lddbc,
d_one, dA(igpu, j, j/nrgpu), ldda);
magma_queue_sync( stream[igpu][((j)/nrgpu)%3] ); // Needed for correctness. Why?
if (j == k+1){
magma_queue_sync( stream[igpu][(j/nrgpu)%3] );
dA(igpu, k+1, (k+1)/nrgpu), ldda,
A(k+1, k+1), lda, stream[igpu][2] );
// dB_c(0,0) is used to store B(k,k)
B(k+1, k+1), ldb,
dB_c(igpu, 0, 0), lddbc, stream[igpu][1] );
}
}
for (j = k+1; j < nbl-1; ++j){
igpu = j%nrgpu;
magmablasSetKernelStream(stream[igpu][0]);
magma_zgemm(MagmaNoTrans, MagmaConjTrans, n-(j+1)*nb, nb, nb, c_neg_one, dB_c(igpu, j+1, 1), lddbc,
dB_c(igpu, j, 0), lddbc, c_one, dA(igpu, j+1, j/nrgpu), ldda );
magma_zgemm(MagmaNoTrans, MagmaConjTrans, n-(j+1)*nb, nb, nb, c_neg_one, dB_c(igpu, j+1, 0), lddbc,
dB_c(igpu, j, 1), lddbc, c_one, dA(igpu, j+1, j/nrgpu), ldda );
}
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_queue_sync( stream[igpu][1] );
}
if (n > nb){
jb = min(nb, n-nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
nloc[igpu]=0;
B(1, 1), ldb,
dB_c(igpu, 1, 1), lddbc, stream[igpu][1] );
}
for (j = 1; j < nbl; ++j){
if ((j+1)*nb < n){
jb = min(nb, n-(j+1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( (n-(j+1)*nb), jb,
B(j+1, j+1), ldb,
dB_c(igpu, j+1, (j+1)%2), lddbc, stream[igpu][(j+1)%2] );
}
}
jb = min(nb, n-j*nb);
nloc[(j-1)%nrgpu] += nb;
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_ztrsm(MagmaLeft, uplo, MagmaNoTrans, MagmaNonUnit, jb, nloc[igpu], c_one, dB_c(igpu, j, j%2), lddbc,
dA(igpu, j, 0), ldda );
}
if ( j < nbl-1 ){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_zgemm(MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, nloc[igpu], nb, c_neg_one, dB_c(igpu, j+1, j%2), lddbc,
dA(igpu, j, 0), ldda, c_one, dA(igpu, j+1, 0), ldda );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][j%2] );
}
for (k = 0; k < j; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
dA(igpu, j, k/nrgpu), ldda,
A(j, k), lda, stream[igpu][2] );
}
}
}
}
} else {
if (upper) {
printf("zhegst_m: type2 upper not implemented\n");
exit(-1);
/* Compute U*A*U' */
/* for(k = 0; k<n; k+=nb){
kb= min(n-k,nb);
magma_zgetmatrix_async( kb, kb,
dA(k, k), ldda,
A(k, k), lda, stream[0] );
// Update the upper triangle of A(1:k+kb-1,1:k+kb-1)
if(k>0){
magma_ztrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
k, kb,
c_one ,dB(0,0), lddb,
dA(0,k), ldda);
magma_zhemm(MagmaRight, MagmaUpper,
k, kb,
c_half, dA(k,k), ldda,
dB(0,k), lddb,
c_one, dA(0, k), ldda);
magma_queue_sync( stream[1] );
magma_zher2k(MagmaUpper, MagmaNoTrans,
k, kb,
c_one, dA(0,k), ldda,
dB(0,k), lddb,
d_one, dA(0,0), ldda);
magma_zhemm(MagmaRight, MagmaUpper,
k, kb,
c_half, dA(k,k), ldda,
dB(0,k), lddb,
c_one, dA(0, k), ldda);
magma_ztrmm(MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
k, kb,
c_one, dB(k,k), lddb,
dA(0,k), ldda);
}
magma_queue_sync( stream[0] );
lapackf77_zhegs2( &itype, uplo_, &kb, A(k, k), &lda, B(k, k), &ldb, info);
magma_zsetmatrix_async( kb, kb,
A(k, k), lda,
dA(k, k), ldda, stream[1] );
}
magma_queue_sync( stream[1] );
*/
} else {
/* Compute L'*A*L */
printf("zhegst_m: type2 lower not implemented\n");
exit(-1);
/* if (n > nb){
magma_int_t nloc[N_MAX_GPU];
for(igpu = 0; igpu < nrgpu; ++igpu)
nloc[igpu] = 0;
kb = min(nb, n);
for (j = 0; j < nbl; ++j){
igpu = j%nrgpu;
magma_setdevice(igpu);
jb = min(nb, n-j*nb);
nloc[igpu] += jb;
magma_zsetmatrix_async( jb, kb,
A(j, 0), lda,
dA(igpu, j/nrgpu, 0), ldda, stream[igpu][0] );
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_setdevice(igpu);
magma_zsetmatrix_async( kb, kb,
B(0, 0), ldb,
dB_r(igpu, 0, 0), lddbr, stream[igpu][0] );
}
for (k = 0; k < nbl-1; ++k){
nloc[k%nrgpu] -= nb;
if (k < nbl-2){
kb = min(nb, n-(k+1)*nb);
for (j = k; j < nbl; ++j){
igpu = j%nrgpu;
magma_setdevice(igpu);
jb = min(nb, n-j*nb);
magma_zsetmatrix_async( jb, kb,
A(j, k+1), lda,
dA(igpu, j/nrgpu, k+1), ldda, stream[igpu][(k+1)%2] );
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_setdevice(igpu);
magma_zsetmatrix_async( kb, (k+1)*nb + kb,
B(k+1, 0), ldb,
dB_r(igpu, (k+1)%2, 0), lddbr, stream[igpu][(k+1)%2] );
}
}
kb = min(nb, n-k*nb);
if (k > 0){
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_setdevice(igpu);
magmablasSetKernelStream(stream[igpu][k%2]);
magma_zgemm(MagmaNoTrans, MagmaNoTrans, nloc[igpu], k*nb, kb, c_one, dA(igpu, n-nloc[igpu], k), ldda,
dB_r(igpu, k%2, 0), lddbr, c_one, dA(igpu, n-nloc[igpu], 0), ldda );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_setdevice(igpu);
magmablasSetKernelStream(stream[igpu][k%2]);
magma_ztrmm(MagmaRight, uplo, MagmaNoTrans, MagmaNonUnit, nloc[igpu], kb, c_one, dB_r(igpu, k%2, k), lddbr,
dA(igpu, n-nloc[igpu], k), ldda );
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][k%2] );
}
}
}
// put for loop!
// put copies!
magmablasSetKernelStream(stream[igpu][0]);
magma_zhemm(MagmaRight, MagmaLower,
kb, k*nb,
c_half, dA(igpu, k/nrgpu, 0), ldda,
dB_r(igpu, 0, 0), lddbr,
c_one, dA(igpu, k/nrgpu, 0), ldda);
magma_queue_sync( stream[igpu][0] );
magma_zgetmatrix( kb, k*nb,
dA(igpu, k/nrgpu, 0), ldda,
A(k, 0), lda );
// send the partially updated panel of dA to each gpu in the second dB block
// to overlap hemm computation
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_setdevice(igpu);
magma_zsetmatrix_async( kb, // ERROR: missing dimension,
A(k, 0), lda,
dB_r(igpu, 1, 0), lddbr, stream[igpu][0] );
}
igpu = k%nrgpu;
magma_setdevice(igpu);
magmablasSetKernelStream(stream[igpu][1]);
magma_zhemm(MagmaRight, MagmaLower,
n-(k+1)*nb, kb,
c_neg_half, dA(igpu, k, k/nrgpu), ldda,
dB_c(igpu, k+1, 0), lddbc,
c_one, dA(igpu, k+1, k/nrgpu), ldda);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
}
//copy B from mgpus
for (j = 0; j < nbl; ++j){
igpu = j%nrgpu;
magma_setdevice(igpu);
jb = min(nb, n-j*nb);
magma_zgetmatrix_async( jb, n,
dA(igpu, j/nrgpu, 0), ldda,
A(j, 0), lda, stream[igpu][0] );
}
*//* for(k = 0; k<n; k+=nb){
kb= min(n-k,nb);
magma_zgetmatrix_async( kb, kb,
dA(k, k), ldda,
A(k, k), lda, stream[0] );
// Update the lower triangle of A(1:k+kb-1,1:k+kb-1)
if(k>0){
magma_ztrmm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit,
kb, k,
c_one ,dB(0,0), lddb,
dA(k,0), ldda);
magma_zhemm(MagmaLeft, MagmaLower,
kb, k,
c_half, dA(k,k), ldda,
dB(k,0), lddb,
c_one, dA(k, 0), ldda);
magma_queue_sync( stream[1] );
magma_zher2k(MagmaLower, MagmaConjTrans,
k, kb,
c_one, dA(k,0), ldda,
dB(k,0), lddb,
d_one, dA(0,0), ldda);
magma_zhemm(MagmaLeft, MagmaLower,
kb, k,
c_half, dA(k,k), ldda,
dB(k,0), lddb,
c_one, dA(k, 0), ldda);
magma_ztrmm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit,
kb, k,
c_one, dB(k,k), lddb,
dA(k,0), ldda);
}
magma_queue_sync( stream[0] );
lapackf77_zhegs2( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info);
magma_zsetmatrix_async( kb, kb,
A(k, k), lda,
dA(k, k), ldda, stream[1] );
}
magma_queue_sync( stream[1] );
*/
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][2] );
magma_queue_destroy( stream[igpu][0] );
magma_queue_destroy( stream[igpu][1] );
magma_queue_destroy( stream[igpu][2] );
magma_free( dw[igpu] );
}
return *info;
} /* magma_zhegst_gpu */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_zhegvd_m ( magma_int_t  nrgpu,
magma_int_t  itype,
char  jobz,
char  uplo,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
cuDoubleComplex *  b,
magma_int_t  ldb,
double *  w,
cuDoubleComplex *  work,
magma_int_t  lwork,
double *  rwork,
magma_int_t  lrwork,
magma_int_t iwork,
magma_int_t  liwork,
magma_int_t info 
)

Definition at line 41 of file zhegvd_m.cpp.

References __func__, get_current_time(), GetTimerValue(), lapackf77_lsame, magma_get_zhetrd_nb(), magma_xerbla(), MAGMA_Z_ONE, MAGMA_Z_SET2REAL, magma_zheevd_m(), magma_zhegst_m(), magma_zpotrf2_ooc(), magma_ztrsm_m(), MagmaConjTrans, MagmaLeft, MagmaLowerStr, MagmaNonUnit, MagmaNoTrans, MagmaNoVectorsStr, MagmaUpperStr, MagmaVectorsStr, max, trans, and uplo.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZHEGVD computes all the eigenvalues, and optionally, the eigenvectors
of a complex generalized Hermitian-definite eigenproblem, of the form
A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and
B are assumed to be Hermitian and B is also positive definite.
If eigenvectors are desired, it uses a divide and conquer algorithm.
The divide and conquer algorithm makes very mild assumptions about
floating point arithmetic. It will work on machines with a guard
digit in add/subtract, or on those binary machines without guard
digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
Cray-2. It could conceivably fail on hexadecimal or decimal machines
without guard digits, but we know of none.
Arguments
=========
ITYPE (input) INTEGER
Specifies the problem type to be solved:
= 1: A*x = (lambda)*B*x
= 2: A*B*x = (lambda)*x
= 3: B*A*x = (lambda)*x
JOBZ (input) CHARACTER*1
= 'N': Compute eigenvalues only;
= 'V': Compute eigenvalues and eigenvectors.
UPLO (input) CHARACTER*1
= 'U': Upper triangles of A and B are stored;
= 'L': Lower triangles of A and B are stored.
N (input) INTEGER
The order of the matrices A and B. N >= 0.
A (input/output) COMPLEX*16 array, dimension (LDA, N)
On entry, the Hermitian matrix A. If UPLO = 'U', the
leading N-by-N upper triangular part of A contains the
upper triangular part of the matrix A. If UPLO = 'L',
the leading N-by-N lower triangular part of A contains
the lower triangular part of the matrix A.
On exit, if JOBZ = 'V', then if INFO = 0, A contains the
matrix Z of eigenvectors. The eigenvectors are normalized
as follows:
if ITYPE = 1 or 2, Z**H*B*Z = I;
if ITYPE = 3, Z**H*inv(B)*Z = I.
If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
or the lower triangle (if UPLO='L') of A, including the
diagonal, is destroyed.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
B (input/output) COMPLEX*16 array, dimension (LDB, N)
On entry, the Hermitian matrix B. If UPLO = 'U', the
leading N-by-N upper triangular part of B contains the
upper triangular part of the matrix B. If UPLO = 'L',
the leading N-by-N lower triangular part of B contains
the lower triangular part of the matrix B.
On exit, if INFO <= N, the part of B containing the matrix is
overwritten by the triangular factor U or L from the Cholesky
factorization B = U**H*U or B = L*L**H.
LDB (input) INTEGER
The leading dimension of the array B. LDB >= max(1,N).
W (output) DOUBLE PRECISION array, dimension (N)
If INFO = 0, the eigenvalues in ascending order.
WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The length of the array WORK.
If N <= 1, LWORK >= 1.
If JOBZ = 'N' and N > 1, LWORK >= N + 1.
If JOBZ = 'V' and N > 1, LWORK >= 2*N*nb + N**2.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal sizes of the WORK, RWORK and
IWORK arrays, returns these values as the first entries of
the WORK, RWORK and IWORK arrays, and no error message
related to LWORK or LRWORK or LIWORK is issued by XERBLA.
RWORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
LRWORK (input) INTEGER
The dimension of the array RWORK.
If N <= 1, LRWORK >= 1.
If JOBZ = 'N' and N > 1, LRWORK >= N.
If JOBZ = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2.
If LRWORK = -1, then a workspace query is assumed; the
routine only calculates the optimal sizes of the WORK, RWORK
and IWORK arrays, returns these values as the first entries
of the WORK, RWORK and IWORK arrays, and no error message
related to LWORK or LRWORK or LIWORK is issued by XERBLA.
IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
LIWORK (input) INTEGER
The dimension of the array IWORK.
If N <= 1, LIWORK >= 1.
If JOBZ = 'N' and N > 1, LIWORK >= 1.
If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N.
If LIWORK = -1, then a workspace query is assumed; the
routine only calculates the optimal sizes of the WORK, RWORK
and IWORK arrays, returns these values as the first entries
of the WORK, RWORK and IWORK arrays, and no error message
related to LWORK or LRWORK or LIWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
> 0: ZPOTRF or ZHEEVD returned an error code:
<= N: if INFO = i and JOBZ = 'N', then the algorithm
failed to converge; i off-diagonal elements of an
intermediate tridiagonal form did not converge to
zero;
if INFO = i and JOBZ = 'V', then the algorithm
failed to compute an eigenvalue while working on
the submatrix lying in rows and columns INFO/(N+1)
through mod(INFO,N+1);
> N: if INFO = N + i, for 1 <= i <= N, then the leading
minor of order i of B is not positive definite.
The factorization of B could not be completed and
no eigenvalues or eigenvectors were computed.
Further Details
===============
Based on contributions by
Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA
Modified so that no backsubstitution is performed if ZHEEVD fails to
converge (NEIG in old code could be greater than N causing out of
bounds reference to A - reported by Ralf Meyer). Also corrected the
description of INFO and the test on ITYPE. Sven, 16 Feb 05.
===================================================================== */
char uplo_[2] = {uplo, 0};
char jobz_[2] = {jobz, 0};
cuDoubleComplex c_one = MAGMA_Z_ONE;
magma_int_t lower;
char trans[1];
magma_int_t wantz;
magma_int_t lquery;
// magma_int_t lopt;
magma_int_t lwmin;
// magma_int_t liopt;
magma_int_t liwmin;
// magma_int_t lropt;
magma_int_t lrwmin;
lower = lapackf77_lsame(uplo_, MagmaLowerStr);
lquery = lwork == -1 || lrwork == -1 || liwork == -1;
*info = 0;
if (itype < 1 || itype > 3) {
*info = -1;
} else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVectorsStr))) {
*info = -2;
} else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) {
*info = -3;
} else if (n < 0) {
*info = -4;
} else if (lda < max(1,n)) {
*info = -6;
} else if (ldb < max(1,n)) {
*info = -8;
}
if (wantz) {
lwmin = 2 * n + n * n;
lrwmin = 1 + 5 * n + 2 * n * n;
liwmin = 5 * n + 3;
} else {
lwmin = n * (nb + 1);
lrwmin = n;
liwmin = 1;
}
MAGMA_Z_SET2REAL(work[0],(double)lwmin);
rwork[0] = lrwmin;
iwork[0] = liwmin;
if (lwork < lwmin && ! lquery) {
*info = -11;
} else if (lrwork < lrwmin && ! lquery) {
*info = -13;
} else if (liwork < liwmin && ! lquery) {
*info = -15;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (n == 0) {
return *info;
}
#define ENABLE_TIMER
#ifdef ENABLE_TIMER
magma_timestr_t start, end;
start = get_current_time();
#endif
magma_zpotrf2_ooc(nrgpu, uplo_[0], n, b, ldb, info);
if (*info != 0) {
*info = n + *info;
return *info;
}
#ifdef ENABLE_TIMER
printf("time zpotrf = %6.2f\n", GetTimerValue(start,end)/1000.);
start = get_current_time();
#endif
/* Transform problem to standard eigenvalue problem and solve. */
magma_zhegst_m(nrgpu, itype, uplo_[0], n, a, lda, b, ldb, info);
#ifdef ENABLE_TIMER
printf("time zhegst = %6.2f\n", GetTimerValue(start,end)/1000.);
start = get_current_time();
#endif
magma_zheevd_m(nrgpu, jobz_[0], uplo_[0], n, a, lda, w, work, lwork, rwork, lrwork, iwork, liwork, info);
#ifdef ENABLE_TIMER
printf("time zheevd = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif
if (wantz && *info == 0)
{
#ifdef ENABLE_TIMER
start = get_current_time();
#endif
/* Backtransform eigenvectors to the original problem. */
if (itype == 1 || itype == 2)
{
/* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */
if (lower) {
*(unsigned char *)trans = MagmaConjTrans;
} else {
*(unsigned char *)trans = MagmaNoTrans;
}
magma_ztrsm_m(nrgpu, MagmaLeft, uplo_[0], *trans, MagmaNonUnit,
n, n, c_one, b, ldb, a, lda);
}
else if (itype == 3)
{
/* For B*A*x=(lambda)*x;
backtransform eigenvectors: x = L*y or U'*y */
if (lower) {
*(unsigned char *)trans = MagmaNoTrans;
} else {
*(unsigned char *)trans = MagmaConjTrans;
}
//magma_ztrmm(MagmaLeft, uplo_[0], *trans, MagmaNonUnit,
// n, n, c_one, db, lddb, da, ldda);
}
#ifdef ENABLE_TIMER
printf("time setmatrices trsm/mm + getmatrices = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif
}
/*work[0].r = (doublereal) lopt, work[0].i = 0.;
rwork[0] = (doublereal) lropt;
iwork[0] = liopt;*/
return *info;
} /* magma_zhegvd_m */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_zpotrf2_ooc ( magma_int_t  num_gpus,
char  uplo,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
magma_int_t info 
)

Definition at line 58 of file zpotrf2_ooc.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZPOTRF_OOC computes the Cholesky factorization of a complex Hermitian
positive definite matrix A. This version does not require work
space on the GPU passed as input. GPU memory is allocated in the
routine. The matrix A may not fit entirely in the GPU memory.
The factorization has the form
A = U**H * U, if UPLO = 'U', or
A = L * L**H, if UPLO = 'L',
where U is an upper triangular matrix and L is lower triangular.
This is the block version of the algorithm, calling Level 3 BLAS.
Arguments
=========
UPLO (input) CHARACTER*1
= 'U': Upper triangle of A is stored;
= 'L': Lower triangle of A is stored.
N (input) INTEGER
The order of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the symmetric matrix A. If UPLO = 'U', the leading
N-by-N upper triangular part of A contains the upper
triangular part of the matrix A, and the strictly lower
triangular part of A is not referenced. If UPLO = 'L', the
leading N-by-N lower triangular part of A contains the lower
triangular part of the matrix A, and the strictly upper
triangular part of A is not referenced.
On exit, if INFO = 0, the factor U or L from the Cholesky
factorization A = U**H * U or A = L * L**H.
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
> 0: if INFO = i, the leading minor of order i is not
positive definite, and the factorization could not be
completed.
===================================================================== */
/* Local variables */
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
cuDoubleComplex *dwork[4], *dt[4], *work;
char uplo_[2] = {uplo, 0};
magma_int_t ldda, lddla, ldwrk, nb, iinfo, n_local[4], J2, d, num_gpus;
static magma_int_t j, jj, jb, jb1, jb2, jb3, J, JB, NB, MB;
double d_one = 1.0;
double d_neg_one = -1.0;
long int upper = lapackf77_lsame(uplo_, "U");
#if CUDA_VERSION > 3010
size_t totalMem;
#else
unsigned int totalMem;
#endif
CUdevice dev;
static cudaStream_t stream[4][3];
//#define ROW_MAJOR_PROFILE
#ifdef ROW_MAJOR_PROFILE
magma_timestr_t start, end, start0, end0;
double chol_time = 1.0;
#endif
*info = 0;
if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,n)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return */
if ( n == 0 )
return *info;
if( num_gpus0 > n/nb ) {
num_gpus = n/nb;
if( n%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
ldda = n/(nb*num_gpus);
if( n%(nb*num_gpus) != 0 ) ldda++;
ldda = num_gpus*((nb*ldda+31)/32)*32;
/* figure out NB */
cuDeviceGet( &dev, 0);
cuDeviceTotalMem( &totalMem, dev );
totalMem /= sizeof(cuDoubleComplex);
MB = n; /* number of rows in the big panel */
NB = (magma_int_t)(num_gpus*(0.8*totalMem/ldda-2*nb)); /* number of columns in the big panel */
if( NB >= n ) {
#ifdef CHECK_ZPOTRF_OOC
printf( " * still fit in GPU memory.\n" );
#endif
NB = n;
} else {
#ifdef CHECK_ZPOTRF_OOC
printf( " * don't fit in GPU memory.\n" );
#endif
NB = (NB / nb) * nb; /* making sure it's devisable by nb */
}
#ifdef CHECK_ZPOTRF_OOC
if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
fflush(stdout);
#endif
ldda = ((n+31)/32)*32;
lddla = ((nb*(1+n/(nb*num_gpus))+31)/32)*32;
for (d=0; d<num_gpus; d++ ) {
if (MAGMA_SUCCESS != magma_zmalloc( &dt[d], NB*lddla + 2*nb*ldda )) {
return *info;
}
dwork[d] = &dt[d][2*nb*ldda];
magma_queue_create( &stream[d][0] );
magma_queue_create( &stream[d][1] );
magma_queue_create( &stream[d][2] );
}
#ifdef ROW_MAJOR_PROFILE
start0 = get_current_time();
#endif
ldwrk = n;
if (MAGMA_SUCCESS != magma_zmalloc_host( &work, ldwrk*nb )) {
return *info;
}
if (nb <= 1 || nb >= n) {
lapackf77_zpotrf(uplo_, &n, a, &lda, info);
} else {
/* Use hybrid blocked code. */
if (upper) {
/* =========================================================== *
* Compute the Cholesky factorization A = U'*U. *
* big panel is divided by block-row and distributed in block *
* column cyclic format */
/* for each big-panel */
for( J=0; J<n; J+=NB ) {
JB = min(NB,n-J);
jb = min(JB,nb);
if( num_gpus0 > (n-J)/nb ) {
num_gpus = (n-J)/nb;
if( (n-J)%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* load the new big-panel by block-rows */
magma_zhtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
#ifdef ROW_MAJOR_PROFILE
start = get_current_time();
#endif
/* update with the previous big-panels */
for( j=0; j<J; j+=nb ) {
/* upload the diagonal of big panel */
for( d=0; d<num_gpus; d++ ) {
A(j, J), lda,
dTup(d, 0, J), nb, stream[d][0] );
n_local[d] = 0;
}
/* upload off-diagonals */
for( jj=J+JB; jj<n; jj+=nb ) {
d = ((jj-J)/nb)%num_gpus;
jb2 = min(nb, n-jj);
A(j, jj), lda,
dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] );
n_local[d] += jb2;
}
/* update the current big-panel using the previous block-row */
jb3 = nb; //min(nb,J-j); // number of columns in this previous block-column (nb)
for( jj=0; jj<JB; jj+=nb ) { /* diagonal */
d = (jj/nb)%num_gpus;
J2 = (jj/(nb*num_gpus))*nb;
jb1 = min(JB,jj+nb); // first row in the next block-row
jb2 = min(nb,JB-jj); // number of rows in this current block-row
jb = jj; //jb1-jb2; // number of columns in the off-diagona blocks (jj)
jb, jb2, nb,
c_neg_one, dTup(d, 0, J ), nb,
dTup(d, 0, J+jb), nb,
c_one, dAup(d, 0, J2), NB);
d_neg_one, dTup(d, 0, J+jb), nb,
d_one, dAup(d, jb, J2), NB);
}
if( n > J+JB ) { /* off-diagonal */
for( d=0; d<num_gpus; d++ ) {
/* local number of columns in the big panel */
n_local[d] = (((n-J)/nb)/num_gpus)*nb;
if (d < ((n-J)/nb)%num_gpus)
n_local[d] += nb;
else if (d == ((n-J)/nb)%num_gpus)
n_local[d] += (n-J)%nb;
/* local number of columns in diagonal */
n_local[d] -= ((JB/nb)/num_gpus)*nb;
if (d < (JB/nb)%num_gpus)
n_local[d] -= nb;
J2 = nb*(JB/(nb*num_gpus));
if( d < (JB/nb)%num_gpus ) J2+=nb;
JB, n_local[d], nb,
c_neg_one, dTup(d, 0, J ), nb,
dTup(d, 0, J+JB), nb,
c_one, dAup(d, 0, J2), NB);
}
}
} /* end of updates with previous rows */
/* factor the big panel */
magma_zpotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
if( iinfo != 0 ) {
*info = J+iinfo;
break;
}
#ifdef ROW_MAJOR_PROFILE
chol_time += GetTimerValue(start, end);
#endif
/* upload the off-diagonal (and diagonal!!!) big panel */
magma_zdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
}
} else {
/* ========================================================= *
* Compute the Cholesky factorization A = L*L'. */
/* for each big-panel */
for( J=0; J<n; J+=NB ) {
JB = min(NB,n-J);
if( num_gpus0 > (n-J)/nb ) {
num_gpus = (n-J)/nb;
if( (n-J)%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* load the new big-panel by block-columns */
magma_zhtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
/* update with the previous big-panels */
#ifdef ROW_MAJOR_PROFILE
start = get_current_time();
#endif
for( j=0; j<J; j+=nb ) {
/* upload the diagonal of big panel */
for( d=0; d<num_gpus; d++ ) {
A(J, j), lda,
dT(d, J, 0), ldda, stream[d][0] );
n_local[d] = 0;
}
/* upload off-diagonals */
for( jj=J+JB; jj<n; jj+=nb ) {
d = ((jj-J)/nb)%num_gpus;
jb2 = min(nb, n-jj);
A(jj, j), lda,
dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] );
n_local[d] += jb2;
}
/* update the current big-panel using the previous block-row */
jb3 = nb; //min(nb,J-j);
for( jj=0; jj<JB; jj+=nb ) { /* diagonal */
d = (jj/nb)%num_gpus;
J2 = (jj/(nb*num_gpus))*nb;
jb1 = min(JB,jj+nb);
jb2 = min(nb,JB-jj);
jb = jj; //jb1-jb2;
jb2, jb, nb,
c_neg_one, dT(d, J+jb, 0), ldda,
dT(d, J, 0), ldda,
c_one, dA(d, J2, 0), lddla);
d_neg_one, dT(d, J+jb, 0), ldda,
d_one, dA(d, J2, jb ), lddla);
}
if( n > J+JB ) { /* off-diagonal */
for( d=0; d<num_gpus; d++ ) {
/* local number of columns in the big panel */
n_local[d] = (((n-J)/nb)/num_gpus)*nb;
if (d < ((n-J)/nb)%num_gpus)
n_local[d] += nb;
else if (d == ((n-J)/nb)%num_gpus)
n_local[d] += (n-J)%nb;
/* local number of columns in diagonal */
n_local[d] -= ((JB/nb)/num_gpus)*nb;
if (d < (JB/nb)%num_gpus)
n_local[d] -= nb;
J2 = nb*(JB/(nb*num_gpus));
if( d < (JB/nb)%num_gpus ) J2+=nb;
n_local[d], JB, nb,
c_neg_one, dT(d, J+JB, 0), ldda,
dT(d, J, 0), ldda,
c_one, dA(d, J2, 0), lddla);
}
}
}
/* factor the big panel */
magma_zpotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
if( iinfo != 0 ) {
*info = J+iinfo;
break;
}
#ifdef ROW_MAJOR_PROFILE
chol_time += GetTimerValue(start, end);
#endif
/* upload the off-diagonal big panel */
//magma_zdtohpo( num_gpus, &uplo, n, JB, J, J, nb, NB, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
magma_zdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
} /* end of for J */
} /* if upper */
} /* if nb */
#ifdef ROW_MAJOR_PROFILE
end0 = get_current_time();
#endif
if( num_gpus0 > n/nb ) {
num_gpus = n/nb;
if( n%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
for (d=0; d<num_gpus; d++ ) {
magma_free( dt[d] );
magma_queue_destroy( stream[d][0] );
magma_queue_destroy( stream[d][1] );
magma_queue_destroy( stream[d][2] );
}
magma_free_host( work );
#ifdef ROW_MAJOR_PROFILE
printf("\n n=%d NB=%d nb=%d\n",n,NB,nb);
printf(" Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS((double)n)/1000000, GetTimerValue(start0, end0),
FLOPS((double)n)/(1000000*GetTimerValue(start0, end0)));
printf(" Performance %f / %f = %f GFlop/s\n", FLOPS((double)n)/1000000, chol_time, FLOPS( (double)n ) / (1000000*chol_time));
#endif
return *info;
} /* magma_zpotrf_ooc */

Here is the caller graph for this function:

magma_int_t magma_ztrsm_m ( magma_int_t  nrgpu,
char  side,
char  uplo,
char  transa,
char  diag,
magma_int_t  m,
magma_int_t  n,
cuDoubleComplex  alpha,
cuDoubleComplex *  a,
magma_int_t  lda,
cuDoubleComplex *  b,
magma_int_t  ldb 
)

Definition at line 27 of file ztrsm_m.cpp.

References __func__, A, B, dA, dB, diag, lapackf77_lsame, MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_ztrsm_m_nb(), magma_getdevice(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_setdevice(), MAGMA_SUCCESS, magma_xerbla(), MAGMA_Z_IMAG, MAGMA_Z_NEG_ONE, MAGMA_Z_ONE, MAGMA_Z_REAL, magma_zgemm(), magma_zgetmatrix_async(), magma_zmalloc(), magma_zsetmatrix_async(), magma_ztrsm(), magmablasSetKernelStream(), MagmaNoTrans, max, min, N_MAX_GPU, side, and uplo.

{
/* Purpose
=======
ZTRSM solves one of the matrix equations
op( A )*X = alpha*B, or X*op( A ) = alpha*B,
where alpha is a scalar, X and B are m by n matrices, A is a unit, or
non-unit, upper or lower triangular matrix and op( A ) is one of
op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ).
The matrix X is overwritten on B.
Parameters
==========
SIDE - CHARACTER*1.
On entry, SIDE specifies whether op( A ) appears on the left
or right of X as follows:
SIDE = 'L' or 'l' op( A )*X = alpha*B.
SIDE = 'R' or 'r' X*op( A ) = alpha*B.
Unchanged on exit.
UPLO - CHARACTER*1.
On entry, UPLO specifies whether the matrix A is an upper or
lower triangular matrix as follows:
UPLO = 'U' or 'u' A is an upper triangular matrix.
UPLO = 'L' or 'l' A is a lower triangular matrix.
Unchanged on exit.
TRANSA - CHARACTER*1.
On entry, TRANSA specifies the form of op( A ) to be used in
the matrix multiplication as follows:
TRANSA = 'N' or 'n' op( A ) = A.
TRANSA = 'T' or 't' op( A ) = A'.
TRANSA = 'C' or 'c' op( A ) = conjg( A' ).
Unchanged on exit.
DIAG - CHARACTER*1.
On entry, DIAG specifies whether or not A is unit triangular
as follows:
DIAG = 'U' or 'u' A is assumed to be unit triangular.
DIAG = 'N' or 'n' A is not assumed to be unit
triangular.
Unchanged on exit.
M - INTEGER.
On entry, M specifies the number of rows of B. M must be at
least zero.
Unchanged on exit.
N - INTEGER.
On entry, N specifies the number of columns of B. N must be
at least zero.
Unchanged on exit.
ALPHA - COMPLEX*16 .
On entry, ALPHA specifies the scalar alpha. When alpha is
zero then A is not referenced and B need not be set before
entry.
Unchanged on exit.
A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m
when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
Before entry with UPLO = 'U' or 'u', the leading k by k
upper triangular part of the array A must contain the upper
triangular matrix and the strictly lower triangular part of
A is not referenced.
Before entry with UPLO = 'L' or 'l', the leading k by k
lower triangular part of the array A must contain the lower
triangular matrix and the strictly upper triangular part of
A is not referenced.
Note that when DIAG = 'U' or 'u', the diagonal elements of
A are not referenced either, but are assumed to be unity.
Unchanged on exit.
LDA - INTEGER.
On entry, LDA specifies the first dimension of A as declared
in the calling (sub) program. When SIDE = 'L' or 'l' then
LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
then LDA must be at least max( 1, n ).
Unchanged on exit.
B - COMPLEX*16 array of DIMENSION ( LDB, n ).
Before entry, the leading m by n part of the array B must
contain the right-hand side matrix B, and on exit is
overwritten by the solution matrix X.
LDB - INTEGER.
On entry, LDB specifies the first dimension of B as declared
in the calling (sub) program. LDB must be at least
max( 1, m ).
Unchanged on exit.*/
char side_[2] = {side, 0};
char uplo_[2] = {uplo, 0};
char transa_[2] = {transa, 0};
char diag_[2] = {diag, 0};
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
cuDoubleComplex alpha_;
cuDoubleComplex* dw[N_MAX_GPU];
cudaStream_t stream [N_MAX_GPU][3];
magma_int_t lside;
magma_int_t upper;
magma_int_t notransp;
magma_int_t nrowa;
magma_int_t igpu = 0;
magma_int_t k,j,jj,kb,jb,jjb;
magma_int_t ldda, dima, lddb, dimb;
int gpu_b;
magma_getdevice(&gpu_b);
lside = lapackf77_lsame(side_, "L");
if (lside) {
nrowa = m;
} else {
nrowa = n;
}
upper = lapackf77_lsame(uplo_, "U");
notransp = lapackf77_lsame(transa_, "N");
info = 0;
if (! lside && ! lapackf77_lsame(side_, "R")) {
info = 1;
} else if (! upper && ! lapackf77_lsame(uplo_, "L")) {
info = 2;
} else if (! notransp && ! lapackf77_lsame(transa_, "T")
&& ! lapackf77_lsame(transa_, "C")) {
info = 3;
} else if (! lapackf77_lsame(diag_, "U") && ! lapackf77_lsame(diag_, "N")) {
info = 4;
} else if (m < 0) {
info = 5;
} else if (n < 0) {
info = 6;
} else if (lda < max(1,nrowa)) {
info = 9;
} else if (ldb < max(1,m)) {
info = 11;
}
if (info != 0) {
return info;
}
//Quick return if possible.
if (n == 0) {
return info;
}
magma_int_t nbl = (n-1)/nb+1; // number of blocks in a row
magma_int_t mbl = (m-1)/nb+1; // number of blocks in a column
if (lside) {
lddb = m;
dimb = ((nbl-1)/nrgpu+1)*nb;
if ( notransp ) {
ldda = m;
dima = 2 * nb;
} else {
ldda = 2 * nb;
dima = m;
}
} else {
lddb = ((mbl-1)/nrgpu+1)*nb;
dimb = n;
if ( !notransp ) {
ldda = n;
dima = 2 * nb;
} else {
ldda = 2 * nb;
dima = n;
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (dimb*lddb + dima*ldda) )) {
return info;
}
magma_queue_create( &stream[igpu][0] );
magma_queue_create( &stream[igpu][1] );
magma_queue_create( &stream[igpu][2] );
}
// alpha = 0 case;
if (MAGMA_Z_REAL(alpha) == 0. && MAGMA_Z_IMAG(alpha) == 0.) {
printf("ztrsm_m: alpha = 0 not implemented\n");
exit(-1);
return info;
}
if (lside) {
if (notransp) {
//Form B := alpha*inv( A )*B
if (upper) {
//left upper notranspose
for(igpu = 0; igpu < nrgpu; ++igpu)
nloc[igpu] = 0;
//copy B to mgpus
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
nloc[igpu] += kb;
B(0, k), ldb,
dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] );
}
jb = min(nb, m-(mbl-1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
A(0, mbl-1), lda,
dA(igpu, 0, (mbl-1)%2), ldda, stream[igpu][(mbl+1)%2] );
}
for (j = mbl-1; j >= 0; --j){
if (j > 0){
jb = nb;
for (igpu = 0; igpu < nrgpu; ++igpu){
A(0, j-1), lda,
dA(igpu, 0, (j+1)%2), ldda, stream[igpu][(j+1)%2] );
}
}
if (j==mbl-1)
alpha_=alpha;
else
alpha_= c_one;
jb = min(nb, m-j*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_ztrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j, j%2), ldda,
dB(igpu, j, 0), lddb );
}
if (j>0){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_zgemm(transa, MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one, dA(igpu, 0, j%2), ldda,
dB(igpu, j, 0), lddb, alpha_, dB(igpu, 0, 0), lddb );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][j%2] );
}
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
dB(igpu, j, k/nrgpu), lddb,
B(j, k), ldb, stream[igpu][2] );
}
}
}
else
{
//left lower notranspose
for(igpu = 0; igpu < nrgpu; ++igpu)
nloc[igpu] = 0;
//copy B to mgpus
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
nloc[igpu] += kb;
B(0, k), ldb,
dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] );
}
jb = min(nb, m);
for (igpu = 0; igpu < nrgpu; ++igpu){
A(0, 0), lda,
dA(igpu, 0, 0), ldda, stream[igpu][0] );
}
for (j = 0; j < mbl; ++j){
if ((j+1)*nb < m){
jb = min(nb, m-(j+1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( (m-(j+1)*nb), jb,
A(j+1, j+1), lda,
dA(igpu, j+1, (j+1)%2), ldda, stream[igpu][(j+1)%2] );
}
}
jb = min(nb, m-j*nb);
if (j==0)
alpha_=alpha;
else
alpha_= c_one;
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_ztrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j, j%2), ldda,
dB(igpu, j, 0), lddb );
}
if ( j < mbl-1 ){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_zgemm(transa, MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one, dA(igpu, j+1, j%2), ldda,
dB(igpu, j, 0), lddb, alpha_, dB(igpu, j+1, 0), lddb );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][j%2] );
}
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
dB(igpu, j, k/nrgpu), lddb,
B(j, k), ldb, stream[igpu][2] );
}
}
}
}
else
{
//Form B := alpha*inv( A' )*B
if (upper) {
//left upper transpose or conjtranspose
for(igpu = 0; igpu < nrgpu; ++igpu)
nloc[igpu] = 0;
//copy B to mgpus
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
nloc[igpu] += kb;
B(0, k), ldb,
dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] );
}
jb = min(nb, m);
for (igpu = 0; igpu < nrgpu; ++igpu){
A(0, 0), lda,
dA(igpu, 0, 0), ldda, stream[igpu][0] );
}
for (j = 0; j < mbl; ++j){
if ((j+1)*nb < m){
jb = min(nb, m-(j+1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( jb, m-(j+1)*nb,
A(j+1, j+1), lda,
dA(igpu, (j+1)%2, j+1), ldda, stream[igpu][(j+1)%2] );
}
}
jb = min(nb, m-j*nb);
if (j==0)
alpha_=alpha;
else
alpha_= c_one;
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_ztrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j%2, j), ldda,
dB(igpu, j, 0), lddb );
}
if ( j < mbl-1 ){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_zgemm(transa, MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one, dA(igpu, j%2, j+1), ldda,
dB(igpu, j, 0), lddb, alpha_, dB(igpu, j+1, 0), lddb );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][j%2] );
}
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
dB(igpu, j, k/nrgpu), lddb,
B(j, k), ldb, stream[igpu][2] );
}
}
}
else
{
//left lower transpose or conjtranspose
for(igpu = 0; igpu < nrgpu; ++igpu)
nloc[igpu] = 0;
//copy B to mgpus
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
nloc[igpu] += kb;
B(0, k), ldb,
dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] );
}
jb = min(nb, m-(mbl-1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
A(mbl-1, 0), lda,
dA(igpu, (mbl-1)%2, 0), ldda, stream[igpu][(mbl+1)%2] );
}
for (j = mbl-1; j >= 0; --j){
if (j > 0){
jb = nb;
for (igpu = 0; igpu < nrgpu; ++igpu){
A(j-1, 0), lda,
dA(igpu, (j+1)%2, 0), ldda, stream[igpu][(j+1)%2] );
}
}
if (j==mbl-1)
alpha_=alpha;
else
alpha_= c_one;
jb = min(nb, m-j*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_ztrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j%2, j), ldda,
dB(igpu, j, 0), lddb );
}
if (j>0){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][j%2]);
magma_zgemm(transa, MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one, dA(igpu, j%2, 0), ldda,
dB(igpu, j, 0), lddb, alpha_, dB(igpu, 0, 0), lddb );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][j%2] );
}
for (k = 0; k < nbl; ++k){
igpu = k%nrgpu;
kb = min(nb, n-k*nb);
dB(igpu, j, k/nrgpu), lddb,
B(j, k), ldb, stream[igpu][2] );
}
}
}
}
}
else
{
if (notransp) {
//Form B := alpha*B*inv( A ).
if (upper) {
//right upper notranspose
for(igpu = 0; igpu < nrgpu; ++igpu)
mloc[igpu] = 0;
//copy B to mgpus
for (j = 0; j < mbl; ++j){
igpu = j%nrgpu;
jb = min(nb, m-j*nb);
mloc[igpu] += jb;
B(j, 0), ldb,
dB(igpu, j/nrgpu, 0), lddb, stream[igpu][0] );
}
kb = min(nb, n);
for (igpu = 0; igpu < nrgpu; ++igpu){
A(0, 0), lda,
dA(igpu, 0, 0), ldda, stream[igpu][0] );
}
for (k = 0; k < nbl; ++k){
if ((k+1)*nb < n){
kb = min(nb, n-(k+1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( kb, n-(k+1)*nb,
A(k+1, k+1), lda,
dA(igpu, (k+1)%2, k+1), ldda, stream[igpu][(k+1)%2] );
}
}
kb = min(nb, n-k*nb);
if (k==0)
alpha_=alpha;
else
alpha_= c_one;
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][k%2]);
magma_ztrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k%2, k), ldda,
dB(igpu, 0, k), lddb );
}
if ( k < nbl-1 ){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][k%2]);
magma_zgemm(MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb, c_neg_one, dB(igpu, 0, k), lddb,
dA(igpu, k%2, k+1), ldda, alpha_, dB(igpu, 0, k+1), lddb );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][k%2] );
}
for (j = 0; j < mbl; ++j){
igpu = j%nrgpu;
jb = min(nb, m-j*nb);
dB(igpu, j/nrgpu, k), lddb,
B(j, k), ldb, stream[igpu][2] );
}
}
}
else
{
//right lower notranspose
for(igpu = 0; igpu < nrgpu; ++igpu)
mloc[igpu] = 0;
//copy B to mgpus
for (j = 0; j < mbl; ++j){
igpu = j%nrgpu;
jb = min(nb, m-j*nb);
mloc[igpu] += jb;
B(j, 0), ldb,
dB(igpu, j/nrgpu, 0), lddb, stream[igpu][(nbl+1)%2] );
}
kb = min(nb, n-(nbl-1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
A(nbl-1, 0), lda,
dA(igpu, (nbl-1)%2, 0), ldda, stream[igpu][(nbl+1)%2] );
}
for (k = nbl-1; k >= 0; --k){
if (k > 0){
kb = nb;
for (igpu = 0; igpu < nrgpu; ++igpu){
A(k-1, 0), lda,
dA(igpu, (k+1)%2, 0), ldda, stream[igpu][(k+1)%2] );
}
}
if (k==nbl-1)
alpha_=alpha;
else
alpha_= c_one;
kb = min(nb, n-k*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][k%2]);
magma_ztrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k%2, k), ldda,
dB(igpu, 0, k), lddb );
}
if (k>0){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][k%2]);
magma_zgemm(MagmaNoTrans, transa, mloc[igpu], k*nb, kb, c_neg_one, dB(igpu, 0, k), lddb,
dA(igpu, k%2, 0), ldda, alpha_, dB(igpu, 0, 0), lddb );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][k%2] );
}
for (j = 0; j < mbl; ++j){
igpu = j%nrgpu;
jb = min(nb, m-j*nb);
dB(igpu, j/nrgpu, k), lddb,
B(j, k), ldb, stream[igpu][2] );
}
}
}
}
else
{
//Form B := alpha*B*inv( A' ).
if (upper) {
//right upper transpose or conjtranspose
for(igpu = 0; igpu < nrgpu; ++igpu)
mloc[igpu] = 0;
//copy B to mgpus
for (j = 0; j < mbl; ++j){
igpu = j%nrgpu;
jb = min(nb, m-j*nb);
mloc[igpu] += jb;
B(j, 0), ldb,
dB(igpu, j/nrgpu, 0), lddb, stream[igpu][(nbl+1)%2] );
}
kb = min(nb, n-(nbl-1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
A(0, nbl-1), lda,
dA(igpu, 0, (nbl-1)%2), ldda, stream[igpu][(nbl+1)%2] );
}
for (k = nbl-1; k >= 0; --k){
if (k > 0){
kb = nb;
for (igpu = 0; igpu < nrgpu; ++igpu){
A(0, k-1), lda,
dA(igpu, 0, (k+1)%2), ldda, stream[igpu][(k+1)%2] );
}
}
if (k==nbl-1)
alpha_=alpha;
else
alpha_= c_one;
kb = min(nb, n-k*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][k%2]);
magma_ztrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k, k%2), ldda,
dB(igpu, 0, k), lddb );
}
if (k>0){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][k%2]);
magma_zgemm(MagmaNoTrans, transa, mloc[igpu], k*nb, kb, c_neg_one, dB(igpu, 0, k), lddb,
dA(igpu, 0, k%2), ldda, alpha_, dB(igpu, 0, 0), lddb );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][k%2] );
}
for (j = 0; j < mbl; ++j){
igpu = j%nrgpu;
jb = min(nb, m-j*nb);
dB(igpu, j/nrgpu, k), lddb,
B(j, k), ldb, stream[igpu][2] );
}
}
}
else
{
//right lower transpose or conjtranspose
for(igpu = 0; igpu < nrgpu; ++igpu)
mloc[igpu] = 0;
//copy B to mgpus
for (j = 0; j < mbl; ++j){
igpu = j%nrgpu;
jb = min(nb, m-j*nb);
mloc[igpu] += jb;
B(j, 0), ldb,
dB(igpu, j/nrgpu, 0), lddb, stream[igpu][0] );
}
kb = min(nb, n);
for (igpu = 0; igpu < nrgpu; ++igpu){
A(0, 0), lda,
dA(igpu, 0, 0), ldda, stream[igpu][0] );
}
for (k = 0; k < nbl; ++k){
if ((k+1)*nb < n){
kb = min(nb, n-(k+1)*nb);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( (n-(k+1)*nb), kb,
A(k+1, k+1), lda,
dA(igpu, k+1, (k+1)%2), ldda, stream[igpu][(k+1)%2] );
}
}
kb = min(nb, n-k*nb);
if (k==0)
alpha_=alpha;
else
alpha_= c_one;
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][k%2]);
magma_ztrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k, k%2), ldda,
dB(igpu, 0, k), lddb );
}
if ( k < nbl-1 ){
for (igpu = 0; igpu < nrgpu; ++igpu){
magmablasSetKernelStream(stream[igpu][k%2]);
magma_zgemm(MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb, c_neg_one, dB(igpu, 0, k), lddb,
dA(igpu, k+1, k%2), ldda, alpha_, dB(igpu, 0, k+1), lddb );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][k%2] );
}
for (j = 0; j < mbl; ++j){
igpu = j%nrgpu;
jb = min(nb, m-j*nb);
dB(igpu, j/nrgpu, k), lddb,
B(j, k), ldb, stream[igpu][2] );
}
}
}
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][2] );
magma_queue_destroy( stream[igpu][0] );
magma_queue_destroy( stream[igpu][1] );
magma_queue_destroy( stream[igpu][2] );
magma_free( dw[igpu] );
}
return info;
} /* magma_ztrsm_m */

Here is the call graph for this function:

Here is the caller graph for this function: