MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
magma_z.h File Reference
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros

#define PRECISION_z

Functions

magma_int_t magma_zgebrd (magma_int_t m, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, double *d, double *e, cuDoubleComplex *tauq, cuDoubleComplex *taup, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgehrd2 (magma_int_t n, magma_int_t ilo, magma_int_t ihi, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t *lwork, magma_int_t *info)
magma_int_t magma_zgehrd (magma_int_t n, magma_int_t ilo, magma_int_t ihi, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, cuDoubleComplex *d_T, magma_int_t *info)
magma_int_t magma_zgelqf (magma_int_t m, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgeqlf (magma_int_t m, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgeqrf (magma_int_t m, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgeqrf_ooc (magma_int_t m, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgesv (magma_int_t n, magma_int_t nrhs, cuDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, cuDoubleComplex *B, magma_int_t ldb, magma_int_t *info)
magma_int_t magma_zgetrf (magma_int_t m, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_zgetrf_mc (magma_context *cntxt, magma_int_t *m, magma_int_t *n, cuDoubleComplex *A, magma_int_t *lda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_zgeqrf_mc (magma_context *cntxt, magma_int_t *m, magma_int_t *n, cuDoubleComplex *A, magma_int_t *lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t *lwork, magma_int_t *info)
magma_int_t magma_zgetrf2 (magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_zlatrd (char uplo, magma_int_t n, magma_int_t nb, cuDoubleComplex *a, magma_int_t lda, double *e, cuDoubleComplex *tau, cuDoubleComplex *w, magma_int_t ldw, cuDoubleComplex *da, magma_int_t ldda, cuDoubleComplex *dw, magma_int_t lddw)
magma_int_t magma_zlatrd2 (char uplo, magma_int_t n, magma_int_t nb, cuDoubleComplex *a, magma_int_t lda, double *e, cuDoubleComplex *tau, cuDoubleComplex *w, magma_int_t ldw, cuDoubleComplex *da, magma_int_t ldda, cuDoubleComplex *dw, magma_int_t lddw, cuDoubleComplex *dwork, magma_int_t ldwork)
magma_int_t magma_zlahr2 (magma_int_t m, magma_int_t n, magma_int_t nb, cuDoubleComplex *da, cuDoubleComplex *dv, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *t, magma_int_t ldt, cuDoubleComplex *y, magma_int_t ldy)
magma_int_t magma_zlahru (magma_int_t m, magma_int_t n, magma_int_t nb, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *da, cuDoubleComplex *y, cuDoubleComplex *v, cuDoubleComplex *t, cuDoubleComplex *dwork)
magma_int_t magma_zposv (char uplo, magma_int_t n, magma_int_t nrhs, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *B, magma_int_t ldb, magma_int_t *info)
magma_int_t magma_zpotrf (char uplo, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, magma_int_t *info)
magma_int_t magma_zpotrf_mc (magma_context *cntxt, char *uplo, magma_int_t *n, cuDoubleComplex *A, magma_int_t *lda, magma_int_t *info)
magma_int_t magma_zpotri (char uplo, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, magma_int_t *info)
magma_int_t magma_zlauum (char uplo, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, magma_int_t *info)
magma_int_t magma_ztrtri (char uplo, char diag, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, magma_int_t *info)
magma_int_t magma_zhetrd (char uplo, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, double *d, double *e, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgeqrf2 (magma_context *cntxt, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgeqrf3 (magma_context *cntxt, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zungqr (magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *dwork, magma_int_t nb, magma_int_t *info)
magma_int_t magma_zunmql (const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zunmqr (char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zunmtr (char side, char uplo, char trans, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zunghr (magma_int_t n, magma_int_t ilo, magma_int_t ihi, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *dT, magma_int_t nb, magma_int_t *info)
magma_int_t magma_zheev (char jobz, char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, double *w, cuDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *info)
magma_int_t magma_zgeev (char jobvl, char jobvr, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *w, cuDoubleComplex *vl, magma_int_t ldvl, cuDoubleComplex *vr, magma_int_t ldvr, cuDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *info)
magma_int_t magma_zgesvd (char jobu, char jobvt, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, double *s, cuDoubleComplex *u, magma_int_t ldu, cuDoubleComplex *vt, magma_int_t ldvt, cuDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *info)
magma_int_t magma_zheevd (char jobz, char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, double *w, cuDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
magma_int_t magma_zhegvd (magma_int_t itype, char jobz, char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *b, magma_int_t ldb, double *w, cuDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
magma_int_t magma_zhegst (magma_int_t itype, char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *b, magma_int_t ldb, magma_int_t *info)
magma_int_t magma_zgels_gpu (char trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *dB, magma_int_t lddb, cuDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgels3_gpu (char trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *dB, magma_int_t lddb, cuDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgelqf_gpu (magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zgeqrf_gpu (magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *dT, magma_int_t *info)
magma_int_t magma_zgeqrf2_gpu (magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *tau, magma_int_t *info)
magma_int_t magma_zgeqrf3_gpu (magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *dT, magma_int_t *info)
magma_int_t magma_zgeqrs_gpu (magma_int_t m, magma_int_t n, magma_int_t nrhs, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *dT, cuDoubleComplex *dB, magma_int_t lddb, cuDoubleComplex *hwork, magma_int_t lhwork, magma_int_t *info)
magma_int_t magma_zgeqrs3_gpu (magma_int_t m, magma_int_t n, magma_int_t nrhs, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *dT, cuDoubleComplex *dB, magma_int_t lddb, cuDoubleComplex *hwork, magma_int_t lhwork, magma_int_t *info)
magma_int_t magma_zgessm_gpu (char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_zgesv_gpu (magma_int_t n, magma_int_t nrhs, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, cuDoubleComplex *dB, magma_int_t lddb, magma_int_t *info)
magma_int_t magma_zgetrl_gpu (char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info)
magma_int_t magma_zgetrf_gpu (magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_zgetrf_nopiv_gpu (magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_zgetrs_gpu (char trans, magma_int_t n, magma_int_t nrhs, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, cuDoubleComplex *dB, magma_int_t lddb, magma_int_t *info)
magma_int_t magma_zlabrd_gpu (magma_int_t m, magma_int_t n, magma_int_t nb, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *da, magma_int_t ldda, double *d, double *e, cuDoubleComplex *tauq, cuDoubleComplex *taup, cuDoubleComplex *x, magma_int_t ldx, cuDoubleComplex *dx, magma_int_t lddx, cuDoubleComplex *y, magma_int_t ldy, cuDoubleComplex *dy, magma_int_t lddy)
magma_int_t magma_zlarfb_gpu (char side, char trans, char direct, char storev, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *dv, magma_int_t ldv, cuDoubleComplex *dt, magma_int_t ldt, cuDoubleComplex *dc, magma_int_t ldc, cuDoubleComplex *dowrk, magma_int_t ldwork)
magma_int_t magma_zposv_gpu (char uplo, magma_int_t n, magma_int_t nrhs, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *dB, magma_int_t lddb, magma_int_t *info)
magma_int_t magma_zpotrf_gpu (char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_zpotri_gpu (char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_zlauum_gpu (char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_ztrtri_gpu (char uplo, char diag, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_zhetrd_gpu (char uplo, magma_int_t n, cuDoubleComplex *da, magma_int_t ldda, double *d, double *e, cuDoubleComplex *tau, cuDoubleComplex *wa, magma_int_t ldwa, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_zhetrd2_gpu (char uplo, magma_int_t n, cuDoubleComplex *da, magma_int_t ldda, double *d, double *e, cuDoubleComplex *tau, cuDoubleComplex *wa, magma_int_t ldwa, cuDoubleComplex *work, magma_int_t lwork, cuDoubleComplex *dwork, magma_int_t ldwork, magma_int_t *info)
magma_int_t magma_zpotrs_gpu (char uplo, magma_int_t n, magma_int_t nrhs, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *dB, magma_int_t lddb, magma_int_t *info)
magma_int_t magma_zssssm_gpu (char storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, cuDoubleComplex *dA1, magma_int_t ldda1, cuDoubleComplex *dA2, magma_int_t ldda2, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info)
magma_int_t magma_ztstrf_gpu (char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info)
magma_int_t magma_zungqr_gpu (magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *da, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *dwork, magma_int_t nb, magma_int_t *info)
magma_int_t magma_zunmql2_gpu (const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *da, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *dc, magma_int_t lddc, cuDoubleComplex *wa, magma_int_t ldwa, magma_int_t *info)
magma_int_t magma_zunmqr_gpu (char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, cuDoubleComplex *td, magma_int_t nb, magma_int_t *info)
magma_int_t magma_zunmqr2_gpu (const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *da, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *dc, magma_int_t lddc, cuDoubleComplex *wa, magma_int_t ldwa, magma_int_t *info)
magma_int_t magma_zunmtr_gpu (char side, char uplo, char trans, magma_int_t m, magma_int_t n, cuDoubleComplex *da, magma_int_t ldda, cuDoubleComplex *tau, cuDoubleComplex *dc, magma_int_t lddc, cuDoubleComplex *wa, magma_int_t ldwa, magma_int_t *info)
magma_int_t magma_zheevd_gpu (char jobz, char uplo, magma_int_t n, cuDoubleComplex *da, magma_int_t ldda, double *w, cuDoubleComplex *wa, magma_int_t ldwa, cuDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
magma_int_t magma_zhegst_gpu (magma_int_t itype, char uplo, magma_int_t n, cuDoubleComplex *da, magma_int_t ldda, cuDoubleComplex *db, magma_int_t lddb, magma_int_t *info)

Macro Definition Documentation

#define PRECISION_z

Definition at line 13 of file magma_z.h.


Function Documentation

magma_int_t magma_zgebrd ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  A,
magma_int_t  lda,
double *  d,
double *  e,
cuDoubleComplex *  tauq,
cuDoubleComplex *  taup,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 24 of file zgebrd.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEBRD reduces a general complex M-by-N matrix A to upper or lower
bidiagonal form B by an orthogonal transformation: Q**H * A * P = B.
If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
Arguments
=========
M (input) INTEGER
The number of rows in the matrix A. M >= 0.
N (input) INTEGER
The number of columns in the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N general matrix to be reduced.
On exit,
if m >= n, the diagonal and the first superdiagonal are
overwritten with the upper bidiagonal matrix B; the
elements below the diagonal, with the array TAUQ, represent
the orthogonal matrix Q as a product of elementary
reflectors, and the elements above the first superdiagonal,
with the array TAUP, represent the orthogonal matrix P as
a product of elementary reflectors;
if m < n, the diagonal and the first subdiagonal are
overwritten with the lower bidiagonal matrix B; the
elements below the first subdiagonal, with the array TAUQ,
represent the orthogonal matrix Q as a product of
elementary reflectors, and the elements above the diagonal,
with the array TAUP, represent the orthogonal matrix P as
a product of elementary reflectors.
See Further Details.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
D (output) double precision array, dimension (min(M,N))
The diagonal elements of the bidiagonal matrix B:
D(i) = A(i,i).
E (output) double precision array, dimension (min(M,N)-1)
The off-diagonal elements of the bidiagonal matrix B:
if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
TAUQ (output) COMPLEX_16 array dimension (min(M,N))
The scalar factors of the elementary reflectors which
represent the orthogonal matrix Q. See Further Details.
TAUP (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors which
represent the orthogonal matrix P. See Further Details.
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The length of the array WORK. LWORK >= max(1,M,N).
For optimum performance LWORK >= (M+N)*NB, where NB
is the optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value.
Further Details
===============
The matrices Q and P are represented as products of elementary
reflectors:
If m >= n,
Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
Each H(i) and G(i) has the form:
H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
where tauq and taup are complex scalars, and v and u are complex vectors;
v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
tauq is stored in TAUQ(i) and taup in TAUP(i).
If m < n,
Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
Each H(i) and G(i) has the form:
H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
where tauq and taup are complex scalars, and v and u are complex vectors;
v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
tauq is stored in TAUQ(i) and taup in TAUP(i).
The contents of A on exit are illustrated by the following examples:
m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
( v1 v2 v3 v4 v5 )
where d and e denote diagonal and off-diagonal elements of B, vi
denotes an element of the vector defining H(i), and ui an element of
the vector defining G(i).
===================================================================== */
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex *da, *dwork;
magma_int_t ncol, nrow, jmax, nb, ldda;
static magma_int_t i, j, nx;
static cuDoubleComplex ws;
static magma_int_t iinfo;
magma_int_t minmn;
magma_int_t ldwrkx, ldwrky, lwkopt;
magma_int_t lquery;
ldda = m;
lwkopt = (m + n) * nb;
work[0] = MAGMA_Z_MAKE( lwkopt, 0. );
lquery = (lwork == -1);
/* Check arguments */
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if ( (lwork < max( max(1, m), n)) && (! lquery) ) {
*info = -10;
}
if (*info < 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Quick return if possible */
minmn = min(m,n);
if (minmn == 0) {
work[0] = c_one;
return *info;
}
if (MAGMA_SUCCESS != magma_zmalloc( &da, n*ldda + (m + n)*nb )) {
fprintf (stderr, "!!!! device memory allocation error in zgebrd\n" );
return *info;
}
dwork = da + (n)*ldda;
MAGMA_Z_SET2REAL( ws, max(m,n) );
ldwrkx = m;
ldwrky = n;
/* Set the block/unblock crossover point NX. */
nx = 128;
/* Copy the matrix to the GPU */
if (minmn-nx>=1)
magma_zsetmatrix( m, n, a, lda, da, ldda );
for (i=0; i< (minmn - nx); i += nb) {
/* Reduce rows and columns i:i+nb-1 to bidiagonal form and return
the matrices X and Y which are needed to update the unreduced
part of the matrix */
nrow = m - i;
ncol = n - i;
/* Get the current panel (no need for the 1st iteration) */
if ( i > 0 ) {
magma_zgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda );
magma_zgetmatrix( nb, ncol - nb,
dA(i, i+nb), ldda,
A( i, i+nb), lda );
}
magma_zlabrd_gpu(nrow, ncol, nb,
A(i, i), lda, dA(i, i), ldda,
d+i, e+i, tauq+i, taup+i,
work, ldwrkx, dwork, ldwrkx, // x, dx
work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky); // y, dy
/* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update
of the form A := A - V*Y' - X*U' */
nrow = m - i - nb;
ncol = n - i - nb;
// Send Y back to the GPU
magma_zsetmatrix( nrow, nb, work + nb, ldwrkx, dwork + nb, ldwrkx );
magma_zsetmatrix( ncol, nb,
work + (ldwrkx+1)*nb, ldwrky,
dwork + (ldwrkx+1)*nb, ldwrky );
nrow, ncol, nb,
c_neg_one, dA(i+nb, i ), ldda,
dwork+(ldwrkx+1)*nb, ldwrky,
c_one, dA(i+nb, i+nb), ldda);
nrow, ncol, nb,
c_neg_one, dwork+nb, ldwrkx,
dA( i, i+nb ), ldda,
c_one, dA( i+nb, i+nb ), ldda);
/* Copy diagonal and off-diagonal elements of B back into A */
if (m >= n) {
jmax = i + nb;
for (j = i; j < jmax; ++j) {
*A(j, j ) = MAGMA_Z_MAKE( d[j], 0. );
*A(j, j+1) = MAGMA_Z_MAKE( e[j], 0. );
}
} else {
jmax = i + nb;
for (j = i; j < jmax; ++j) {
*A(j, j ) = MAGMA_Z_MAKE( d[j], 0. );
*A(j+1, j ) = MAGMA_Z_MAKE( e[j], 0. );
/* L20: */
}
}
}
/* Use unblocked code to reduce the remainder of the matrix */
nrow = m - i;
ncol = n - i;
if ( 0 < (minmn-nx) )
magma_zgetmatrix( nrow, ncol, dA(i, i), ldda, A( i, i), lda );
lapackf77_zgebrd( &nrow, &ncol,
A(i, i), &lda, d+i, e+i,
tauq+i, taup+i, work, &lwork, &iinfo);
work[0] = ws;
magma_free( da );
return *info;
} /* zgebrd_ */

Here is the caller graph for this function:

magma_int_t magma_zgeev ( char  jobvl,
char  jobvr,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
cuDoubleComplex *  w,
cuDoubleComplex *  vl,
magma_int_t  ldvl,
cuDoubleComplex *  vr,
magma_int_t  ldvr,
cuDoubleComplex *  work,
magma_int_t  lwork,
double *  rwork,
magma_int_t info 
)

Definition at line 24 of file zgeev.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the
eigenvalues and, optionally, the left and/or right eigenvectors.
The right eigenvector v(j) of A satisfies
A * v(j) = lambda(j) * v(j)
where lambda(j) is its eigenvalue.
The left eigenvector u(j) of A satisfies
u(j)**H * A = lambda(j) * u(j)**H
where u(j)**H denotes the conjugate transpose of u(j).
The computed eigenvectors are normalized to have Euclidean norm
equal to 1 and largest component real.
Arguments
=========
JOBVL (input) CHARACTER*1
= 'N': left eigenvectors of A are not computed;
= 'V': left eigenvectors of are computed.
JOBVR (input) CHARACTER*1
= 'N': right eigenvectors of A are not computed;
= 'V': right eigenvectors of A are computed.
N (input) INTEGER
The order of the matrix A. N >= 0.
A (input/output) COMPLEX*16 array, dimension (LDA,N)
On entry, the N-by-N matrix A.
On exit, A has been overwritten.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
W (output) COMPLEX*16 array, dimension (N)
W contains the computed eigenvalues.
VL (output) COMPLEX*16 array, dimension (LDVL,N)
If JOBVL = 'V', the left eigenvectors u(j) are stored one
after another in the columns of VL, in the same order
as their eigenvalues.
If JOBVL = 'N', VL is not referenced.
u(j) = VL(:,j), the j-th column of VL.
LDVL (input) INTEGER
The leading dimension of the array VL. LDVL >= 1; if
JOBVL = 'V', LDVL >= N.
VR (output) COMPLEX*16 array, dimension (LDVR,N)
If JOBVR = 'V', the right eigenvectors v(j) are stored one
after another in the columns of VR, in the same order
as their eigenvalues.
If JOBVR = 'N', VR is not referenced.
v(j) = VR(:,j), the j-th column of VR.
LDVR (input) INTEGER
The leading dimension of the array VR. LDVR >= 1; if
JOBVR = 'V', LDVR >= N.
WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= (1+nb)*N.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
RWORK (workspace) DOUBLE PRECISION array, dimension (2*N)
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value.
> 0: if INFO = i, the QR algorithm failed to compute all the
eigenvalues, and no eigenvectors have been computed;
elements and i+1:N of W contain eigenvalues which have
converged.
===================================================================== */
magma_int_t c__1 = 1;
magma_int_t c__0 = 0;
magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
i__2, i__3;
double d__1, d__2;
cuDoubleComplex z__1, z__2;
magma_int_t i__, k, ihi;
double scl;
double dum[1], eps;
cuDoubleComplex tmp;
double anrm;
magma_int_t ierr, itau, iwrk, nout;
magma_int_t scalea;
double cscale;
magma_int_t select[1];
static double bignum;
magma_int_t minwrk;
magma_int_t wantvl;
static double smlnum;
static magma_int_t irwork;
magma_int_t lquery, wantvr;
magma_int_t nb = 0;
cuDoubleComplex *dT = NULL;
//magma_timestr_t start, end;
char side[2] = {0, 0};
char jobvl_[2] = {jobvl, 0};
char jobvr_[2] = {jobvr, 0};
*info = 0;
lquery = lwork == -1;
wantvl = lapackf77_lsame(jobvl_, "V");
wantvr = lapackf77_lsame(jobvr_, "V");
if (! wantvl && ! lapackf77_lsame(jobvl_, "N")) {
*info = -1;
} else if (! wantvr && ! lapackf77_lsame(jobvr_, "N")) {
*info = -2;
} else if (n < 0) {
*info = -3;
} else if (lda < max(1,n)) {
*info = -5;
} else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
*info = -8;
} else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
*info = -10;
}
/* Compute workspace */
if (*info == 0) {
minwrk = (1+nb)*n;
work[0] = MAGMA_Z_MAKE((double) minwrk, 0.);
if (lwork < minwrk && ! lquery) {
*info = -12;
}
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (n == 0) {
return *info;
}
// if eigenvectors are needed
#if defined(VERSION3)
if (MAGMA_SUCCESS != magma_zmalloc( &dT, nb*n )) {
return *info;
}
#endif
a_dim1 = lda;
a_offset = 1 + a_dim1;
a -= a_offset;
vl_dim1 = ldvl;
vl_offset = 1 + vl_dim1;
vl -= vl_offset;
vr_dim1 = ldvr;
vr_offset = 1 + vr_dim1;
vr -= vr_offset;
--work;
--rwork;
/* Get machine constants */
eps = lapackf77_dlamch("P");
smlnum = lapackf77_dlamch("S");
bignum = 1. / smlnum;
lapackf77_dlabad(&smlnum, &bignum);
smlnum = magma_dsqrt(smlnum) / eps;
bignum = 1. / smlnum;
/* Scale A if max element outside range [SMLNUM,BIGNUM] */
anrm = lapackf77_zlange("M", &n, &n, &a[a_offset], &lda, dum);
scalea = 0;
if (anrm > 0. && anrm < smlnum) {
scalea = 1;
cscale = smlnum;
} else if (anrm > bignum) {
scalea = 1;
cscale = bignum;
}
if (scalea) {
lapackf77_zlascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n, &a[a_offset], &lda, &
ierr);
}
/* Balance the matrix
(CWorkspace: none)
(RWorkspace: need N) */
ibal = 1;
lapackf77_zgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &rwork[ibal], &ierr);
/* Reduce to upper Hessenberg form
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: none) */
itau = 1;
iwrk = itau + n;
i__1 = lwork - iwrk + 1;
//start = get_current_time();
#if defined(VERSION1)
/*
* Version 1 - LAPACK
*/
lapackf77_zgehrd(&n, &ilo, &ihi, &a[a_offset], &lda,
&work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION2)
/*
* Version 2 - LAPACK consistent HRD
*/
magma_zgehrd2(n, ilo, ihi, &a[a_offset], lda,
&work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
/*
* Version 3 - LAPACK consistent MAGMA HRD + matrices T stored,
*/
magma_zgehrd(n, ilo, ihi, &a[a_offset], lda,
&work[itau], &work[iwrk], i__1, dT, &ierr);
#endif
//end = get_current_time();
//printf(" Time for zgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.);
if (wantvl) {
/* Want left eigenvectors
Copy Householder vectors to VL */
side[0] = 'L';
&a[a_offset], &lda, &vl[vl_offset], &ldvl);
/* Generate unitary matrix in VL
(CWorkspace: need 2*N-1, prefer N+(N-1)*NB)
(RWorkspace: none) */
i__1 = lwork - iwrk + 1;
//start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
/*
* Version 1 & 2 - LAPACK
*/
lapackf77_zunghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl,
&work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
/*
* Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
*/
magma_zunghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau],
dT, nb, &ierr);
#endif
//end = get_current_time();
//printf(" Time for zunghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);
/* Perform QR iteration, accumulating Schur vectors in VL
(CWorkspace: need 1, prefer HSWORK (see comments) )
(RWorkspace: none) */
iwrk = itau;
i__1 = lwork - iwrk + 1;
lapackf77_zhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array,
&vl[vl_offset], &ldvl, &work[iwrk], &i__1, info);
if (wantvr)
{
/* Want left and right eigenvectors
Copy Schur vectors to VR */
side[0] = 'B';
lapackf77_zlacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr);
}
} else if (wantvr) {
/* Want right eigenvectors
Copy Householder vectors to VR */
side[0] = 'R';
lapackf77_zlacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr);
/* Generate unitary matrix in VR
(CWorkspace: need 2*N-1, prefer N+(N-1)*NB)
(RWorkspace: none) */
i__1 = lwork - iwrk + 1;
//start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
/*
* Version 1 & 2 - LAPACK
*/
lapackf77_zunghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr,
&work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
/*
* Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
*/
magma_zunghr(n, ilo, ihi, &vr[vr_offset], ldvr,
&work[itau], dT, nb, &ierr);
#endif
//end = get_current_time();
//printf(" Time for zunghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);
/* Perform QR iteration, accumulating Schur vectors in VR
(CWorkspace: need 1, prefer HSWORK (see comments) )
(RWorkspace: none) */
iwrk = itau;
i__1 = lwork - iwrk + 1;
lapackf77_zhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array,
&vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
} else {
/* Compute eigenvalues only
(CWorkspace: need 1, prefer HSWORK (see comments) )
(RWorkspace: none) */
iwrk = itau;
i__1 = lwork - iwrk + 1;
lapackf77_zhseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array,
&vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
}
/* If INFO > 0 from ZHSEQR, then quit */
if (*info > 0) {
goto L50;
}
if (wantvl || wantvr) {
/* Compute left and/or right eigenvectors
(CWorkspace: need 2*N)
(RWorkspace: need 2*N) */
irwork = ibal + n;
lapackf77_ztrevc(side, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl,
&vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &rwork[irwork],
&ierr);
}
if (wantvl) {
/* Undo balancing of left eigenvectors
(CWorkspace: none)
(RWorkspace: need N) */
lapackf77_zgebak("B", "L", &n, &ilo, &ihi, &rwork[ibal], &n,
&vl[vl_offset], &ldvl, &ierr);
/* Normalize left eigenvectors and make largest component real */
for (i__ = 1; i__ <= n; ++i__) {
scl = 1. / cblas_dznrm2(n, &vl[i__ * vl_dim1 + 1], 1);
cblas_zdscal(n, scl, &vl[i__ * vl_dim1 + 1], 1);
i__2 = n;
for (k = 1; k <= i__2; ++k)
{
i__3 = k + i__ * vl_dim1;
/* Computing 2nd power */
d__1 = MAGMA_Z_REAL(vl[i__3]);
/* Computing 2nd power */
d__2 = MAGMA_Z_IMAG(vl[k + i__ * vl_dim1]);
rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2;
}
/* Comment:
Fortran BLAS does not have to add 1
C BLAS must add one to cblas_idamax */
k = cblas_idamax(n, &rwork[irwork], 1)+1;
z__2 = MAGMA_Z_CNJG(vl[k + i__ * vl_dim1]);
d__1 = magma_dsqrt(rwork[irwork + k - 1]);
MAGMA_Z_DSCALE(z__1, z__2, d__1);
tmp = z__1;
cblas_zscal(n, CBLAS_SADDR(tmp), &vl[i__ * vl_dim1 + 1], 1);
i__2 = k + i__ * vl_dim1;
i__3 = k + i__ * vl_dim1;
d__1 = MAGMA_Z_REAL(vl[i__3]);
MAGMA_Z_SET2REAL(z__1, d__1);
vl[i__2] = z__1;
}
}
if (wantvr) {
/* Undo balancing of right eigenvectors
(CWorkspace: none)
(RWorkspace: need N) */
lapackf77_zgebak("B", "R", &n, &ilo, &ihi, &rwork[ibal], &n,
&vr[vr_offset], &ldvr, &ierr);
/* Normalize right eigenvectors and make largest component real */
for (i__ = 1; i__ <= n; ++i__) {
scl = 1. / cblas_dznrm2(n, &vr[i__ * vr_dim1 + 1], 1);
cblas_zdscal(n, scl, &vr[i__ * vr_dim1 + 1], 1);
i__2 = n;
for (k = 1; k <= i__2; ++k) {
i__3 = k + i__ * vr_dim1;
/* Computing 2nd power */
d__1 = MAGMA_Z_REAL(vr[i__3]);
/* Computing 2nd power */
d__2 = MAGMA_Z_IMAG(vr[k + i__ * vr_dim1]);
rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2;
}
/* Comment:
Fortran BLAS does not have to add 1
C BLAS must add one to cblas_idamax */
k = cblas_idamax(n, &rwork[irwork], 1)+1;
z__2 = MAGMA_Z_CNJG(vr[k + i__ * vr_dim1]);
d__1 = magma_dsqrt(rwork[irwork + k - 1]);
MAGMA_Z_DSCALE(z__1, z__2, d__1);
tmp = z__1;
cblas_zscal(n, CBLAS_SADDR(tmp), &vr[i__ * vr_dim1 + 1], 1);
i__2 = k + i__ * vr_dim1;
i__3 = k + i__ * vr_dim1;
d__1 = MAGMA_Z_REAL(vr[i__3]);
MAGMA_Z_SET2REAL(z__1, d__1);
vr[i__2] = z__1;
}
}
/* Undo scaling if necessary */
L50:
if (scalea) {
i__1 = n - *info;
/* Computing MAX */
i__3 = n - *info;
i__2 = max(i__3,1);
lapackf77_zlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
geev_w_array + *info, &i__2, &ierr);
if (*info > 0) {
i__1 = ilo - 1;
lapackf77_zlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
geev_w_array, &n, &ierr);
}
}
#if defined(VERSION3)
magma_free( dT );
#endif
return *info;
} /* magma_zgeev */

Here is the caller graph for this function:

magma_int_t magma_zgehrd ( magma_int_t  n,
magma_int_t  ilo,
magma_int_t  ihi,
cuDoubleComplex *  A,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t  lwork,
cuDoubleComplex *  d_T,
magma_int_t info 
)

Definition at line 14 of file zgehrd.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEHRD reduces a COMPLEX_16 general matrix A to upper Hessenberg form H by
an orthogonal similarity transformation: Q' * A * Q = H . This version
stores the triangular matrices used in the factorization so that they can
be applied directly (i.e., without being recomputed) later. As a result,
the application of Q is much faster.
Arguments
=========
N (input) INTEGER
The order of the matrix A. N >= 0.
ILO (input) INTEGER
IHI (input) INTEGER
It is assumed that A is already upper triangular in rows
and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
set by a previous call to DGEBAL; otherwise they should be
set to 1 and N respectively. See Further Details.
1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the N-by-N general matrix to be reduced.
On exit, the upper triangle and the first subdiagonal of A
are overwritten with the upper Hessenberg matrix H, and the
elements below the first subdiagonal, with the array TAU,
represent the orthogonal matrix Q as a product of elementary
reflectors. See Further Details.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
TAU (output) COMPLEX_16 array, dimension (N-1)
The scalar factors of the elementary reflectors (see Further
Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
zero.
WORK (workspace/output) COMPLEX_16 array, dimension (LWORK)
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The length of the array WORK. LWORK >= max(1,N).
For optimum performance LWORK >= N*NB, where NB is the
optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
dT (output) COMPLEX_16 array on the GPU, dimension N*NB,
where NB is the optimal blocksize. It stores the NB*NB blocks
of the triangular T matrices, used the the reduction.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value.
Further Details
===============
The matrix Q is represented as a product of (ihi-ilo) elementary
reflectors
Q = H(ilo) H(ilo+1) . . . H(ihi-1).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
exit in A(i+2:ihi,i), and tau in TAU(i).
The contents of A are illustrated by the following example, with
n = 7, ilo = 2 and ihi = 6:
on entry, on exit,
( a a a a a a a ) ( a a h h h h a )
( a a a a a a ) ( a h h h h a )
( a a a a a a ) ( h h h h h h )
( a a a a a a ) ( v2 h h h h h )
( a a a a a a ) ( v2 v3 h h h h )
( a a a a a a ) ( v2 v3 v4 h h h )
( a ) ( a )
where a denotes an element of the original matrix A, h denotes a
modified element of the upper Hessenberg matrix H, and vi denotes an
element of the vector defining H(i).
This implementation follows the hybrid algorithm and notations described in
S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
form through hybrid GPU-based computing," University of Tennessee Computer
Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
May 24, 2009.
===================================================================== */
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_zero = MAGMA_Z_ZERO;
magma_int_t N = n, ldda = n;
magma_int_t nh, iws;
magma_int_t nbmin, iinfo;
magma_int_t ldwork;
magma_int_t lquery;
--tau;
*info = 0;
MAGMA_Z_SET2REAL( work[0], (double) n * nb );
lquery = lwork == -1;
if (n < 0) {
*info = -1;
} else if (ilo < 1 || ilo > max(1,n)) {
*info = -2;
} else if (ihi < min(ilo,n) || ihi > n) {
*info = -3;
} else if (lda < max(1,n)) {
*info = -5;
} else if (lwork < max(1,n) && ! lquery) {
*info = -8;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Quick return if possible */
nh = ihi - ilo + 1;
if (nh <= 1) {
work[0] = c_one;
return *info;
}
cuDoubleComplex *da;
if (MAGMA_SUCCESS != magma_zmalloc( &da, N*ldda + 2*N*nb + nb*nb )) {
return *info;
}
cuDoubleComplex *d_A = da;
cuDoubleComplex *d_work = da + (N+nb)*ldda;
cuDoubleComplex *t, *d_t;
t = (cuDoubleComplex*) malloc(nb*nb*sizeof(cuDoubleComplex));
if ( t == NULL ) {
magma_free( da );
return *info;
}
d_t = d_work + nb * ldda;
zzero_nbxnb_block(nb, d_A+N*ldda, ldda);
/* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
for (i__ = 1; i__ < ilo; ++i__)
tau[i__] = c_zero;
for (i__ = max(1,ihi); i__ < n; ++i__)
tau[i__] = c_zero;
for(i__=0; i__< nb*nb; i__+=4)
t[i__] = t[i__+1] = t[i__+2] = t[i__+3] = c_zero;
nbmin = 2;
iws = 1;
if (nb > 1 && nb < nh) {
/* Determine when to cross over from blocked to unblocked code
(last block is always handled by unblocked code) */
if (nb < nh) {
/* Determine if workspace is large enough for blocked code */
iws = n * nb;
if (lwork < iws) {
/* Not enough workspace to use optimal NB: determine the
minimum value of NB, and reduce NB or force use of
unblocked code */
nbmin = nb;
if (lwork >= n * nbmin)
nb = lwork / n;
else
nb = 1;
}
}
}
ldwork = n;
if (nb < nbmin || nb >= nh) {
/* Use unblocked code below */
i__ = ilo;
} else {
/* Use blocked code */
/* Copy the matrix to the GPU */
magma_zsetmatrix( N, N-ilo+1, a+(ilo-1)*(lda), lda, d_A, ldda );
for (i__ = ilo; i__ < ihi - nb; i__ += nb) {
/* Computing MIN */
ib = min(nb, ihi - i__);
/* Reduce columns i:i+ib-1 to Hessenberg form, returning the
matrices V and T of the block reflector H = I - V*T*V'
which performs the reduction, and also the matrix Y = A*V*T */
/* Get the current panel (no need for the 1st iteration) */
magma_zgetmatrix( ihi-i__+1, ib,
d_A + (i__ - ilo)*ldda + i__ - 1, ldda,
a + (i__ - 1 )*lda + i__ - 1, lda );
magma_zlahr2(ihi, i__, ib,
d_A + (i__ - ilo)*ldda,
d_A + N*ldda + 1,
a + (i__ - 1 )*(lda) , lda,
&tau[i__], t, nb, work, ldwork);
/* Copy T from the CPU to D_T on the GPU */
d_t = dT + (i__ - ilo)*nb;
magma_zsetmatrix( nb, nb, t, nb, d_t, nb );
magma_zlahru(n, ihi, i__ - 1, ib,
a + (i__ - 1 )*(lda), lda,
d_A + (i__ - ilo)*ldda,
d_A + (i__ - ilo)*ldda + i__ - 1,
d_A + N*ldda, d_t, d_work);
}
}
/* Use unblocked code to reduce the rest of the matrix */
if (!(nb < nbmin || nb >= nh))
magma_zgetmatrix( n, n-i__+1,
d_A+ (i__-ilo)*ldda, ldda,
a + (i__-1)*(lda), lda );
lapackf77_zgehd2(&n, &i__, &ihi, a, &lda, &tau[1], work, &iinfo);
MAGMA_Z_SET2REAL( work[0], (double) iws );
magma_free( da );
free(t);
return *info;
} /* magma_zgehrd */

Here is the caller graph for this function:

magma_int_t magma_zgehrd2 ( magma_int_t  n,
magma_int_t  ilo,
magma_int_t  ihi,
cuDoubleComplex *  A,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t lwork,
magma_int_t info 
)

Definition at line 14 of file zgehrd2.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEHRD2 reduces a COMPLEX_16 general matrix A to upper Hessenberg form H by
an orthogonal similarity transformation: Q' * A * Q = H .
Arguments
=========
N (input) INTEGER
The order of the matrix A. N >= 0.
ILO (input) INTEGER
IHI (input) INTEGER
It is assumed that A is already upper triangular in rows
and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
set by a previous call to DGEBAL; otherwise they should be
set to 1 and N respectively. See Further Details.
1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the N-by-N general matrix to be reduced.
On exit, the upper triangle and the first subdiagonal of A
are overwritten with the upper Hessenberg matrix H, and the
elements below the first subdiagonal, with the array TAU,
represent the orthogonal matrix Q as a product of elementary
reflectors. See Further Details.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
TAU (output) COMPLEX_16 array, dimension (N-1)
The scalar factors of the elementary reflectors (see Further
Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
zero.
WORK (workspace/output) COMPLEX_16 array, dimension (LWORK)
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The length of the array WORK. LWORK >= max(1,N).
For optimum performance LWORK >= N*NB, where NB is the
optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value.
Further Details
===============
The matrix Q is represented as a product of (ihi-ilo) elementary
reflectors
Q = H(ilo) H(ilo+1) . . . H(ihi-1).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
exit in A(i+2:ihi,i), and tau in TAU(i).
The contents of A are illustrated by the following example, with
n = 7, ilo = 2 and ihi = 6:
on entry, on exit,
( a a a a a a a ) ( a a h h h h a )
( a a a a a a ) ( a h h h h a )
( a a a a a a ) ( h h h h h h )
( a a a a a a ) ( v2 h h h h h )
( a a a a a a ) ( v2 v3 h h h h )
( a a a a a a ) ( v2 v3 v4 h h h )
( a ) ( a )
where a denotes an element of the original matrix A, h denotes a
modified element of the upper Hessenberg matrix H, and vi denotes an
element of the vector defining H(i).
This implementation follows the hybrid algorithm and notations described in
S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
form through hybrid GPU-based computing," University of Tennessee Computer
Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
May 24, 2009.
===================================================================== */
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_zero = MAGMA_Z_ZERO;
magma_int_t N = n, ldda = n;
magma_int_t nh, iws;
magma_int_t nbmin, iinfo;
magma_int_t ldwork;
magma_int_t lquery;
--tau;
*info = 0;
MAGMA_Z_SET2REAL( work[0], (double) n * nb );
lquery = *lwork == -1;
if (n < 0) {
*info = -1;
} else if (ilo < 1 || ilo > max(1,n)) {
*info = -2;
} else if (ihi < min(ilo,n) || ihi > n) {
*info = -3;
} else if (lda < max(1,n)) {
*info = -5;
} else if (*lwork < max(1,n) && ! lquery) {
*info = -8;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Quick return if possible */
nh = ihi - ilo + 1;
if (nh <= 1) {
work[0] = c_one;
return *info;
}
cuDoubleComplex *da;
if (MAGMA_SUCCESS != magma_zmalloc( &da, N*ldda + 2*N*nb + nb*nb )) {
return *info;
}
cuDoubleComplex *d_A = da;
cuDoubleComplex *d_work = da + (N+nb)*ldda;
cuDoubleComplex *t, *d_t;
t = (cuDoubleComplex*) malloc( nb*nb * sizeof(cuDoubleComplex));
if ( t == NULL ) {
magma_free( da );
return *info;
}
d_t = d_work + nb * ldda;
zzero_nbxnb_block(nb, d_A+N*ldda, ldda);
/* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
for (i__ = 1; i__ < ilo; ++i__)
tau[i__] = c_zero;
for (i__ = max(1,ihi); i__ < n; ++i__)
tau[i__] = c_zero;
for(i__=0; i__< nb*nb; i__+=4)
t[i__] = t[i__+1] = t[i__+2] = t[i__+3] = c_zero;
nbmin = 2;
iws = 1;
if (nb > 1 && nb < nh) {
/* Determine when to cross over from blocked to unblocked code
(last block is always handled by unblocked code) */
if (nb < nh) {
/* Determine if workspace is large enough for blocked code */
iws = n * nb;
if (*lwork < iws) {
/* Not enough workspace to use optimal NB: determine the
minimum value of NB, and reduce NB or force use of
unblocked code */
nbmin = nb;
if (*lwork >= n * nbmin)
nb = *lwork / n;
else
nb = 1;
}
}
}
ldwork = n;
if (nb < nbmin || nb >= nh) {
/* Use unblocked code below */
i__ = ilo;
} else {
/* Use blocked code */
/* Copy the matrix to the GPU */
magma_zsetmatrix( N, N-ilo+1, a+(ilo-1)*(lda), lda, d_A, ldda );
for (i__ = ilo; i__ < ihi - nb; i__ += nb) {
/* Computing MIN */
ib = min(nb, ihi - i__);
/* Reduce columns i:i+ib-1 to Hessenberg form, returning the
matrices V and T of the block reflector H = I - V*T*V'
which performs the reduction, and also the matrix Y = A*V*T */
/* Get the current panel (no need for the 1st iteration) */
magma_zgetmatrix( ihi-i__+1, ib,
d_A + (i__ - ilo)*ldda + i__ - 1, ldda,
a + (i__ - 1 )*lda + i__ - 1, lda );
magma_zlahr2(ihi, i__, ib,
d_A + (i__ - ilo)*ldda,
d_A + N*ldda + 1,
a + (i__ - 1 )*(lda) , lda,
&tau[i__], t, nb, work, ldwork);
/* Copy T from the CPU to D_T on the GPU */
magma_zsetmatrix( nb, nb, t, nb, d_t, nb );
magma_zlahru(n, ihi, i__ - 1, ib,
a + (i__ - 1 )*(lda), lda,
d_A + (i__ - ilo)*ldda,
d_A + (i__ - ilo)*ldda + i__ - 1,
d_A + N*ldda, d_t, d_work);
}
}
/* Use unblocked code to reduce the rest of the matrix */
if (!(nb < nbmin || nb >= nh))
magma_zgetmatrix( n, n-i__+1,
d_A+ (i__-ilo)*ldda, ldda,
a + (i__-1)*(lda), lda );
lapackf77_zgehd2(&n, &i__, &ihi, a, &lda, &tau[1], work, &iinfo);
MAGMA_Z_SET2REAL( work[0], (double) iws );
magma_free( da );
free(t);
return *info;
} /* magma_zgehrd2 */

Here is the caller graph for this function:

magma_int_t magma_zgelqf ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  A,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file zgelqf.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGELQF computes an LQ factorization of a COMPLEX_16 M-by-N matrix A:
A = L * Q.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and below the diagonal of the array
contain the m-by-min(m,n) lower trapezoidal matrix L (L is
lower triangular if m <= n); the elements above the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of elementary reflectors (see Further Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= max(1,M).
For optimum performance LWORK >= M*NB, where NB is the
optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
if INFO = -10 internal GPU memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(k) . . . H(2) H(1), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
cuDoubleComplex *dA, *dAT;
cuDoubleComplex c_one = MAGMA_Z_ONE;
magma_int_t maxm, maxn, maxdim, nb;
magma_int_t iinfo, ldda;
long int lquery;
/* Function Body */
*info = 0;
work[0] = MAGMA_Z_MAKE( (double)(m*nb), 0 );
lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,m) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (min(m, n) == 0) {
work[0] = c_one;
return *info;
}
maxm = ((m + 31)/32)*32;
maxn = ((n + 31)/32)*32;
maxdim = max(maxm, maxn);
if (maxdim*maxdim < 2*maxm*maxn)
{
ldda = maxdim;
if (MAGMA_SUCCESS != magma_zmalloc( &dA, maxdim*maxdim )) {
return *info;
}
magma_zsetmatrix( m, n, a, lda, dA, ldda );
dAT = dA;
magmablas_zinplace_transpose( dAT, ldda, ldda );
}
else
{
ldda = maxn;
if (MAGMA_SUCCESS != magma_zmalloc( &dA, 2*maxn*maxm )) {
return *info;
}
magma_zsetmatrix( m, n, a, lda, dA, maxm );
dAT = dA + maxn * maxm;
magmablas_ztranspose2( dAT, ldda, dA, maxm, m, n );
}
magma_zgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo);
if (maxdim*maxdim< 2*maxm*maxn){
magmablas_zinplace_transpose( dAT, ldda, ldda );
magma_zgetmatrix( m, n, dA, ldda, a, lda );
} else {
magmablas_ztranspose2( dA, maxm, dAT, ldda, n, m );
magma_zgetmatrix( m, n, dA, maxm, a, lda );
}
magma_free( dA );
return *info;
} /* magma_zgelqf */

Here is the caller graph for this function:

magma_int_t magma_zgelqf_gpu ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  dA,
magma_int_t  ldda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file zgelqf_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGELQF computes an LQ factorization of a COMPLEX_16 M-by-N matrix dA:
dA = L * Q.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
dA (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N)
On entry, the M-by-N matrix dA.
On exit, the elements on and below the diagonal of the array
contain the m-by-min(m,n) lower trapezoidal matrix L (L is
lower triangular if m <= n); the elements above the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of elementary reflectors (see Further Details).
LDA (input) INTEGER
The leading dimension of the array dA. LDA >= max(1,M).
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= max(1,M).
For optimum performance LWORK >= M*NB, where NB is the
optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
if INFO = -10 internal GPU memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(k) . . . H(2) H(1), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( dA+(a_2)*(lda) + (a_1))
cuDoubleComplex *dAT;
cuDoubleComplex c_one = MAGMA_Z_ONE;
magma_int_t maxm, maxn, maxdim, nb;
magma_int_t iinfo;
long int lquery;
*info = 0;
work[0] = MAGMA_Z_MAKE( (double)(m*nb), 0 );
lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,m) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (min(m, n) == 0) {
work[0] = c_one;
return *info;
}
maxm = ((m + 31)/32)*32;
maxn = ((n + 31)/32)*32;
maxdim = max(maxm, maxn);
int ldat = maxn;
dAT = dA;
if ((m == n) && (m % 32 == 0) && (lda%32 == 0)){
ldat = lda;
magmablas_zinplace_transpose( dAT, lda, maxm );
}
else {
if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn ) ){
return *info;
}
magmablas_ztranspose2( dAT, ldat, dA, lda, m, n );
}
magma_zgeqrf2_gpu(n, m, dAT, ldat, tau, &iinfo);
if ((m == n) && (m % 32 == 0) && (lda%32 == 0)){
magmablas_zinplace_transpose( dAT, ldat, maxm );
}
else {
magmablas_ztranspose2( dA, lda, dAT, ldat, n, m );
magma_free( dAT );
}
return *info;
} /* magma_zgelqf_gpu */

Here is the caller graph for this function:

magma_int_t magma_zgels3_gpu ( char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  nrhs,
cuDoubleComplex *  dA,
magma_int_t  ldda,
cuDoubleComplex *  dB,
magma_int_t  lddb,
cuDoubleComplex *  hwork,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file zgels3_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves the overdetermined, least squares problem
min || A*X - C ||
using the QR factorization A.
The underdetermined problem (m < n) is not currently handled.
Arguments
=========
TRANS (input) CHARACTER*1
= 'N': the linear system involves A.
Only trans='N' is currently handled.
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of columns of the matrix C. NRHS >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, A is overwritten by details of its QR
factorization as returned by ZGEQRF3.
LDDA (input) INTEGER
The leading dimension of the array A, LDDA >= M.
DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS)
On entry, the M-by-NRHS matrix C.
On exit, the N-by-NRHS solution matrix X.
LDDB (input) INTEGER
The leading dimension of the array DB. LDDB >= M.
HWORK (workspace/output) COMPLEX_16 array, dimension MAX(1,LWORK).
On exit, if INFO = 0, HWORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array HWORK, LWORK >= max(1,NRHS).
For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where
NB is the blocksize given by magma_get_zgeqrf_nb( M ).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the HWORK array, returns
this value as the first entry of the HWORK array.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
cuDoubleComplex *dT, *tau;
magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
long int lquery = (lwork == -1);
hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. );
*info = 0;
/* For now, N is the only case working */
if ( (trans != 'N') && (trans != 'n' ) )
*info = -1;
else if (m < 0)
*info = -2;
else if (n < 0 || m < n) /* LQ is not handle for now*/
*info = -3;
else if (nrhs < 0)
*info = -4;
else if (ldda < max(1,m))
*info = -6;
else if (lddb < max(1,m))
*info = -8;
else if (lwork < lwkopt && ! lquery)
*info = -10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
return *info;
}
/*
* Allocate temporary buffers
*/
int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb;
if (nb < nrhs)
ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs;
if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) {
return *info;
}
tau = (cuDoubleComplex*) malloc( k * sizeof(cuDoubleComplex) );
if( tau == NULL ) {
magma_free( dT );
return *info;
}
magma_zgeqrf3_gpu( m, n, dA, ldda, tau, dT, info );
if ( *info == 0 ) {
magma_zgeqrs3_gpu( m, n, nrhs,
dA, ldda, tau, dT,
dB, lddb, hwork, lwork, info );
}
magma_free( dT );
free(tau);
return *info;
}

Here is the caller graph for this function:

magma_int_t magma_zgels_gpu ( char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  nrhs,
cuDoubleComplex *  dA,
magma_int_t  ldda,
cuDoubleComplex *  dB,
magma_int_t  lddb,
cuDoubleComplex *  hwork,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file zgels_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves the overdetermined, least squares problem
min || A*X - C ||
using the QR factorization A.
The underdetermined problem (m < n) is not currently handled.
Arguments
=========
TRANS (input) CHARACTER*1
= 'N': the linear system involves A.
Only trans='N' is currently handled.
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of columns of the matrix C. NRHS >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, A is overwritten by details of its QR
factorization as returned by ZGEQRF.
LDDA (input) INTEGER
The leading dimension of the array A, LDDA >= M.
DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS)
On entry, the M-by-NRHS matrix C.
On exit, the N-by-NRHS solution matrix X.
LDDB (input) INTEGER
The leading dimension of the array DB. LDDB >= M.
HWORK (workspace/output) COMPLEX_16 array, dimension MAX(1,LWORK).
On exit, if INFO = 0, HWORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array HWORK, LWORK >= max(1,NRHS).
For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where
NB is the blocksize given by magma_get_zgeqrf_nb( M ).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the HWORK array, returns
this value as the first entry of the HWORK array.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
cuDoubleComplex *dT, *tau;
magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
long int lquery = (lwork == -1);
hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. );
*info = 0;
/* For now, N is the only case working */
if ( (trans != 'N') && (trans != 'n' ) )
*info = -1;
else if (m < 0)
*info = -2;
else if (n < 0 || m < n) /* LQ is not handle for now*/
*info = -3;
else if (nrhs < 0)
*info = -4;
else if (ldda < max(1,m))
*info = -6;
else if (lddb < max(1,m))
*info = -8;
else if (lwork < lwkopt && ! lquery)
*info = -10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
return *info;
}
/*
* Allocate temporary buffers
*/
int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb;
if (nb < nrhs)
ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs;
if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) {
return *info;
}
tau = (cuDoubleComplex*) malloc( k * sizeof(cuDoubleComplex) );
if( tau == NULL ) {
magma_free( dT );
return *info;
}
magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info );
if ( *info == 0 ) {
magma_zgeqrs_gpu( m, n, nrhs,
dA, ldda, tau, dT,
dB, lddb, hwork, lwork, info );
}
magma_free( dT );
free(tau);
return *info;
}

Here is the caller graph for this function:

magma_int_t magma_zgeqlf ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  A,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file zgeqlf.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQLF computes a QL factorization of a COMPLEX_16 M-by-N matrix A:
A = Q * L.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, if m >= n, the lower triangle of the subarray
A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L;
if m <= n, the elements on and below the (n-m)-th
superdiagonal contain the M-by-N lower trapezoidal matrix L;
the remaining elements, with the array TAU, represent the
orthogonal matrix Q as a product of elementary reflectors
(see Further Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= max(1,N).
For optimum performance LWORK >= N*NB, where NB can be obtained
through magma_get_zgeqlf_nb(M).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(k) . . . H(2) H(1), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in
A(1:m-k+i-1,n-k+i), and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
#define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))
cuDoubleComplex *da, *dwork;
cuDoubleComplex c_one = MAGMA_Z_ONE;
magma_int_t i, k, lddwork, old_i, old_ib, nb;
magma_int_t rows, cols;
magma_int_t ib, ki, kk, mu, nu, iinfo, ldda;
long int lquery;
*info = 0;
lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
}
if (*info == 0) {
k = min(m,n);
if (k == 0)
work[0] = c_one;
else {
work[0] = MAGMA_Z_MAKE( n*nb, 0 );
}
if (lwork < max(1,n) && ! lquery)
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Quick return if possible */
if (k == 0)
return *info;
lddwork = ((n+31)/32)*32;
ldda = ((m+31)/32)*32;
if (MAGMA_SUCCESS != magma_zmalloc( &da, (n)*ldda + nb*lddwork )) {
return *info;
}
dwork = da + ldda*(n);
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially.
The last kk columns are handled by the block method.
First, copy the matrix on the GPU except the last kk columns */
magma_zsetmatrix_async( (m), (n-nb),
a_ref(0, 0), lda,
da_ref(0, 0), ldda, stream[0] );
ki = ((k - nb - 1) / nb) * nb;
kk = min(k, ki + nb);
for (i = k - kk + ki; i >= k -kk; i -= nb) {
ib = min(k-i,nb);
if (i< k - kk + ki){
/* 1. Copy asynchronously the current panel to the CPU.
2. Copy asynchronously the submatrix below the panel
to the CPU) */
rows = m - k + i + ib;
da_ref(0, n-k+i), ldda,
a_ref(0, n-k+i), lda, stream[1] );
magma_zgetmatrix_async( (m-rows), ib,
da_ref(rows, n-k+i), ldda,
a_ref(rows, n-k+i), lda, stream[0] );
/* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in
two steps - implementing the lookahead techniques.
This is the main update from the lookahead techniques. */
rows = m - k + old_i + old_ib;
cols = n - k + old_i - old_ib;
rows, cols, old_ib,
da_ref(0, cols+old_ib), ldda, dwork, lddwork,
da_ref(0, 0 ), ldda, dwork+old_ib, lddwork);
}
magma_queue_sync( stream[1] );
/* Compute the QL factorization of the current block
A(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1) */
rows = m - k + i + ib;
cols = n - k + i;
lapackf77_zgeqlf(&rows,&ib, a_ref(0,cols), &lda, tau+i, work, &lwork, &iinfo);
if (cols > 0) {
/* Form the triangular factor of the block reflector
H = H(i+ib-1) . . . H(i+1) H(i) */
&rows, &ib,
a_ref(0, cols), &lda, tau + i, work, &ib);
zpanel_to_q( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib);
magma_zsetmatrix( rows, ib,
a_ref(0,cols), lda,
da_ref(0,cols), ldda );
zq_to_panel( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib);
// Send the triangular part on the GPU
magma_zsetmatrix( ib, ib, work, ib, dwork, lddwork );
/* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in
two steps - implementing the lookahead techniques.
This is the update of first ib columns. */
if (i-ib >= k -kk)
rows, ib, ib,
da_ref(0, cols), ldda, dwork, lddwork,
da_ref(0,cols-ib), ldda, dwork+ib, lddwork);
else{
rows, cols, ib,
da_ref(0, cols), ldda, dwork, lddwork,
da_ref(0, 0 ), ldda, dwork+ib, lddwork);
}
old_i = i;
old_ib = ib;
}
}
mu = m - k + i + nb;
nu = n - k + i + nb;
magma_zgetmatrix( m, nu, da_ref(0,0), ldda, a_ref(0,0), lda );
} else {
mu = m;
nu = n;
}
/* Use unblocked code to factor the last or only block */
if (mu > 0 && nu > 0)
lapackf77_zgeqlf(&mu, &nu, a_ref(0,0), &lda, tau, work, &lwork, &iinfo);
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free( da );
return *info;
} /* magma_zgeqlf */

Here is the caller graph for this function:

magma_int_t magma_zgeqrf ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  A,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file zgeqrf.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
A = Q * R. This version does not require work space on the GPU
passed as input. GPU memory is allocated in the routine.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= N*NB,
where NB can be obtained through magma_get_zgeqrf_nb(M).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
#define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))
cuDoubleComplex *da, *dwork;
cuDoubleComplex c_one = MAGMA_Z_ONE;
int i, k, lddwork, old_i, old_ib;
int ib, ldda;
/* Function Body */
*info = 0;
int nb = magma_get_zgeqrf_nb(min(m, n));
int lwkopt = n * nb;
work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
long int lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,n) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
work[0] = c_one;
return *info;
}
lddwork = ((n+31)/32)*32;
ldda = ((m+31)/32)*32;
magma_int_t num_gpus = 1;
char * num_gpus_char = getenv("MAGMA_NUM_GPUS");
if (num_gpus_char != NULL ) num_gpus = atoi(num_gpus_char);
if( num_gpus > 1 ) {
/* call multiple-GPU interface */
return magma_zgeqrf4(num_gpus, m, n, a, lda, tau, work, lwork, info);
}
if (MAGMA_SUCCESS != magma_zmalloc( &da, (n)*ldda + nb*lddwork )) {
/* Switch to the "out-of-core" (out of GPU-memory) version */
return magma_zgeqrf_ooc(m, n, a, lda, tau, work, lwork, info);
}
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
dwork = da + ldda*(n);
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially */
magma_zsetmatrix_async( (m), (n-nb),
a_ref(0,nb), lda,
da_ref(0,nb), ldda, stream[0] );
old_i = 0; old_ib = nb;
for (i = 0; i < k-nb; i += nb) {
ib = min(k-i, nb);
if (i>0){
da_ref(i,i), ldda,
a_ref(i,i), lda, stream[1] );
da_ref(0,i), ldda,
a_ref(0,i), lda, stream[0] );
/* Apply H' to A(i:m,i+2*ib:n) from the left */
m-old_i, n-old_i-2*old_ib, old_ib,
da_ref(old_i, old_i), ldda, dwork, lddwork,
da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);
}
magma_queue_sync( stream[1] );
int rows = m-i;
lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib, a_ref(i,i), &lda, tau+i, work, &ib);
zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib);
magma_zsetmatrix( rows, ib, a_ref(i,i), lda, da_ref(i,i), ldda );
zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib);
if (i + ib < n) {
magma_zsetmatrix( ib, ib, work, ib, dwork, lddwork );
if (i+ib < k-nb)
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
da_ref(i, i ), ldda, dwork, lddwork,
da_ref(i, i+ib), ldda, dwork+ib, lddwork);
else
rows, n-i-ib, ib,
da_ref(i, i ), ldda, dwork, lddwork,
da_ref(i, i+ib), ldda, dwork+ib, lddwork);
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
if (i!=0)
magma_zgetmatrix( m, ib, da_ref(0,i), ldda, a_ref(0,i), lda );
int rows = m-i;
lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);
}
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free( da );
return *info;
} /* magma_zgeqrf */

Here is the caller graph for this function:

magma_int_t magma_zgeqrf2 ( magma_context cntxt,
magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 73 of file zgeqrf-v2.cpp.

References __func__, a_ref, da_ref, dwork, magma_qr_params::flag, magma_qr_params::ib, lapackf77_zgeqrf, lapackf77_zlarft, MAGMA_ERR_ILLEGAL_VALUE, MAGMA_SUCCESS, magma_xerbla(), MAGMA_Z_MAKE, MAGMA_Z_ONE, magma_zgeqrf_mc(), magma_zlarfb_gpu(), MagmaColumnwise, MagmaColumnwiseStr, MagmaConjTrans, MagmaForward, MagmaForwardStr, MagmaLeft, MagmaUpper, max, min, magma_qr_params::nb, context::nb, magma_qr_params::p, context::params, magma_qr_params::t, magma_qr_params::w, zpanel_to_q(), and zq_to_panel().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
A = Q * R. This version does not require work space on the GPU
passed as input. GPU memory is allocated in the routine.
Arguments
=========
CNTXT (input) MAGMA_CONTEXT
CNTXT specifies the MAGMA hardware context for this routine.
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using cudaMallocHost.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using cudaMallocHost.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= N*NB,
where NB can be obtained through magma_get_zgeqrf_nb(M).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
if INFO = -8, the GPU memory allocation failed
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
#define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))
int cnt=-1;
cuDoubleComplex c_one = MAGMA_Z_ONE;
int i, k, lddwork, old_i, old_ib;
int nbmin, nx, ib, ldda;
*info = 0;
magma_qr_params *qr_params = (magma_qr_params *)cntxt->params;
int nb = qr_params->nb;
int lwkopt = n * nb;
work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
long int lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,n) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
}
else if (lquery)
return MAGMA_SUCCESS;
k = min(m,n);
if (k == 0) {
work[0] = c_one;
return MAGMA_SUCCESS;
}
cublasStatus status;
static cudaStream_t stream[2];
cudaStreamCreate(&stream[0]);
cudaStreamCreate(&stream[1]);
nbmin = 2;
nx = nb;
lddwork = ((n+31)/32)*32;
ldda = ((m+31)/32)*32;
cuDoubleComplex *da;
status = cublasAlloc((n)*ldda + nb*lddwork, sizeof(cuDoubleComplex), (void**)&da);
if (status != CUBLAS_STATUS_SUCCESS) {
*info = -8;
return 0;
}
cuDoubleComplex *dwork = da + ldda*(n);
if (nb >= nbmin && nb < k && nx < k) {
/* Use blocked code initially */
cudaMemcpy2DAsync(da_ref(0,nb), ldda*sizeof(cuDoubleComplex),
a_ref(0,nb), lda *sizeof(cuDoubleComplex),
sizeof(cuDoubleComplex)*(m), (n-nb),
cudaMemcpyHostToDevice,stream[0]);
old_i = 0; old_ib = nb;
for (i = 0; i < k-nx; i += nb) {
ib = min(k-i, nb);
if (i>0){
cudaMemcpy2DAsync( a_ref(i,i), lda *sizeof(cuDoubleComplex),
da_ref(i,i), ldda*sizeof(cuDoubleComplex),
sizeof(cuDoubleComplex)*(m-i), ib,
cudaMemcpyDeviceToHost,stream[1]);
cudaMemcpy2DAsync( a_ref(0,i), lda *sizeof(cuDoubleComplex),
da_ref(0,i), ldda*sizeof(cuDoubleComplex),
sizeof(cuDoubleComplex)*i, ib,
cudaMemcpyDeviceToHost,stream[0]);
/* Apply H' to A(i:m,i+2*ib:n) from the left */
m-old_i, n-old_i-2*old_ib, old_ib,
da_ref(old_i, old_i), ldda, dwork, lddwork,
da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);
}
cudaStreamSynchronize(stream[1]);
int rows = m-i;
cnt++;
cntxt->nb = qr_params->ib;
magma_zgeqrf_mc(cntxt, &rows, &ib, a_ref(i,i), &lda,
tau+i, work, &lwork, info);
cntxt->nb = nb;
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib);
if (cnt < qr_params->np_gpu) {
qr_params->p[cnt]=a;
}
zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb);
cublasSetMatrix(rows, ib, sizeof(cuDoubleComplex),
a_ref(i,i), lda, da_ref(i,i), ldda);
if (qr_params->flag == 1)
zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb);
if (i + ib < n) {
cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex), qr_params->t+cnt*nb*nb, ib, dwork, lddwork);
if (i+ib < k-nx)
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
da_ref(i, i ), ldda, dwork, lddwork,
da_ref(i, i+ib), ldda, dwork+ib, lddwork);
else
rows, n-i-ib, ib,
da_ref(i, i ), ldda, dwork, lddwork,
da_ref(i, i+ib), ldda, dwork+ib, lddwork);
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k)
{
ib = n-i;
if (i!=0)
cublasGetMatrix(m, ib, sizeof(cuDoubleComplex),
da_ref(0,i), ldda, a_ref(0,i), lda);
int rows = m-i;
cnt++;
lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);
if (cnt < qr_params->np_gpu)
{
int ib2=min(ib,nb);
&rows, &ib2, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib2);
qr_params->p[cnt]=a;
}
}
cudaStreamDestroy( stream[0] );
cudaStreamDestroy( stream[1] );
cublasFree(da);
return MAGMA_SUCCESS;
} /* magma_zgeqrf */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_zgeqrf2_gpu ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  dA,
magma_int_t  ldda,
cuDoubleComplex *  tau,
magma_int_t info 
)

Definition at line 14 of file zgeqrf2_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEQRF computes a QR factorization of a complex M-by-N matrix A:
A = Q * R.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
On entry, the M-by-N matrix dA.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDDA (input) INTEGER
The leading dimension of the array dA. LDDA >= max(1,M).
To benefit from coalescent memory accesses LDDA must be
dividable by 16.
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1))
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
cuDoubleComplex *dwork;
cuDoubleComplex *work;
int i, k, ldwork, lddwork, old_i, old_ib, rows;
int nbmin, nx, ib, nb;
int lhwork, lwork;
/* Function Body */
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = min(m,n);
if (k == 0)
return *info;
lwork = (m+n) * nb;
lhwork = lwork - (m)*nb;
if (MAGMA_SUCCESS != magma_zmalloc( &dwork, (n)*nb )) {
return *info;
}
if (MAGMA_SUCCESS != magma_zmalloc_host( &work, lwork )) {
magma_free( dwork );
return *info;
}
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
nbmin = 2;
nx = nb;
ldwork = m;
lddwork= n;
if (nb >= nbmin && nb < k && nx < k) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nx; i += nb) {
ib = min(k-i, nb);
rows = m -i;
dA(i,i), ldda,
work_ref(i), ldwork, stream[1] );
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left */
m-old_i, n-old_i-2*old_ib, old_ib,
dA(old_i, old_i ), ldda, dwork, lddwork,
dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);
magma_zsetmatrix_async( old_ib, old_ib,
work_ref(old_i), ldwork,
dA(old_i, old_i), ldda, stream[0] );
}
magma_queue_sync( stream[1] );
lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib,
work_ref(i), &ldwork, tau+i, hwork, &ib);
zpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
magma_zsetmatrix( rows, ib, work_ref(i), ldwork, dA(i,i), ldda );
zq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
if (i + ib < n) {
magma_zsetmatrix( ib, ib, hwork, ib, dwork, lddwork );
if (i+nb < k-nx)
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
dA(i, i ), ldda, dwork, lddwork,
dA(i, i+ib), ldda, dwork+ib, lddwork);
else {
rows, n-i-ib, ib,
dA(i, i ), ldda, dwork, lddwork,
dA(i, i+ib), ldda, dwork+ib, lddwork);
work_ref(i), ldwork,
dA(i,i), ldda );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
magma_free( dwork );
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows );
lhwork = lwork - rows*ib;
lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda );
}
magma_free_host( work );
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
return *info;
} /* magma_zgeqrf2_gpu */

Here is the caller graph for this function:

magma_int_t magma_zgeqrf3 ( magma_context cntxt,
magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 73 of file zgeqrf-v3.cpp.

References __func__, magma_qr_params::fb, magma_qr_params::flag, magma_qr_params::m, MAGMA_ERR_ILLEGAL_VALUE, MAGMA_SUCCESS, magma_xerbla(), MAGMA_Z_MAKE, MAGMA_Z_ONE, magma_zgeqrf2(), MagmaUpper, max, min, magma_qr_params::n, magma_qr_params::nb, magma_qr_params::np_gpu, magma_qr_params::nthreads, magma_qr_params::ob, magma_qr_params::p, context::params, magma_qr_params::sync0, magma_qr_params::sync1, magma_qr_params::sync2, magma_qr_params::w, and zq_to_panel().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
A = Q * R.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= N*NB.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
==================================================================== */
cuDoubleComplex c_one = MAGMA_Z_ONE;
int k, ib;
magma_qr_params *qr_params = (magma_qr_params *)cntxt->params;
*info = 0;
int lwkopt = n * qr_params->nb;
work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
long int lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,n) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
}
else if (lquery)
return MAGMA_SUCCESS;
k = min(m,n);
if (k == 0) {
work[0] = c_one;
return MAGMA_SUCCESS;
}
int M=qr_params->nthreads*qr_params->ob;
int N=qr_params->nthreads*qr_params->ob;
if (qr_params->m > qr_params->n)
M = qr_params->m - (qr_params->n-qr_params->nthreads*qr_params->ob);
/* Use MAGMA code to factor left portion of matrix, waking up threads
along the way to perform updates on the right portion of matrix */
magma_zgeqrf2(cntxt, m, n - qr_params->nthreads * qr_params->ob,
a, lda, tau, work, lwork, info);
/* Wait for all update threads to finish */
for (k = 0; k < qr_params->nthreads; k++) {
while (qr_params->sync1[k] == 0) {
sched_yield();
}
}
/* Unzero upper part of each panel */
for (k = 0; k < qr_params->np_gpu-1; k++){
ib = min(qr_params->nb,(n-qr_params->nthreads*qr_params->ob)-qr_params->nb*k);
zq_to_panel(MagmaUpper, ib, a + k*qr_params->nb*lda + k*qr_params->nb, lda,
qr_params->w+qr_params->nb*qr_params->nb*k);
}
/* Use final blocking size */
qr_params->nb = qr_params->fb;
/* Flag MAGMA code to internally unzero upper part of each panel */
qr_params->flag = 1;
/* Use MAGMA code to perform final factorization if necessary */
if (qr_params->m > (qr_params->n - (qr_params->nthreads*qr_params->ob)))
if (M > (qr_params->m-(qr_params->n-(qr_params->ob*qr_params->nthreads))))
M = qr_params->m-(qr_params->n-(qr_params->ob*qr_params->nthreads));
magma_zgeqrf2(cntxt, M, N,
a + (n-qr_params->nthreads*qr_params->ob)*m+
(n-qr_params->nthreads*qr_params->ob), lda,
&tau[n-qr_params->nthreads*qr_params->ob],
work, lwork, info);
/* Prepare for next run */
for (k = 0; k < qr_params->np_gpu; k++) {
qr_params->p[k] = NULL;
}
for (k = 0; k < qr_params->nthreads; k++) {
qr_params->sync1[k] = 0;
}
/* Infrastructure for next run is not in place yet */
qr_params->sync0 = 0;
/* Signal update threads to get in position for next run */
qr_params->sync2 = 1;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_zgeqrf3_gpu ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  dA,
magma_int_t  ldda,
cuDoubleComplex *  tau,
cuDoubleComplex *  dT,
magma_int_t info 
)

Definition at line 38 of file zgeqrf3_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEQRF3 computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
A = Q * R. This version stores the triangular matrices T used in
the block QR factorization so that Q can be applied directly (i.e.,
without being recomputed) later. As a result, the application
of Q is much faster. The upper triangular matrices for V have 0s
in them and the corresponding parts of the upper triangular R are
stored separately.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDDA (input) INTEGER
The leading dimension of the array A. LDDA >= max(1,M).
To benefit from coalescent memory accesses LDDA must be
dividable by 16.
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
dT (workspace/output) COMPLEX_16 array on the GPU,
dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
where NB can be obtained through magma_get_zgeqrf_nb(M).
It starts with MIN(M,N)*NB block that store the triangular T
matrices, followed by the MIN(M,N)*NB block of the diagonal
matrices for the R matrix. The rest of the array is used as workspace.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
#define t_ref(a_1) (dT+(a_1)*nb)
#define d_ref(a_1) (dT+(minmn+(a_1))*nb)
#define dd_ref(a_1) (dT+(2*minmn+(a_1))*nb)
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
magma_int_t ib, nb;
magma_int_t ldwork, lddwork, lwork, lhwork;
cuDoubleComplex *work, *ut;
/* check arguments */
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = minmn = min(m,n);
if (k == 0)
return *info;
lwork = (m + n + nb)*nb;
lhwork = lwork - m*nb;
if (MAGMA_SUCCESS != magma_zmalloc_host( &work, lwork )) {
return *info;
}
ut = hwork+nb*(n);
memset( ut, 0, nb*nb*sizeof(cuDoubleComplex));
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
ldwork = m;
lddwork= n;
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nb; i += nb) {
ib = min(k-i, nb);
rows = m -i;
a_ref(i,i), ldda,
work_ref(i), ldwork, stream[1] );
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left */
cols = n-old_i-2*old_ib;
m-old_i, cols, old_ib,
a_ref(old_i, old_i ), ldda, t_ref(old_i), nb,
a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork);
/* store the diagonal */
magma_zsetmatrix_async( old_ib, old_ib,
ut, old_ib,
d_ref(old_i), old_ib, stream[0] );
}
magma_queue_sync( stream[1] );
lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib,
work_ref(i), &ldwork, tau+i, hwork, &ib);
/* Put 0s in the upper triangular part of a panel (and 1s on the
diagonal); copy the upper triangular in ut. */
magma_queue_sync( stream[0] );
zsplit_diag_block3(ib, work_ref(i), ldwork, ut);
magma_zsetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda );
if (i + ib < n) {
/* Send the triangular factor T to the GPU */
magma_zsetmatrix( ib, ib, hwork, ib, t_ref(i), nb );
if (i+nb < k-nb){
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
a_ref(i, i ), ldda, t_ref(i), nb,
a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
}
else {
cols = n-i-ib;
rows, cols, ib,
a_ref(i, i ), ldda, t_ref(i), nb,
a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
/* Fix the diagonal block */
magma_zsetmatrix( ib, ib, ut, ib, d_ref(i), ib );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
magma_zgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows );
lhwork = lwork - rows*ib;
lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
magma_zsetmatrix( rows, ib, work, rows, a_ref(i, i), ldda );
}
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free_host( work );
return *info;
/* End of MAGMA_ZGEQRF */
} /* magma_zgeqrf */

Here is the caller graph for this function:

magma_int_t magma_zgeqrf_gpu ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  dA,
magma_int_t  ldda,
cuDoubleComplex *  tau,
cuDoubleComplex *  dT,
magma_int_t info 
)

Definition at line 41 of file zgeqrf_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
A = Q * R. This version stores the triangular matrices used in
the factorization so that they can be applied directly (i.e.,
without being recomputed) later. As a result, the application
of Q is much faster.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDDA (input) INTEGER
The leading dimension of the array A. LDDA >= max(1,M).
To benefit from coalescent memory accesses LDDA must be
dividable by 16.
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
dT (workspace/output) COMPLEX_16 array on the GPU,
dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
where NB can be obtained through magma_get_zgeqrf_nb(M).
It starts with MIN(M,N)*NB block that store the triangular T
matrices, followed by the MIN(M,N)*NB block of the diagonal
inverses for the R matrix. The rest of the array is used as workspace.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
#define t_ref(a_1) (dT+(a_1)*nb)
#define d_ref(a_1) (dT+(minmn+(a_1))*nb)
#define dd_ref(a_1) (dT+(2*minmn+(a_1))*nb)
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
magma_int_t ib, nb;
magma_int_t ldwork, lddwork, lwork, lhwork;
cuDoubleComplex *work, *ut;
/* check arguments */
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = minmn = min(m,n);
if (k == 0)
return *info;
lwork = (m + n + nb)*nb;
lhwork = lwork - m*nb;
if (MAGMA_SUCCESS != magma_zmalloc_host( &work, lwork )) {
return *info;
}
ut = hwork+nb*(n);
memset( ut, 0, nb*nb*sizeof(cuDoubleComplex));
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
ldwork = m;
lddwork= n;
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nb; i += nb) {
ib = min(k-i, nb);
rows = m -i;
a_ref(i,i), ldda,
work_ref(i), ldwork, stream[1] );
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left */
cols = n-old_i-2*old_ib;
m-old_i, cols, old_ib,
a_ref(old_i, old_i ), ldda, t_ref(old_i), nb,
a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork);
/* store the diagonal */
magma_zsetmatrix_async( old_ib, old_ib,
ut, old_ib,
d_ref(old_i), old_ib, stream[0] );
}
magma_queue_sync( stream[1] );
lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib,
work_ref(i), &ldwork, tau+i, hwork, &ib);
/* Put 0s in the upper triangular part of a panel (and 1s on the
diagonal); copy the upper triangular in ut and invert it */
magma_queue_sync( stream[0] );
zsplit_diag_block(ib, work_ref(i), ldwork, ut);
magma_zsetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda );
if (i + ib < n) {
/* Send the triangular factor T to the GPU */
magma_zsetmatrix( ib, ib, hwork, ib, t_ref(i), nb );
if (i+nb < k-nb){
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
a_ref(i, i ), ldda, t_ref(i), nb,
a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
}
else {
cols = n-i-ib;
rows, cols, ib,
a_ref(i, i ), ldda, t_ref(i), nb,
a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
/* Fix the diagonal block */
magma_zsetmatrix( ib, ib, ut, ib, d_ref(i), ib );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
magma_zgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows );
lhwork = lwork - rows*ib;
lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
magma_zsetmatrix( rows, ib, work, rows, a_ref(i, i), ldda );
}
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free_host( work );
return *info;
/* End of MAGMA_ZGEQRF */
} /* magma_zgeqrf */

Here is the caller graph for this function:

magma_int_t magma_zgeqrf_mc ( magma_context cntxt,
magma_int_t m,
magma_int_t n,
cuDoubleComplex *  A,
magma_int_t lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t lwork,
magma_int_t info 
)

Definition at line 361 of file zgeqrf_mc.cpp.

References __func__, A, MAGMA_ERR_ILLEGAL_VALUE, magma_get_zpotrf_nb(), magma_xerbla(), MAGMA_Z_MAKE, MAGMA_Z_NEG_ONE, MAGMA_Z_ONE, max, min, context::nb, context::num_cores, context::num_gpus, context::quark, QUARK_Barrier(), QUARK_Insert_Task_zgemm(), QUARK_Insert_Task_zgeqrt(), QUARK_Insert_Task_zlarfb(), QUARK_Insert_Task_ztrmm(), T, and W.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEQRF computes a QR factorization of a complex M-by-N matrix A:
A = Q * R.
Arguments
=========
CNTXT (input) MAGMA_CONTEXT
CNTXT specifies the MAGMA hardware context for this routine.
M (input) magma_int_tEGER
The number of rows of the matrix A. M >= 0.
N (input) magma_int_tEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDA (input) magma_int_tEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) magma_int_tEGER
The dimension of the array WORK. LWORK >= N*NB.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) magma_int_tEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
==================================================================== */
if (cntxt->num_cores == 1 && cntxt->num_gpus == 1)
{
//magma_int_t result = magma_zgeqrf(*m, *n, a, *lda, tau, work, *lwork, info);
//return result;
}
magma_int_t i,j,l;
magma_int_t ii=-1,jj=-1,ll=-1;
Quark* quark = cntxt->quark;
// DAG labels
char sgeqrt_dag_label[1000];
char slarfb_dag_label[1000];
char strmm_dag_label[1000];
char sgemm_dag_label[1000];
*info = 0;
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
magma_int_t nb = (cntxt->nb ==-1)? magma_get_zpotrf_nb(*n): cntxt->nb;
magma_int_t lwkopt = *n * nb;
work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
long int lquery = *lwork == -1;
// check input arguments
if (*m < 0) {
*info = -1;
} else if (*n < 0) {
*info = -2;
} else if (*lda < max(1,*m)) {
*info = -4;
} else if (*lwork < max(1,*n) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
}
else if (lquery)
return 0;
magma_int_t k = min(*m,*n);
if (k == 0) {
work[0] = c_one;
return 0;
}
magma_int_t nt = (((*n)%nb) == 0) ? (*n)/nb : (*n)/nb + 1;
magma_int_t mt = (((*m)%nb) == 0) ? (*m)/nb : (*m)/nb + 1;
cuDoubleComplex **local_work = (cuDoubleComplex**) malloc(sizeof(cuDoubleComplex*)*(nt-1)*mt);
memset(local_work, 0, sizeof(cuDoubleComplex*)*(nt-1)*mt);
magma_int_t priority;
// traverse diagonal blocks
for (i = 0; i < k; i += nb) {
ii++;
jj = ii;
sprintf(sgeqrt_dag_label, "GEQRT %d",ii);
// factor diagonal block, also compute T matrix
0, (*m)-i, min(nb,(*n)-i), A(i,i), *lda, T(i), nb, &tau[i], sgeqrt_dag_label);
if (i > 0) {
priority = 100;
// update panels in a left looking fashion
for (j = (i-nb) + (2*nb); j < *n; j += nb) {
jj++;
ll = ii-1;
sprintf(slarfb_dag_label, "LARFB %d %d",ii-1, jj);
// perform part of update
(*m)-(i-nb), min(nb,(*n)-(i-nb)), min(nb,(*m)-(i-nb)), min(nb,(*n)-j), nb,
A(i-nb,i-nb), *lda, A(i-nb,j), *lda, T(i-nb), nb, W(ii-1,jj), nb, slarfb_dag_label, priority);
sprintf(strmm_dag_label, "TRMM %d %d",ii-1, jj);
// perform more of update
QUARK_Insert_Task_ztrmm(quark, 0, min(nb,(*m)-(i-nb)), min(nb,(*n)-j), c_neg_one,
A(i-nb,i-nb), *lda, W(ii-1,jj), nb, c_one, A(i-nb,j), *lda, strmm_dag_label, priority);
sprintf(sgemm_dag_label, "GEMM %d %d %d",ii-1, jj, ll);
// finish update
QUARK_Insert_Task_zgemm(quark, 0, (*m)-i, min(nb,(*n)-j), min(nb,(*n)-(i-nb)), c_neg_one,
A(i,i-nb), *lda, W(ii-1,jj), nb, c_one, A(i,j), *lda, A(i,j), sgemm_dag_label, priority, jj);
}
}
j = i + nb;
jj = ii;
// handle case of short wide rectangular matrix
if (j < (*n)) {
priority = 0;
jj++;
ll = ii;
sprintf(slarfb_dag_label, "LARFB %d %d",ii, jj);
// perform part of update
(*m)-i, min(nb,(*n)-i), min(nb,(*m)-i), min(nb,(*n)-j), nb,
A(i,i), *lda, A(i,j), *lda, T(i), nb, W(ii,jj), nb, slarfb_dag_label, priority);
sprintf(strmm_dag_label, "TRMM %d %d",ii, jj);
// perform more of update
QUARK_Insert_Task_ztrmm(quark, 0, min(nb,(*m)-i), min(nb,(*n)-j), c_neg_one,
A(i,i), *lda, W(ii,jj), nb, c_one, A(i,j), *lda, strmm_dag_label, priority);
sprintf(sgemm_dag_label, "GEMM %d %d %d",ii, jj, ll);
// finish update
QUARK_Insert_Task_zgemm(quark, 0, (*m)-i-nb, min(nb,(*n)-j), min(nb,(*n)-i), c_neg_one,
A(i+nb,i), *lda, W(ii,jj), nb, c_one, A(i+nb,j), *lda, A(i+nb,j), sgemm_dag_label, priority, jj);
}
}
// wait for all tasks to finish executing
QUARK_Barrier(quark);
// free memory
for(k = 0 ; k < (nt-1)*mt; k++) {
if (local_work[k] != NULL) {
free(local_work[k]);
}
}
free(local_work);
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_zgeqrf_ooc ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  A,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file zgeqrf_ooc.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGEQRF_OOC computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
A = Q * R. This version does not require work space on the GPU
passed as input. GPU memory is allocated in the routine.
This is an out-of-core (ooc) version that is similar to magma_zgeqrf but
the difference is that this version can use a GPU even if the matrix
does not fit into the GPU memory at once.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) COMPLEX_16 array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= N*NB,
where NB can be obtained through magma_get_zgeqrf_nb(M).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a complex scalar, and v is a complex vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
#define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))
cuDoubleComplex *da, *dwork;
cuDoubleComplex c_one = MAGMA_Z_ONE;
int k, lddwork, ldda;
*info = 0;
int nb = magma_get_zgeqrf_nb(min(m, n));
int lwkopt = n * nb;
work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
long int lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,n) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Check how much memory do we have */
#if CUDA_VERSION > 3010
size_t totalMem;
#else
unsigned int totalMem;
#endif
CUdevice dev;
cuDeviceGet( &dev, 0);
cuDeviceTotalMem( &totalMem, dev );
totalMem /= sizeof(cuDoubleComplex);
magma_int_t IB, NB = (magma_int_t)(0.8*totalMem/m);
NB = (NB / nb) * nb;
if (NB >= n)
return magma_zgeqrf(m, n, a, lda, tau, work, lwork, info);
k = min(m,n);
if (k == 0) {
work[0] = c_one;
return *info;
}
lddwork = ((NB+31)/32)*32+nb;
ldda = ((m+31)/32)*32;
if (MAGMA_SUCCESS != magma_zmalloc( &da, (NB + nb)*ldda + nb*lddwork )) {
return *info;
}
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
// magmablasSetKernelStream(stream[1]);
cuDoubleComplex *ptr = da + ldda * NB;
dwork = da + ldda*(NB + nb);
/* start the main loop over the blocks that fit in the GPU memory */
for(int i=0; i<n; i+=NB)
{
IB = min(n-i, NB);
//printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB);
/* 1. Copy the next part of the matrix to the GPU */
a_ref(0,i), lda,
da_ref(0,0), ldda, stream[0] );
magma_queue_sync( stream[0] );
/* 2. Update it with the previous transformations */
for(int j=0; j<min(i,k); j+=nb)
{
magma_int_t ib = min(k-j, nb);
/* Get a panel in ptr. */
// 1. Form the triangular factor of the block reflector
// 2. Send it to the GPU.
// 3. Put 0s in the upper triangular part of V.
// 4. Send V to the GPU in ptr.
// 5. Update the matrix.
// 6. Restore the upper part of V.
int rows = m-j;
&rows, &ib, a_ref(j,j), &lda, tau+j, work, &ib);
work, ib,
dwork, lddwork, stream[1] );
zpanel_to_q(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib);
a_ref(j,j), lda,
ptr, rows, stream[1] );
magma_queue_sync( stream[1] );
rows, IB, ib,
ptr, rows, dwork, lddwork,
da_ref(j, 0), ldda, dwork+ib, lddwork);
zq_to_panel(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib);
}
/* 3. Do a QR on the current part */
if (i<k)
magma_zgeqrf2_gpu(m-i, IB, da_ref(i,0), ldda, tau+i, info);
/* 4. Copy the current part back to the CPU */
da_ref(0,0), ldda,
a_ref(0,i), lda, stream[0] );
}
magma_queue_sync( stream[0] );
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free( da );
return *info;
} /* magma_zgeqrf_ooc */

Here is the caller graph for this function:

magma_int_t magma_zgeqrs3_gpu ( magma_int_t  m,
magma_int_t  n,
magma_int_t  nrhs,
cuDoubleComplex *  dA,
magma_int_t  ldda,
cuDoubleComplex *  tau,
cuDoubleComplex *  dT,
cuDoubleComplex *  dB,
magma_int_t  lddb,
cuDoubleComplex *  hwork,
magma_int_t  lhwork,
magma_int_t info 
)

Definition at line 14 of file zgeqrs3_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves the least squares problem
min || A*X - C ||
using the QR factorization A = Q*R computed by ZGEQRF3_GPU.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of columns of the matrix C. NRHS >= 0.
A (input) COMPLEX_16 array on the GPU, dimension (LDDA,N)
The i-th column must contain the vector which defines the
elementary reflector H(i), for i = 1,2,...,n, as returned by
ZGEQRF3_GPU in the first n columns of its array argument A.
LDDA (input) INTEGER
The leading dimension of the array A, LDDA >= M.
TAU (input) COMPLEX_16 array, dimension (N)
TAU(i) must contain the scalar factor of the elementary
reflector H(i), as returned by MAGMA_ZGEQRF_GPU.
DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS)
On entry, the M-by-NRHS matrix C.
On exit, the N-by-NRHS solution matrix X.
DT (input) COMPLEX_16 array that is the output (the 6th argument)
of magma_zgeqrf_gpu of size
2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS).
The array starts with a block of size MIN(M,N)*NB that stores
the triangular T matrices used in the QR factorization,
followed by MIN(M,N)*NB block storing the diagonal block
matrices for the R matrix, followed by work space of size
((N+31)/32*32 )* MAX(NB, NRHS).
LDDB (input) INTEGER
The leading dimension of the array DB. LDDB >= M.
HWORK (workspace/output) COMPLEX_16 array, dimension (LWORK)
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK, LWORK >= max(1,NRHS).
For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where
NB is the blocksize given by magma_get_zgeqrf_nb( M ).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the HWORK array, returns
this value as the first entry of the WORK array.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
#define d_ref(a_1) (dT+(lddwork+(a_1))*nb)
cuDoubleComplex c_one = MAGMA_Z_ONE;
magma_int_t k, lddwork;
magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
long int lquery = (lwork == -1);
hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. );
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0 || m < n)
*info = -2;
else if (nrhs < 0)
*info = -3;
else if (ldda < max(1,m))
*info = -5;
else if (lddb < max(1,m))
*info = -8;
else if (lwork < lwkopt && ! lquery)
*info = -10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
hwork[0] = c_one;
return *info;
}
lddwork= k;
/* B := Q' * B */
m, nrhs, n,
a_ref(0,0), ldda, tau,
dB, lddb, hwork, lwork, dT, nb, info );
if ( *info != 0 ) {
return *info;
}
/* Solve R*X = B(1:n,:)
1. Move the block diagonal submatrices from d_ref to R
2. Solve
3. Restore the data format moving data from R back to d_ref
*/
magmablas_zswapdblk(k, nb, a_ref(0,0), ldda, 1, d_ref(0), nb, 0);
if ( nrhs == 1 ) {
n, a_ref(0,0), ldda, dB, 1);
} else {
n, nrhs, c_one, a_ref(0,0), ldda, dB, lddb);
}
magmablas_zswapdblk(k, nb, d_ref(0), nb, 0, a_ref(0,0), ldda, 1);
return *info;
}

Here is the caller graph for this function:

magma_int_t magma_zgeqrs_gpu ( magma_int_t  m,
magma_int_t  n,
magma_int_t  nrhs,
cuDoubleComplex *  dA,
magma_int_t  ldda,
cuDoubleComplex *  tau,
cuDoubleComplex *  dT,
cuDoubleComplex *  dB,
magma_int_t  lddb,
cuDoubleComplex *  hwork,
magma_int_t  lhwork,
magma_int_t info 
)

Definition at line 14 of file zgeqrs_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves the least squares problem
min || A*X - C ||
using the QR factorization A = Q*R computed by ZGEQRF_GPU.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of columns of the matrix C. NRHS >= 0.
A (input) COMPLEX_16 array on the GPU, dimension (LDDA,N)
The i-th column must contain the vector which defines the
elementary reflector H(i), for i = 1,2,...,n, as returned by
ZGEQRF_GPU in the first n columns of its array argument A.
LDDA (input) INTEGER
The leading dimension of the array A, LDDA >= M.
TAU (input) COMPLEX_16 array, dimension (N)
TAU(i) must contain the scalar factor of the elementary
reflector H(i), as returned by MAGMA_ZGEQRF_GPU.
DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS)
On entry, the M-by-NRHS matrix C.
On exit, the N-by-NRHS solution matrix X.
DT (input) COMPLEX_16 array that is the output (the 6th argument)
of magma_zgeqrf_gpu of size
2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS).
The array starts with a block of size MIN(M,N)*NB that stores
the triangular T matrices used in the QR factorization,
followed by MIN(M,N)*NB block storing the diagonal block
inverses for the R matrix, followed by work space of size
((N+31)/32*32 )* MAX(NB, NRHS).
LDDB (input) INTEGER
The leading dimension of the array DB. LDDB >= M.
HWORK (workspace/output) COMPLEX_16 array, dimension (LWORK)
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK, LWORK >= max(1,NRHS).
For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where
NB is the blocksize given by magma_get_zgeqrf_nb( M ).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the HWORK array, returns
this value as the first entry of the WORK array.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
#define d_ref(a_1) (dT+(lddwork+(a_1))*nb)
cuDoubleComplex c_zero = MAGMA_Z_ZERO;
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
cuDoubleComplex *dwork;
magma_int_t i, k, lddwork, rows, ib;
magma_int_t ione = 1;
magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
long int lquery = (lwork == -1);
hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. );
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0 || m < n)
*info = -2;
else if (nrhs < 0)
*info = -3;
else if (ldda < max(1,m))
*info = -5;
else if (lddb < max(1,m))
*info = -8;
else if (lwork < lwkopt && ! lquery)
*info = -10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
hwork[0] = c_one;
return *info;
}
/* B := Q' * B */
m, nrhs, n,
a_ref(0,0), ldda, tau,
dB, lddb, hwork, lwork, dT, nb, info );
if ( *info != 0 ) {
return *info;
}
/* Solve R*X = B(1:n,:) */
lddwork= k;
if (nb < k)
dwork = dT+2*lddwork*nb;
// To do: Why did we have this line originally; seems to be a bug (Stan)?
// dwork = dT;
i = (k-1)/nb * nb;
ib = n-i;
rows = m-i;
if ( nrhs == 1 ) {
&ib, hwork, &rows,
hwork+rows*ib, &ione);
} else {
&ib, &nrhs,
&c_one, hwork, &rows,
hwork+rows*ib, &rows);
}
// update the solution vector
magma_zsetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork );
// update c
if (nrhs == 1)
c_neg_one, a_ref(0, i), ldda,
dwork + i, 1,
c_one, dB, 1);
else
i, nrhs, ib,
c_neg_one, a_ref(0, i), ldda,
dwork + i, lddwork,
c_one, dB, lddb);
int start = i-nb;
if (nb < k) {
for (i = start; i >=0; i -= nb) {
ib = min(k-i, nb);
rows = m -i;
if (i + ib < n) {
if (nrhs == 1)
{
c_one, d_ref(i), ib,
dB+i, 1,
c_zero, dwork+i, 1);
c_neg_one, a_ref(0, i), ldda,
dwork + i, 1,
c_one, dB, 1);
}
else
{
ib, nrhs, ib,
c_one, d_ref(i), ib,
dB+i, lddb,
c_zero, dwork+i, lddwork);
i, nrhs, ib,
c_neg_one, a_ref(0, i), ldda,
dwork + i, lddwork,
c_one, dB, lddb);
}
}
}
}
magma_zcopymatrix( (n), nrhs,
dwork, lddwork,
dB, lddb );
return *info;
}

Here is the caller graph for this function:

magma_int_t magma_zgessm_gpu ( char  storev,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magma_int_t  ib,
magma_int_t ipiv,
cuDoubleComplex *  dL1,
magma_int_t  lddl1,
cuDoubleComplex *  dL,
magma_int_t  lddl,
cuDoubleComplex *  dA,
magma_int_t  ldda,
magma_int_t info 
)

Definition at line 21 of file zgessm_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGETRF computes an LU factorization of a general M-by-N matrix A
using partial pivoting with row interchanges.
The factorization has the form
A = P * L * U
where P is a permutation matrix, L is lower triangular with unit
diagonal elements (lower trapezoidal if m > n), and U is upper
triangular (upper trapezoidal if m < n).
This is the right-looking Level 3 BLAS version of the algorithm.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array on the GPU, dimension (LDA,N).
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
> 0: if INFO = i, U(i,i) is exactly zero. The factorization
has been completed, but the factor U is exactly
singular, and division by zero will occur if it is used
to solve a system of equations.
===================================================================== */
#define AT(i,j) (dAT + (i)*ldda + (j) )
#define L(i,j) (dL + (i) + (j)*lddl )
#define dL1(j) (dL1 + (j)*lddl1)
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
int i, s, sb;
cuDoubleComplex *dAT;
/* Check arguments */
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0)
*info = -2;
else if (ldda < max(1,m))
*info = -4;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0)
return *info;
if ( (storev == 'C') || (storev == 'c') ) {
magmablas_zgetmo_in( dA, dAT, ldda, m, n );
} else {
dAT = dA;
}
s = k / ib;
for(i = 0; i < k; i += ib) {
sb = min(ib, k-i);
magmablas_zlaswp( n, dAT, ldda, i+1, i+sb, ipiv, 1 );
#ifndef WITHOUTTRTRI
n, sb,
c_one, dL1(i), lddl1,
AT(i, 0), ldda);
#else
n, sb,
c_one, L( i, i), lddl,
AT(i, 0), ldda);
#endif
if ( (i+sb) < m) {
n, m-(i+sb), sb,
c_neg_one, AT(i, 0), ldda,
L( i+sb, i), lddl,
c_one, AT(i+sb, 0), ldda );
}
}
if ( (storev == 'C') || (storev == 'c') ) {
magmablas_zgetmo_in( dA, dAT, ldda, m, n );
}
return *info;
/* End of MAGMA_ZGETRF_GPU */
}

Here is the caller graph for this function:

magma_int_t magma_zgesv ( magma_int_t  n,
magma_int_t  nrhs,
cuDoubleComplex *  A,
magma_int_t  lda,
magma_int_t ipiv,
cuDoubleComplex *  B,
magma_int_t  ldb,
magma_int_t info 
)

Definition at line 14 of file zgesv.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves a system of linear equations
A * X = B
where A is a general N-by-N matrix and X and B are N-by-NRHS matrices.
The LU decomposition with partial pivoting and row interchanges is
used to factor A as
A = P * L * U,
where P is a permutation matrix, L is unit lower triangular, and U is
upper triangular. The factored form of A is then used to solve the
system of equations A * X = B.
Arguments
=========
N (input) INTEGER
The order of the matrix A. N >= 0.
NRHS (input) INTEGER
The number of right hand sides, i.e., the number of columns
of the matrix B. NRHS >= 0.
A (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N).
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
B (input/output) COMPLEX_16 array on the GPU, dimension (LDB,NRHS)
On entry, the right hand side matrix B.
On exit, the solution matrix X.
LDB (input) INTEGER
The leading dimension of the array B. LDB >= max(1,N).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
*info = 0;
if (n < 0) {
*info = -1;
} else if (nrhs < 0) {
*info = -2;
} else if (lda < max(1,n)) {
*info = -4;
} else if (ldb < max(1,n)) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (n == 0 || nrhs == 0) {
return *info;
}
magma_zgetrf( n, n, A, lda, ipiv, info );
if ( *info == 0 ) {
lapackf77_zgetrs( MagmaNoTransStr, &n, &nrhs, A, &lda, ipiv, B, &ldb, info );
}
return *info;
}

Here is the caller graph for this function:

magma_int_t magma_zgesv_gpu ( magma_int_t  n,
magma_int_t  nrhs,
cuDoubleComplex *  dA,
magma_int_t  ldda,
magma_int_t ipiv,
cuDoubleComplex *  dB,
magma_int_t  lddb,
magma_int_t info 
)

Definition at line 21 of file zgesv_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves a system of linear equations
A * X = B
where A is a general N-by-N matrix and X and B are N-by-NRHS matrices.
The LU decomposition with partial pivoting and row interchanges is
used to factor A as
A = P * L * U,
where P is a permutation matrix, L is unit lower triangular, and U is
upper triangular. The factored form of A is then used to solve the
system of equations A * X = B.
Arguments
=========
N (input) INTEGER
The order of the matrix A. N >= 0.
NRHS (input) INTEGER
The number of right hand sides, i.e., the number of columns
of the matrix B. NRHS >= 0.
A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
B (input/output) COMPLEX_16 array on the GPU, dimension (LDB,NRHS)
On entry, the right hand side matrix B.
On exit, the solution matrix X.
LDB (input) INTEGER
The leading dimension of the array B. LDB >= max(1,N).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
*info = 0;
if (n < 0) {
*info = -1;
} else if (nrhs < 0) {
*info = -2;
} else if (ldda < max(1,n)) {
*info = -4;
} else if (lddb < max(1,n)) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (n == 0 || nrhs == 0) {
return *info;
}
magma_zgetrf_gpu( n, n, dA, ldda, ipiv, info );
if ( *info == 0 ) {
magma_zgetrs_gpu( MagmaNoTrans, n, nrhs, dA, ldda, ipiv, dB, lddb, info );
}
return *info;
}

Here is the caller graph for this function:

magma_int_t magma_zgesvd ( char  jobu,
char  jobvt,
magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
double *  s,
cuDoubleComplex *  u,
magma_int_t  ldu,
cuDoubleComplex *  vt,
magma_int_t  ldvt,
cuDoubleComplex *  work,
magma_int_t  lwork,
double *  rwork,
magma_int_t info 
)

Definition at line 16 of file zgesvd.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGESVD computes the singular value decomposition (SVD) of a complex
M-by-N matrix A, optionally computing the left and/or right singular
vectors. The SVD is written
A = U * SIGMA * conjugate-transpose(V)
where SIGMA is an M-by-N matrix which is zero except for its
min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
V is an N-by-N unitary matrix. The diagonal elements of SIGMA
are the singular values of A; they are real and non-negative, and
are returned in descending order. The first min(m,n) columns of
U and V are the left and right singular vectors of A.
Note that the routine returns V**H, not V.
Arguments
=========
JOBU (input) CHARACTER*1
Specifies options for computing all or part of the matrix U:
= 'A': all M columns of U are returned in array U:
= 'S': the first min(m,n) columns of U (the left singular
vectors) are returned in the array U;
= 'O': the first min(m,n) columns of U (the left singular
vectors) are overwritten on the array A;
= 'N': no columns of U (no left singular vectors) are
computed.
JOBVT (input) CHARACTER*1
Specifies options for computing all or part of the matrix
V**H:
= 'A': all N rows of V**H are returned in the array VT;
= 'S': the first min(m,n) rows of V**H (the right singular
vectors) are returned in the array VT;
= 'O': the first min(m,n) rows of V**H (the right singular
vectors) are overwritten on the array A;
= 'N': no rows of V**H (no right singular vectors) are
computed.
JOBVT and JOBU cannot both be 'O'.
M (input) INTEGER
The number of rows of the input matrix A. M >= 0.
N (input) INTEGER
The number of columns of the input matrix A. N >= 0.
A (input/output) COMPLEX*16 array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit,
if JOBU = 'O', A is overwritten with the first min(m,n)
columns of U (the left singular vectors,
stored columnwise);
if JOBVT = 'O', A is overwritten with the first min(m,n)
rows of V**H (the right singular vectors,
stored rowwise);
if JOBU .ne. 'O' and JOBVT .ne. 'O', the contents of A
are destroyed.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
S (output) DOUBLE PRECISION array, dimension (min(M,N))
The singular values of A, sorted so that S(i) >= S(i+1).
U (output) COMPLEX*16 array, dimension (LDU,UCOL)
(LDU,M) if JOBU = 'A' or (LDU,min(M,N)) if JOBU = 'S'.
If JOBU = 'A', U contains the M-by-M unitary matrix U;
if JOBU = 'S', U contains the first min(m,n) columns of U
(the left singular vectors, stored columnwise);
if JOBU = 'N' or 'O', U is not referenced.
LDU (input) INTEGER
The leading dimension of the array U. LDU >= 1; if
JOBU = 'S' or 'A', LDU >= M.
VT (output) COMPLEX*16 array, dimension (LDVT,N)
If JOBVT = 'A', VT contains the N-by-N unitary matrix
V**H;
if JOBVT = 'S', VT contains the first min(m,n) rows of
V**H (the right singular vectors, stored rowwise);
if JOBVT = 'N' or 'O', VT is not referenced.
LDVT (input) INTEGER
The leading dimension of the array VT. LDVT >= 1; if
JOBVT = 'A', LDVT >= N; if JOBVT = 'S', LDVT >= min(M,N).
WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK.
LWORK >= (M+N)*nb+N.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
RWORK (workspace) DOUBLE PRECISION array, dimension (5*min(M,N))
On exit, if INFO > 0, RWORK(1:MIN(M,N)-1) contains the
unconverged superdiagonal elements of an upper bidiagonal
matrix B whose diagonal is in S (not necessarily sorted).
B satisfies A = U * B * VT, so it has the same singular
values as A, and singular vectors related by U and VT.
INFO (output) INTEGER
= 0: successful exit.
< 0: if INFO = -i, the i-th argument had an illegal value.
> 0: if ZBDSQR did not converge, INFO specifies how many
superdiagonals of an intermediate bidiagonal form B
did not converge to zero. See the description of RWORK
above for details.
===================================================================== */
char jobu_[2] = {jobu, 0};
char jobvt_[2] = {jobvt, 0};
magma_int_t *m = &m_;
magma_int_t *n = &n_;
magma_int_t *lda = &lda_;
magma_int_t *ldu = &ldu_;
magma_int_t *ldvt = &ldvt_;
magma_int_t *lwork = &lwork_;
static cuDoubleComplex c_b1 = MAGMA_Z_ZERO;
static cuDoubleComplex c_b2 = MAGMA_Z_ONE;
static magma_int_t c__0 = 0;
static magma_int_t c__1 = 1;
static magma_int_t c_n1 = -1;
magma_int_t a_dim1, a_offset, u_dim1, u_offset, vt_dim1, vt_offset,
i__2, i__3, i__4;
static magma_int_t i__, ie, ir, iu, blk, ncu;
static double dum[1], eps;
static magma_int_t nru;
static cuDoubleComplex cdum[1];
static magma_int_t iscl;
static double anrm;
static magma_int_t ierr, itau, ncvt, nrvt;
static magma_int_t chunk, minmn;
static magma_int_t wrkbl, itaup, itauq, mnthr, iwork;
magma_int_t wntua, wntva, wntun, wntuo, wntvn, wntvo, wntus, wntvs;
static double bignum;
static magma_int_t ldwrkr;
static magma_int_t ldwrku, maxwrk;
magma_int_t minwrk;
static double smlnum;
static magma_int_t irwork;
magma_int_t lquery, wntuas, wntvas;
*info = 0;
minmn = min(*m,*n);
wntua = lapackf77_lsame(jobu_, "A");
mnthr = (magma_int_t)( (double)(min( m_, n_ )) * 1.6 );
wntus = lapackf77_lsame(jobu_, "S");
wntuas = wntua || wntus;
wntuo = lapackf77_lsame(jobu_, "O");
wntun = lapackf77_lsame(jobu_, "N");
wntva = lapackf77_lsame(jobvt_, "A");
wntvs = lapackf77_lsame(jobvt_, "S");
wntvas = wntva || wntvs;
wntvo = lapackf77_lsame(jobvt_, "O");
wntvn = lapackf77_lsame(jobvt_, "N");
lquery = *lwork == -1;
if (! (wntua || wntus || wntuo || wntun)) {
*info = -1;
} else if (! (wntva || wntvs || wntvo || wntvn) || (wntvo && wntuo) ) {
*info = -2;
} else if (*m < 0) {
*info = -3;
} else if (*n < 0) {
*info = -4;
} else if (*lda < max(1,*m)) {
*info = -6;
} else if ((*ldu < 1) || (wntuas && (*ldu < *m)) ) {
*info = -9;
} else if ((*ldvt < 1) || (wntva && (*ldvt < *n)) || (wntvs && (*ldvt < minmn)) ) {
*info = -11;
}
/* Compute workspace */
lapackf77_zgesvd(jobu_, jobvt_, m, n, a, lda, s, u, ldu,
vt, ldvt, work, &c_n1, rwork, info );
maxwrk = (magma_int_t)MAGMA_Z_REAL(work[0]);
if (*info == 0) {
minwrk = ((*m)+(*n))*nb+(*n);
MAGMA_Z_SET2REAL(work[0], (double) minwrk);
if ( !lquery && (lwork_ < minwrk) ) {
*info = -13;
}
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (*m == 0 || *n == 0) {
return *info;
}
mnthr = (magma_int_t)( (double)(min( m_, n_ )) * 1.6 );
wrkbl = maxwrk; /* Not optimal */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--s;
u_dim1 = *ldu;
u_offset = 1 + u_dim1;
u -= u_offset;
vt_dim1 = *ldvt;
vt_offset = 1 + vt_dim1;
vt -= vt_offset;
--work;
--rwork;
/* Get machine constants */
eps = lapackf77_dlamch("P");
smlnum = magma_dsqrt(lapackf77_dlamch("S")) / eps;
bignum = 1. / smlnum;
/* Scale A if max element outside range [SMLNUM,BIGNUM] */
anrm = lapackf77_zlange("M", m, n, &a[a_offset], lda, dum);
iscl = 0;
if (anrm > 0. && anrm < smlnum) {
iscl = 1;
lapackf77_zlascl("G", &c__0, &c__0, &anrm, &smlnum, m, n,
&a[a_offset], lda, &ierr);
} else if (anrm > bignum) {
iscl = 1;
lapackf77_zlascl("G", &c__0, &c__0, &anrm, &bignum, m, n,
&a[a_offset], lda, &ierr);
}
if (*m >= *n) {
/* A has at least as many rows as columns. If A has sufficiently
more rows than columns, first reduce using the QR
decomposition (if sufficient workspace available) */
if (*m >= mnthr) {
if (wntun) {
/* Path 1 (M much larger than N, JOBU_='N')
No left singular vectors to be computed */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: need 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
/* Zero out below R */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &a[a_dim1 + 2], lda);
ie = 1;
itauq = 1;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in A
(CWorkspace: need 3*N, prefer 2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &a[a_offset], lda, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], &i__2, &ierr);
*/
magma_zgebrd(*n, *n, &a[a_offset], *lda, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], i__2, &ierr);
ncvt = 0;
if (wntvo || wntvas) {
/* If right singular vectors desired, generate P'.
(CWorkspace: need 3*N-1, prefer 2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &a[a_offset], lda, &work[itaup], &
work[iwork], &i__2, &ierr);
ncvt = *n;
}
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing right
singular vectors of A in A if desired
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &ncvt, &c__0, &c__0, &s[1], &rwork[ie],
&a[a_offset], lda, cdum, &c__1, cdum, &c__1, &rwork[irwork], info);
/* If right singular vectors desired in VT, copy them there */
if (wntvas) {
lapackf77_zlacpy("F", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
}
} else if (wntuo && wntvn) {
/* Path 2 (M much larger than N, JOBU_='O', JOBVT_='N')
N left singular vectors to be overwritten on A and
no right singular vectors to be computed */
if (*lwork >= *n * *n + *n * 3) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
/* Computing MAX */
i__2 = wrkbl, i__3 = *lda * *n;
if (*lwork >= max(i__2,i__3) + *lda * *n) {
/* WORK(IU) is LDA by N, WORK(IR) is LDA by N */
ldwrku = *lda;
ldwrkr = *lda;
} else /* if(complicated condition) */ {
/* Computing MAX */
i__2 = wrkbl, i__3 = *lda * *n;
if (*lwork >= max(i__2,i__3) + *n * *n) {
/* WORK(IU) is LDA by N, WORK(IR) is N by N */
ldwrku = *lda;
ldwrkr = *n;
} else {
/* WORK(IU) is LDWRKU by N, WORK(IR) is N by N */
ldwrku = (*lwork - *n * *n) / *n;
ldwrkr = *n;
}
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
/* Copy R to WORK(IR) and zero out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &work[ir + 1], &ldwrkr);
/* Generate Q in A
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IR)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &work[ir], &ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &i__2, &
ierr);
*/
magma_zgebrd(*n, *n, &work[ir], ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], i__2, &
ierr);
/* Generate left vectors bidiagonalizing R
(CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
(RWorkspace: need 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", n, n, n, &work[ir], &ldwrkr, &work[itauq], &
work[iwork], &i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of R in WORK(IR)
(CWorkspace: need N*N)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &c__0, n, &c__0, &s[1], &rwork[ie], cdum,
&c__1, &work[ir], &ldwrkr, cdum, &c__1,
&rwork[irwork], info);
iu = itauq;
/* Multiply Q in A by left singular vectors of R in
WORK(IR), storing result in WORK(IU) and copying to A
(CWorkspace: need N*N+N, prefer N*N+M*N)
(RWorkspace: 0) */
i__2 = *m;
i__3 = ldwrku;
for(i__ = 1; i__3 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__3)
{
/* Computing MIN */
i__4 = *m - i__ + 1;
chunk = min(i__4,ldwrku);
blasf77_zgemm("N", "N", &chunk, n, n, &c_b2, &a[i__ + a_dim1]
, lda, &work[ir], &ldwrkr, &c_b1, &work[iu], &
ldwrku);
lapackf77_zlacpy("F", &chunk, n, &work[iu], &ldwrku,
&a[i__ + a_dim1], lda);
}
} else {
/* Insufficient workspace for a fast algorithm */
ie = 1;
itauq = 1;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize A
(CWorkspace: need 2*N+M, prefer 2*N+(M+N)*NB)
(RWorkspace: N) */
i__3 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, n, &a[a_offset], lda, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], &i__3, &ierr);
*/
magma_zgebrd(*m, *n, &a[a_offset], *lda, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], i__3, &ierr);
/* Generate left vectors bidiagonalizing A
(CWorkspace: need 3*N, prefer 2*N+N*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, n, n, &a[a_offset], lda, &work[itauq], &
work[iwork], &i__3, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in A
(CWorkspace: need 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &c__0, m, &c__0, &s[1], &rwork[ie], cdum,
&c__1, &a[a_offset], lda, cdum, &c__1,
&rwork[irwork], info);
}
} else if (wntuo && wntvas) {
/* Path 3 (M much larger than N, JOBU_='O', JOBVT_='S' or 'A')
N left singular vectors to be overwritten on A and
N right singular vectors to be computed in VT */
if (*lwork >= *n * *n + *n * 3) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
/* Computing MAX */
i__3 = wrkbl, i__2 = *lda * *n;
if (*lwork >= max(i__3,i__2) + *lda * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by N */
ldwrku = *lda;
ldwrkr = *lda;
} else /* if(complicated condition) */ {
/* Computing MAX */
i__3 = wrkbl, i__2 = *lda * *n;
if (*lwork >= max(i__3,i__2) + *n * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is N by N */
ldwrku = *lda;
ldwrkr = *n;
} else {
/* WORK(IU) is LDWRKU by N and WORK(IR) is N by N */
ldwrku = (*lwork - *n * *n) / *n;
ldwrkr = *n;
}
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__3, &ierr);
/* Copy R to VT, zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
if (*n > 1) {
i__3 = *n - 1;
i__2 = *n - 1;
lapackf77_zlaset("L", &i__3, &i__2, &c_b1, &c_b1, &vt[vt_dim1
+ 2], ldvt);
}
/* Generate Q in A
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__3, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in VT, copying result to WORK(IR)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+2*N*NB)
(RWorkspace: need N) */
i__3 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &vt[vt_offset], ldvt, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &i__3, &
ierr);
*/
magma_zgebrd(*n, *n, &vt[vt_offset], *ldvt, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], i__3, &ierr);
lapackf77_zlacpy("L", n, n, &vt[vt_offset], ldvt, &work[ir], &
ldwrkr);
/* Generate left vectors bidiagonalizing R in WORK(IR)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zungbr("Q", n, n, n, &work[ir], &ldwrkr, &work[itauq], &
work[iwork], &i__3, &ierr);
/* Generate right vectors bidiagonalizing R in VT
(CWorkspace: need N*N+3*N-1, prefer N*N+2*N+(N-1)*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup],
&work[iwork], &i__3, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of R in WORK(IR) and computing right
singular vectors of R in VT
(CWorkspace: need N*N)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, n, &c__0, &s[1], &rwork[ie],
&vt[vt_offset], ldvt, &work[ir], &ldwrkr, cdum, &c__1,
&rwork[irwork], info);
iu = itauq;
/* Multiply Q in A by left singular vectors of R in
WORK(IR), storing result in WORK(IU) and copying to A
(CWorkspace: need N*N+N, prefer N*N+M*N)
(RWorkspace: 0) */
i__3 = *m;
i__2 = ldwrku;
for(i__ = 1; i__2 < 0 ? i__ >= i__3 : i__ <= i__3; i__ += i__2)
{
/* Computing MIN */
i__4 = *m - i__ + 1;
chunk = min(i__4,ldwrku);
blasf77_zgemm("N", "N", &chunk, n, n, &c_b2, &a[i__ + a_dim1]
, lda, &work[ir], &ldwrkr, &c_b1, &work[iu], &
ldwrku);
lapackf77_zlacpy("F", &chunk, n, &work[iu], &ldwrku,
&a[i__ + a_dim1], lda);
}
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__2, &ierr);
/* Copy R to VT, zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
if (*n > 1) {
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &vt[vt_dim1+2], ldvt);
}
/* Generate Q in A
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in VT
(CWorkspace: need 3*N, prefer 2*N+2*N*NB)
(RWorkspace: N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &vt[vt_offset], ldvt, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &i__2, &
ierr);
*/
magma_zgebrd(*n, *n, &vt[vt_offset], *ldvt, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], i__2, &ierr);
/* Multiply Q in A by left vectors bidiagonalizing R
(CWorkspace: need 2*N+M, prefer 2*N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("Q", "R", "N", m, n, n, &vt[vt_offset], ldvt, &
work[itauq], &a[a_offset], lda, &work[iwork], &
i__2, &ierr);
/* Generate right vectors bidiagonalizing R in VT
(CWorkspace: need 3*N-1, prefer 2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup],
&work[iwork], &i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in A and computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, m, &c__0, &s[1], &rwork[ie],
&vt[vt_offset], ldvt, &a[a_offset], lda, cdum, &c__1,
&rwork[irwork], info);
}
} else if (wntus) {
if (wntvn) {
/* Path 4 (M much larger than N, JOBU_='S', JOBVT_='N')
N left singular vectors to be computed in U and
no right singular vectors to be computed */
if (*lwork >= *n * *n + *n * 3) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
if (*lwork >= wrkbl + *lda * *n) {
/* WORK(IR) is LDA by N */
ldwrkr = *lda;
} else {
/* WORK(IR) is N by N */
ldwrkr = *n;
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
/* Copy R to WORK(IR), zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &work[ir], &
ldwrkr);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &work[ir + 1]
, &ldwrkr);
/* Generate Q in A
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IR)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &work[ir], &ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &work[ir], ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Generate left vectors bidiagonalizing R in WORK(IR)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", n, n, n, &work[ir], &ldwrkr, &work[itauq]
, &work[iwork], &i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of R in WORK(IR)
(CWorkspace: need N*N)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &c__0, n, &c__0, &s[1], &rwork[ie],
cdum, &c__1, &work[ir], &ldwrkr, cdum, &c__1,
&rwork[irwork], info);
/* Multiply Q in A by left singular vectors of R in
WORK(IR), storing result in U
(CWorkspace: need N*N)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, n, &c_b2, &a[a_offset], lda, &
work[ir], &ldwrkr, &c_b1, &u[u_offset], ldu);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Zero out below R in A */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &a[a_dim1 +
2], lda);
/* Bidiagonalize R in A
(CWorkspace: need 3*N, prefer 2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &a[a_offset], lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &a[a_offset], *lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply Q in U by left vectors bidiagonalizing R
(CWorkspace: need 2*N+M, prefer 2*N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("Q", "R", "N", m, n, n, &a[a_offset], lda, &
work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &c__0, m, &c__0, &s[1], &rwork[ie],
cdum, &c__1, &u[u_offset], ldu, cdum, &c__1, &
rwork[irwork], info);
}
} else if (wntvo) {
/* Path 5 (M much larger than N, JOBU_='S', JOBVT_='O')
N left singular vectors to be computed in U and
N right singular vectors to be overwritten on A */
if (*lwork >= (*n << 1) * *n + *n * 3) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + (*lda << 1) * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by N */
ldwrku = *lda;
ir = iu + ldwrku * *n;
ldwrkr = *lda;
} else if (*lwork >= wrkbl + (*lda + *n) * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is N by N */
ldwrku = *lda;
ir = iu + ldwrku * *n;
ldwrkr = *n;
} else {
/* WORK(IU) is N by N and WORK(IR) is N by N */
ldwrku = *n;
ir = iu + ldwrku * *n;
ldwrkr = *n;
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R
(CWorkspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
/* Copy R to WORK(IU), zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &work[iu + 1]
, &ldwrku);
/* Generate Q in A
(CWorkspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IU), copying result to
WORK(IR)
(CWorkspace: need 2*N*N+3*N,
prefer 2*N*N+2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &work[iu], &ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &work[iu], ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_zlacpy("U", n, n, &work[iu], &ldwrku, &work[ir], &
ldwrkr);
/* Generate left bidiagonalizing vectors in WORK(IU)
(CWorkspace: need 2*N*N+3*N, prefer 2*N*N+2*N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", n, n, n, &work[iu], &ldwrku, &work[itauq]
, &work[iwork], &i__2, &ierr);
/* Generate right bidiagonalizing vectors in WORK(IR)
(CWorkspace: need 2*N*N+3*N-1,
prefer 2*N*N+2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &work[ir], &ldwrkr, &work[itaup]
, &work[iwork], &i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of R in WORK(IU) and computing
right singular vectors of R in WORK(IR)
(CWorkspace: need 2*N*N)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, n, &c__0, &s[1], &rwork[ie],
&work[ir], &ldwrkr, &work[iu], &ldwrku, cdum, &c__1,
&rwork[irwork], info);
/* Multiply Q in A by left singular vectors of R in
WORK(IU), storing result in U
(CWorkspace: need N*N)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, n, &c_b2, &a[a_offset], lda, &
work[iu], &ldwrku, &c_b1, &u[u_offset], ldu);
/* Copy right singular vectors of R to A
(CWorkspace: need N*N)
(RWorkspace: 0) */
lapackf77_zlacpy("F", n, n, &work[ir], &ldwrkr, &a[a_offset],
lda);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Zero out below R in A */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &a[a_dim1 +
2], lda);
/* Bidiagonalize R in A
(CWorkspace: need 3*N, prefer 2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &a[a_offset], lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &a[a_offset], *lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply Q in U by left vectors bidiagonalizing R
(CWorkspace: need 2*N+M, prefer 2*N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("Q", "R", "N", m, n, n, &a[a_offset], lda, &
work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr);
/* Generate right vectors bidiagonalizing R in A
(CWorkspace: need 3*N-1, prefer 2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &a[a_offset], lda, &work[itaup],
&work[iwork], &i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U and computing right
singular vectors of A in A
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, m, &c__0, &s[1], &rwork[ie],
&a[a_offset], lda, &u[u_offset], ldu, cdum, &
c__1, &rwork[irwork], info);
}
} else if (wntvas) {
/* Path 6 (M much larger than N, JOBU_='S', JOBVT_='S' or 'A')
N left singular vectors to be computed in U and
N right singular vectors to be computed in VT */
if (*lwork >= *n * *n + *n * 3) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + *lda * *n) {
/* WORK(IU) is LDA by N */
ldwrku = *lda;
} else {
/* WORK(IU) is N by N */
ldwrku = *n;
}
itau = iu + ldwrku * *n;
iwork = itau + *n;
/* Compute A=Q*R
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy R to WORK(IU), zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &work[iu + 1]
, &ldwrku);
/* Generate Q in A
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IU), copying result to VT
(CWorkspace: need N*N+3*N, prefer N*N+2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &work[iu], &ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &work[iu], ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_zlacpy("U", n, n, &work[iu], &ldwrku, &vt[vt_offset],
ldvt);
/* Generate left bidiagonalizing vectors in WORK(IU)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", n, n, n, &work[iu], &ldwrku, &work[itauq]
, &work[iwork], &i__2, &ierr);
/* Generate right bidiagonalizing vectors in VT
(CWorkspace: need N*N+3*N-1,
prefer N*N+2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &vt[vt_offset], ldvt, &work[
itaup], &work[iwork], &i__2, &ierr)
;
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of R in WORK(IU) and computing
right singular vectors of R in VT
(CWorkspace: need N*N)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, n, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &work[iu], &ldwrku, cdum, &
c__1, &rwork[irwork], info);
/* Multiply Q in A by left singular vectors of R in
WORK(IU), storing result in U
(CWorkspace: need N*N)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, n, &c_b2, &a[a_offset], lda, &
work[iu], &ldwrku, &c_b1, &u[u_offset], ldu);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, n, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy R to VT, zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
if (*n > 1) {
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &vt[
vt_dim1 + 2], ldvt);
}
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in VT
(CWorkspace: need 3*N, prefer 2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &vt[vt_offset], ldvt, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &vt[vt_offset], *ldvt, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply Q in U by left bidiagonalizing vectors
in VT
(CWorkspace: need 2*N+M, prefer 2*N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("Q", "R", "N", m, n, n, &vt[vt_offset], ldvt,
&work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr);
/* Generate right bidiagonalizing vectors in VT
(CWorkspace: need 3*N-1, prefer 2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &vt[vt_offset], ldvt, &work[
itaup], &work[iwork], &i__2, &ierr)
;
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U and computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, m, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, cdum, &
c__1, &rwork[irwork], info);
}
}
} else if (wntua) {
if (wntvn) {
/* Path 7 (M much larger than N, JOBU_='A', JOBVT_='N')
M left singular vectors to be computed in U and
no right singular vectors to be computed
Computing MAX */
i__2 = *n + *m, i__3 = *n * 3;
if (*lwork >= *n * *n + max(i__2,i__3)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
if (*lwork >= wrkbl + *lda * *n) {
/* WORK(IR) is LDA by N */
ldwrkr = *lda;
} else {
/* WORK(IR) is N by N */
ldwrkr = *n;
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Copy R to WORK(IR), zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &work[ir], &
ldwrkr);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &work[ir + 1]
, &ldwrkr);
/* Generate Q in U
(CWorkspace: need N*N+N+M, prefer N*N+N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IR)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &work[ir], &ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &work[ir], ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Generate left bidiagonalizing vectors in WORK(IR)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", n, n, n, &work[ir], &ldwrkr, &work[itauq]
, &work[iwork], &i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of R in WORK(IR)
(CWorkspace: need N*N)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &c__0, n, &c__0, &s[1], &rwork[ie],
cdum, &c__1, &work[ir], &ldwrkr, cdum, &c__1,
&rwork[irwork], info);
/* Multiply Q in U by left singular vectors of R in
WORK(IR), storing result in A
(CWorkspace: need N*N)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, n, &c_b2, &u[u_offset], ldu, &
work[ir], &ldwrkr, &c_b1, &a[a_offset], lda);
/* Copy left singular vectors of A from A to U */
lapackf77_zlacpy("F", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U
(CWorkspace: need N+M, prefer N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Zero out below R in A */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &a[a_dim1 +
2], lda);
/* Bidiagonalize R in A
(CWorkspace: need 3*N, prefer 2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &a[a_offset], lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &a[a_offset], *lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply Q in U by left bidiagonalizing vectors
in A
(CWorkspace: need 2*N+M, prefer 2*N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("Q", "R", "N", m, n, n, &a[a_offset], lda, &
work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr)
;
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &c__0, m, &c__0, &s[1], &rwork[ie],
cdum, &c__1, &u[u_offset], ldu, cdum, &c__1, &
rwork[irwork], info);
}
} else if (wntvo) {
/* Path 8 (M much larger than N, JOBU_='A', JOBVT_='O')
M left singular vectors to be computed in U and
N right singular vectors to be overwritten on A
Computing MAX */
i__2 = *n + *m, i__3 = *n * 3;
if (*lwork >= (*n << 1) * *n + max(i__2,i__3)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + (*lda << 1) * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by N */
ldwrku = *lda;
ir = iu + ldwrku * *n;
ldwrkr = *lda;
} else if (*lwork >= wrkbl + (*lda + *n) * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is N by N */
ldwrku = *lda;
ir = iu + ldwrku * *n;
ldwrkr = *n;
} else {
/* WORK(IU) is N by N and WORK(IR) is N by N */
ldwrku = *n;
ir = iu + ldwrku * *n;
ldwrkr = *n;
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U
(CWorkspace: need 2*N*N+N+M, prefer 2*N*N+N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy R to WORK(IU), zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &work[iu + 1]
, &ldwrku);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IU), copying result to
WORK(IR)
(CWorkspace: need 2*N*N+3*N,
prefer 2*N*N+2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &work[iu], &ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &work[iu], ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_zlacpy("U", n, n, &work[iu], &ldwrku, &work[ir], &
ldwrkr);
/* Generate left bidiagonalizing vectors in WORK(IU)
(CWorkspace: need 2*N*N+3*N, prefer 2*N*N+2*N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", n, n, n, &work[iu], &ldwrku, &work[itauq]
, &work[iwork], &i__2, &ierr);
/* Generate right bidiagonalizing vectors in WORK(IR)
(CWorkspace: need 2*N*N+3*N-1,
prefer 2*N*N+2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &work[ir], &ldwrkr, &work[itaup]
, &work[iwork], &i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of R in WORK(IU) and computing
right singular vectors of R in WORK(IR)
(CWorkspace: need 2*N*N)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, n, &c__0, &s[1], &rwork[ie], &work[
ir], &ldwrkr, &work[iu], &ldwrku, cdum, &c__1,
&rwork[irwork], info);
/* Multiply Q in U by left singular vectors of R in
WORK(IU), storing result in A
(CWorkspace: need N*N)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, n, &c_b2, &u[u_offset], ldu, &
work[iu], &ldwrku, &c_b1, &a[a_offset], lda);
/* Copy left singular vectors of A from A to U */
lapackf77_zlacpy("F", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Copy right singular vectors of R from WORK(IR) to A */
lapackf77_zlacpy("F", n, n, &work[ir], &ldwrkr, &a[a_offset],
lda);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U
(CWorkspace: need N+M, prefer N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Zero out below R in A */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &a[a_dim1 +
2], lda);
/* Bidiagonalize R in A
(CWorkspace: need 3*N, prefer 2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &a[a_offset], lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &a[a_offset], *lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply Q in U by left bidiagonalizing vectors
in A
(CWorkspace: need 2*N+M, prefer 2*N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("Q", "R", "N", m, n, n, &a[a_offset], lda, &
work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr)
;
/* Generate right bidiagonalizing vectors in A
(CWorkspace: need 3*N-1, prefer 2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &a[a_offset], lda, &work[itaup],
&work[iwork], &i__2, &ierr);
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U and computing right
singular vectors of A in A
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, m, &c__0, &s[1], &rwork[ie], &a[
a_offset], lda, &u[u_offset], ldu, cdum, &
c__1, &rwork[irwork], info);
}
} else if (wntvas) {
/* Path 9 (M much larger than N, JOBU_='A', JOBVT_='S'
or 'A')
M left singular vectors to be computed in U and
N right singular vectors to be computed in VT
Computing MAX */
i__2 = *n + *m, i__3 = *n * 3;
if (*lwork >= *n * *n + max(i__2,i__3)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + *lda * *n) {
/* WORK(IU) is LDA by N */
ldwrku = *lda;
} else {
/* WORK(IU) is N by N */
ldwrku = *n;
}
itau = iu + ldwrku * *n;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need N*N+2*N, prefer N*N+N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U
(CWorkspace: need N*N+N+M, prefer N*N+N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy R to WORK(IU), zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &work[iu + 1]
, &ldwrku);
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IU), copying result to VT
(CWorkspace: need N*N+3*N, prefer N*N+2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &work[iu], &ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &work[iu], ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_zlacpy("U", n, n, &work[iu], &ldwrku, &vt[vt_offset],
ldvt);
/* Generate left bidiagonalizing vectors in WORK(IU)
(CWorkspace: need N*N+3*N, prefer N*N+2*N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", n, n, n, &work[iu], &ldwrku, &work[itauq]
, &work[iwork], &i__2, &ierr);
/* Generate right bidiagonalizing vectors in VT
(CWorkspace: need N*N+3*N-1,
prefer N*N+2*N+(N-1)*NB)
(RWorkspace: need 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &vt[vt_offset], ldvt, &work[
itaup], &work[iwork], &i__2, &ierr)
;
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of R in WORK(IU) and computing
right singular vectors of R in VT
(CWorkspace: need N*N)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, n, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &work[iu], &ldwrku, cdum, &
c__1, &rwork[irwork], info);
/* Multiply Q in U by left singular vectors of R in
WORK(IU), storing result in A
(CWorkspace: need N*N)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, n, &c_b2, &u[u_offset], ldu, &
work[iu], &ldwrku, &c_b1, &a[a_offset], lda);
/* Copy left singular vectors of A from A to U */
lapackf77_zlacpy("F", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U
(CWorkspace: need 2*N, prefer N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U
(CWorkspace: need N+M, prefer N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy R from A to VT, zeroing out below it */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
if (*n > 1) {
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_zlaset("L", &i__2, &i__3, &c_b1, &c_b1, &vt[
vt_dim1 + 2], ldvt);
}
ie = 1;
itauq = itau;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in VT
(CWorkspace: need 3*N, prefer 2*N+2*N*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(n, n, &vt[vt_offset], ldvt, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*n, *n, &vt[vt_offset], *ldvt, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply Q in U by left bidiagonalizing vectors
in VT
(CWorkspace: need 2*N+M, prefer 2*N+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("Q", "R", "N", m, n, n, &vt[vt_offset], ldvt,
&work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr);
/* Generate right bidiagonalizing vectors in VT
(CWorkspace: need 3*N-1, prefer 2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &vt[vt_offset], ldvt, &work[
itaup], &work[iwork], &i__2, &ierr)
;
irwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U and computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, n, m, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, cdum, &
c__1, &rwork[irwork], info);
}
}
}
} else {
/* M .LT. MNTHR
Path 10 (M at least N, but not much larger)
Reduce to bidiagonal form without QR decomposition */
ie = 1;
itauq = 1;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize A
(CWorkspace: need 2*N+M, prefer 2*N+(M+N)*NB)
(RWorkspace: need N) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
&work[itaup], &work[iwork], &i__2, &ierr);
*/
magma_zgebrd(*m, *n, &a[a_offset], *lda, &s[1], &rwork[ie], &work[itauq],
&work[itaup], &work[iwork], i__2, &ierr);
if (wntuas) {
/* If left singular vectors desired in U, copy result to U
and generate left bidiagonalizing vectors in U
(CWorkspace: need 2*N+NCU, prefer 2*N+NCU*NB)
(RWorkspace: 0) */
lapackf77_zlacpy("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
if (wntus) {
ncu = *n;
}
if (wntua) {
ncu = *m;
}
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, &ncu, n, &u[u_offset], ldu, &work[itauq], &
work[iwork], &i__2, &ierr);
}
if (wntvas) {
/* If right singular vectors desired in VT, copy result to
VT and generate right bidiagonalizing vectors in VT
(CWorkspace: need 3*N-1, prefer 2*N+(N-1)*NB)
(RWorkspace: 0) */
lapackf77_zlacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup], &
work[iwork], &i__2, &ierr);
}
if (wntuo) {
/* If left singular vectors desired in A, generate left
bidiagonalizing vectors in A
(CWorkspace: need 3*N, prefer 2*N+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, n, n, &a[a_offset], lda, &work[itauq], &work[
iwork], &i__2, &ierr);
}
if (wntvo) {
/* If right singular vectors desired in A, generate right
bidiagonalizing vectors in A
(CWorkspace: need 3*N-1, prefer 2*N+(N-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", n, n, n, &a[a_offset], lda, &work[itaup], &work[
iwork], &i__2, &ierr);
}
irwork = ie + *n;
if (wntuas || wntuo) {
nru = *m;
}
if (wntun) {
nru = 0;
}
if (wntvas || wntvo) {
ncvt = *n;
}
if (wntvn) {
ncvt = 0;
}
if (! wntuo && ! wntvo) {
/* Perform bidiagonal QR iteration, if desired, computing
left singular vectors in U and computing right singular
vectors in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &ncvt, &nru, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, cdum, &c__1, &
rwork[irwork], info);
} else if (! wntuo && wntvo) {
/* Perform bidiagonal QR iteration, if desired, computing
left singular vectors in U and computing right singular
vectors in A
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &ncvt, &nru, &c__0, &s[1], &rwork[ie], &a[
a_offset], lda, &u[u_offset], ldu, cdum, &c__1, &
rwork[irwork], info);
} else {
/* Perform bidiagonal QR iteration, if desired, computing
left singular vectors in A and computing right singular
vectors in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", n, &ncvt, &nru, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, cdum, &c__1, &
rwork[irwork], info);
}
}
} else {
/* A has more columns than rows. If A has sufficiently more
columns than rows, first reduce using the LQ decomposition (if
sufficient workspace available) */
if (*n >= mnthr) {
if (wntvn) {
/* Path 1t(N much larger than M, JOBVT_='N')
No right singular vectors to be computed */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[iwork], &
i__2, &ierr);
/* Zero out above L */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &a[(a_dim1 << 1) + 1]
, lda);
ie = 1;
itauq = 1;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in A
(CWorkspace: need 3*M, prefer 2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &work[
itauq], &work[itaup], &work[iwork], &i__2, &ierr);
*/
magma_zgebrd(*m, *m, &a[a_offset], *lda, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], i__2, &ierr);
if (wntuo || wntuas) {
/* If left singular vectors desired, generate Q
(CWorkspace: need 3*M, prefer 2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &a[a_offset], lda, &work[itauq], &
work[iwork], &i__2, &ierr);
}
irwork = ie + *m;
nru = 0;
if (wntuo || wntuas) {
nru = *m;
}
/* Perform bidiagonal QR iteration, computing left singular
vectors of A in A if desired
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, &c__0, &nru, &c__0, &s[1], &rwork[ie], cdum, &
c__1, &a[a_offset], lda, cdum, &c__1, &rwork[irwork],
info);
/* If left singular vectors desired in U, copy them there */
if (wntuas) {
lapackf77_zlacpy("F", m, m, &a[a_offset], lda, &u[u_offset], ldu);
}
} else if (wntvo && wntun) {
/* Path 2t(N much larger than M, JOBU_='N', JOBVT_='O')
M right singular vectors to be overwritten on A and
no left singular vectors to be computed */
if (*lwork >= *m * *m + *m * 3) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
/* Computing MAX */
i__2 = wrkbl, i__3 = *lda * *n;
if (*lwork >= max(i__2,i__3) + *lda * *m) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by M */
ldwrku = *lda;
chunk = *n;
ldwrkr = *lda;
} else /* if(complicated condition) */ {
/* Computing MAX */
i__2 = wrkbl, i__3 = *lda * *n;
if (*lwork >= max(i__2,i__3) + *m * *m) {
/* WORK(IU) is LDA by N and WORK(IR) is M by M */
ldwrku = *lda;
chunk = *n;
ldwrkr = *m;
} else {
/* WORK(IU) is M by CHUNK and WORK(IR) is M by M */
ldwrku = *m;
chunk = (*lwork - *m * *m) / *m;
ldwrkr = *m;
}
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__2, &ierr);
/* Copy L to WORK(IR) and zero out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &work[ir], &ldwrkr);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &work[ir +
ldwrkr], &ldwrkr);
/* Generate Q in A
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IR)
(CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &work[ir], &ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &i__2, &
ierr);
*/
magma_zgebrd(*m, *m, &work[ir], ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], i__2, &
ierr);
/* Generate right vectors bidiagonalizing L
(CWorkspace: need M*M+3*M-1, prefer M*M+2*M+(M-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, m, m, &work[ir], &ldwrkr, &work[itaup], &
work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right
singular vectors of L in WORK(IR)
(CWorkspace: need M*M)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, m, &c__0, &c__0, &s[1], &rwork[ie], &work[
ir], &ldwrkr, cdum, &c__1, cdum, &c__1, &rwork[
irwork], info);
iu = itauq;
/* Multiply right singular vectors of L in WORK(IR) by Q
in A, storing result in WORK(IU) and copying to A
(CWorkspace: need M*M+M, prefer M*M+M*N)
(RWorkspace: 0) */
i__2 = *n;
i__3 = chunk;
for (i__ = 1; i__3 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
i__3) {
/* Computing MIN */
i__4 = *n - i__ + 1;
blk = min(i__4,chunk);
blasf77_zgemm("N", "N", m, &blk, m, &c_b2, &work[ir], &
ldwrkr, &a[i__ * a_dim1 + 1], lda, &c_b1, &
work[iu], &ldwrku);
lapackf77_zlacpy("F", m, &blk, &work[iu], &ldwrku, &a[i__ *
a_dim1 + 1], lda);
/* L30: */
}
} else {
/* Insufficient workspace for a fast algorithm */
ie = 1;
itauq = 1;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize A
(CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
(RWorkspace: need M) */
i__3 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[
itauq], &work[itaup], &work[iwork], &i__3, &ierr);
*/
magma_zgebrd(*m, *n, &a[a_offset], *lda, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], i__3, &ierr);
/* Generate right vectors bidiagonalizing A
(CWorkspace: need 3*M, prefer 2*M+M*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, n, m, &a[a_offset], lda, &work[itaup], &
work[iwork], &i__3, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right
singular vectors of A in A
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("L", m, n, &c__0, &c__0, &s[1], &rwork[ie], &a[
a_offset], lda, cdum, &c__1, cdum, &c__1, &rwork[
irwork], info);
}
} else if (wntvo && wntuas) {
/* Path 3t(N much larger than M, JOBU_='S' or 'A', jobvt_='O')
M right singular vectors to be overwritten on A and
M left singular vectors to be computed in U */
if (*lwork >= *m * *m + *m * 3) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
/* Computing MAX */
i__3 = wrkbl, i__2 = *lda * *n;
if (*lwork >= max(i__3,i__2) + *lda * *m) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by M */
ldwrku = *lda;
chunk = *n;
ldwrkr = *lda;
} else /* if(complicated condition) */ {
/* Computing MAX */
i__3 = wrkbl, i__2 = *lda * *n;
if (*lwork >= max(i__3,i__2) + *m * *m) {
/* WORK(IU) is LDA by N and WORK(IR) is M by M */
ldwrku = *lda;
chunk = *n;
ldwrkr = *m;
} else {
/* WORK(IU) is M by CHUNK and WORK(IR) is M by M */
ldwrku = *m;
chunk = (*lwork - *m * *m) / *m;
ldwrkr = *m;
}
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__3, &ierr);
/* Copy L to U, zeroing about above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
i__3 = *m - 1;
i__2 = *m - 1;
lapackf77_zlaset("U", &i__3, &i__2, &c_b1, &c_b1, &u[(u_dim1 << 1)
+ 1], ldu);
/* Generate Q in A
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &a[a_offset], lda, &work[itau],
&work[iwork], &i__3, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in U, copying result to WORK(IR)
(CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
(RWorkspace: need M) */
i__3 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &u[u_offset], ldu, &s[1], &rwork[ie], &work[
itauq], &work[itaup], &work[iwork], &i__3, &ierr);
*/
magma_zgebrd(*m, *m, &u[u_offset], *ldu, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], i__3, &ierr);
lapackf77_zlacpy("U", m, m, &u[u_offset], ldu, &work[ir], &ldwrkr);
/* Generate right vectors bidiagonalizing L in WORK(IR)
(CWorkspace: need M*M+3*M-1, prefer M*M+2*M+(M-1)*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, m, m, &work[ir], &ldwrkr, &work[itaup], &
work[iwork], &i__3, &ierr);
/* Generate left vectors bidiagonalizing L in U
(CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
(RWorkspace: 0) */
i__3 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq], &
work[iwork], &i__3, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of L in U, and computing right
singular vectors of L in WORK(IR)
(CWorkspace: need M*M)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, m, m, &c__0, &s[1], &rwork[ie], &work[ir],
&ldwrkr, &u[u_offset], ldu, cdum, &c__1, &rwork[
irwork], info);
iu = itauq;
/* Multiply right singular vectors of L in WORK(IR) by Q
in A, storing result in WORK(IU) and copying to A
(CWorkspace: need M*M+M, prefer M*M+M*N))
(RWorkspace: 0) */
i__3 = *n;
i__2 = chunk;
for (i__ = 1; i__2 < 0 ? i__ >= i__3 : i__ <= i__3; i__ +=
i__2) {
/* Computing MIN */
i__4 = *n - i__ + 1;
blk = min(i__4,chunk);
blasf77_zgemm("N", "N", m, &blk, m, &c_b2, &work[ir], &
ldwrkr, &a[i__ * a_dim1 + 1], lda, &c_b1, &
work[iu], &ldwrku);
lapackf77_zlacpy("F", m, &blk, &work[iu], &ldwrku, &a[i__ *
a_dim1 + 1], lda);
/* L40: */
}
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__2, &ierr);
/* Copy L to U, zeroing out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &u[(u_dim1 << 1)
+ 1], ldu);
/* Generate Q in A
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in U
(CWorkspace: need 3*M, prefer 2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &u[u_offset], ldu, &s[1], &rwork[ie], &work[
itauq], &work[itaup], &work[iwork], &i__2, &ierr);
*/
magma_zgebrd(*m, *m, &u[u_offset], *ldu, &s[1], &rwork[ie],
&work[itauq], &work[itaup], &work[iwork], i__2, &ierr);
/* Multiply right vectors bidiagonalizing L by Q in A
(CWorkspace: need 2*M+N, prefer 2*M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("P", "L", "C", m, n, m, &u[u_offset], ldu, &work[
itaup], &a[a_offset], lda, &work[iwork], &i__2, &
ierr);
/* Generate left vectors bidiagonalizing L in U
(CWorkspace: need 3*M, prefer 2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq], &
work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U and computing right
singular vectors of A in A
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, n, m, &c__0, &s[1], &rwork[ie], &a[
a_offset], lda, &u[u_offset], ldu, cdum, &c__1, &
rwork[irwork], info);
}
} else if (wntvs) {
if (wntun) {
/* Path 4t(N much larger than M, JOBU_='N', JOBVT='S')
M right singular vectors to be computed in VT and
no left singular vectors to be computed */
if (*lwork >= *m * *m + *m * 3) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
if (*lwork >= wrkbl + *lda * *m) {
/* WORK(IR) is LDA by M */
ldwrkr = *lda;
} else {
/* WORK(IR) is M by M */
ldwrkr = *m;
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy L to WORK(IR), zeroing out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &work[ir], &
ldwrkr);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &work[ir +
ldwrkr], &ldwrkr);
/* Generate Q in A
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IR)
(CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &work[ir], &ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &work[ir], ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Generate right vectors bidiagonalizing L in
WORK(IR)
(CWorkspace: need M*M+3*M, prefer M*M+2*M+(M-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, m, m, &work[ir], &ldwrkr, &work[itaup]
, &work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right
singular vectors of L in WORK(IR)
(CWorkspace: need M*M)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, m, &c__0, &c__0, &s[1], &rwork[ie], &
work[ir], &ldwrkr, cdum, &c__1, cdum, &c__1, &
rwork[irwork], info);
/* Multiply right singular vectors of L in WORK(IR) by
Q in A, storing result in VT
(CWorkspace: need M*M)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, m, &c_b2, &work[ir], &ldwrkr, &
a[a_offset], lda, &c_b1, &vt[vt_offset], ldvt);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy result to VT */
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Zero out above L in A */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &a[(a_dim1 <<
1) + 1], lda);
/* Bidiagonalize L in A
(CWorkspace: need 3*M, prefer 2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &a[a_offset], *lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right vectors bidiagonalizing L by Q in VT
(CWorkspace: need 2*M+N, prefer 2*M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("P", "L", "C", m, n, m, &a[a_offset], lda, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, n, &c__0, &c__0, &s[1], &rwork[ie], &
vt[vt_offset], ldvt, cdum, &c__1, cdum, &c__1,
&rwork[irwork], info);
}
} else if (wntuo) {
/* Path 5t(N much larger than M, JOBU_='O', JOBVT='S')
M right singular vectors to be computed in VT and
M left singular vectors to be overwritten on A */
if (*lwork >= (*m << 1) * *m + *m * 3) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + (*lda << 1) * *m) {
/* WORK(IU) is LDA by M and WORK(IR) is LDA by M */
ldwrku = *lda;
ir = iu + ldwrku * *m;
ldwrkr = *lda;
} else if (*lwork >= wrkbl + (*lda + *m) * *m) {
/* WORK(IU) is LDA by M and WORK(IR) is M by M */
ldwrku = *lda;
ir = iu + ldwrku * *m;
ldwrkr = *m;
} else {
/* WORK(IU) is M by M and WORK(IR) is M by M */
ldwrku = *m;
ir = iu + ldwrku * *m;
ldwrkr = *m;
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q
(CWorkspace: need 2*M*M+2*M, prefer 2*M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy L to WORK(IU), zeroing out below it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &work[iu +
ldwrku], &ldwrku);
/* Generate Q in A
(CWorkspace: need 2*M*M+2*M, prefer 2*M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IU), copying result to
WORK(IR)
(CWorkspace: need 2*M*M+3*M,
prefer 2*M*M+2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &work[iu], &ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &work[iu], ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_zlacpy("L", m, m, &work[iu], &ldwrku, &work[ir], &
ldwrkr);
/* Generate right bidiagonalizing vectors in WORK(IU)
(CWorkspace: need 2*M*M+3*M-1,
prefer 2*M*M+2*M+(M-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, m, m, &work[iu], &ldwrku, &work[itaup]
, &work[iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in WORK(IR)
(CWorkspace: need 2*M*M+3*M, prefer 2*M*M+2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &work[ir], &ldwrkr, &work[itauq]
, &work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of L in WORK(IR) and computing
right singular vectors of L in WORK(IU)
(CWorkspace: need 2*M*M)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, m, m, &c__0, &s[1], &rwork[ie], &work[
iu], &ldwrku, &work[ir], &ldwrkr, cdum, &c__1,
&rwork[irwork], info);
/* Multiply right singular vectors of L in WORK(IU) by
Q in A, storing result in VT
(CWorkspace: need M*M)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, m, &c_b2, &work[iu], &ldwrku, &
a[a_offset], lda, &c_b1, &vt[vt_offset], ldvt);
/* Copy left singular vectors of L to A
(CWorkspace: need M*M)
(RWorkspace: 0) */
lapackf77_zlacpy("F", m, m, &work[ir], &ldwrkr, &a[a_offset],
lda);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Zero out above L in A */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &a[(a_dim1 <<
1) + 1], lda);
/* Bidiagonalize L in A
(CWorkspace: need 3*M, prefer 2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &a[a_offset], *lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right vectors bidiagonalizing L by Q in VT
(CWorkspace: need 2*M+N, prefer 2*M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("P", "L", "C", m, n, m, &a[a_offset], lda, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors of L in A
(CWorkspace: need 3*M, prefer 2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &a[a_offset], lda, &work[itauq],
&work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in A and computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, n, m, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, cdum, &
c__1, &rwork[irwork], info);
}
} else if (wntuas) {
/* Path 6t(N much larger than M, JOBU_='S' or 'A',
JOBVT='S')
M right singular vectors to be computed in VT and
M left singular vectors to be computed in U */
if (*lwork >= *m * *m + *m * 3) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + *lda * *m) {
/* WORK(IU) is LDA by N */
ldwrku = *lda;
} else {
/* WORK(IU) is LDA by M */
ldwrku = *m;
}
itau = iu + ldwrku * *m;
iwork = itau + *m;
/* Compute A=L*Q
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy L to WORK(IU), zeroing out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &work[iu +
ldwrku], &ldwrku);
/* Generate Q in A
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &a[a_offset], lda, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IU), copying result to U
(CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &work[iu], &ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &work[iu], ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_zlacpy("L", m, m, &work[iu], &ldwrku, &u[u_offset],
ldu);
/* Generate right bidiagonalizing vectors in WORK(IU)
(CWorkspace: need M*M+3*M-1,
prefer M*M+2*M+(M-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, m, m, &work[iu], &ldwrku, &work[itaup]
, &work[iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in U
(CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq],
&work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of L in U and computing right
singular vectors of L in WORK(IU)
(CWorkspace: need M*M)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, m, m, &c__0, &s[1], &rwork[ie], &work[
iu], &ldwrku, &u[u_offset], ldu, cdum, &c__1,
&rwork[irwork], info);
/* Multiply right singular vectors of L in WORK(IU) by
Q in A, storing result in VT
(CWorkspace: need M*M)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, m, &c_b2, &work[iu], &ldwrku, &
a[a_offset], lda, &c_b1, &vt[vt_offset], ldvt);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(m, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
/* Copy L to U, zeroing out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &u[u_offset],
ldu);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &u[(u_dim1 <<
1) + 1], ldu);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in U
(CWorkspace: need 3*M, prefer 2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &u[u_offset], ldu, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &u[u_offset], *ldu, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right bidiagonalizing vectors in U by Q
in VT
(CWorkspace: need 2*M+N, prefer 2*M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("P", "L", "C", m, n, m, &u[u_offset], ldu, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in U
(CWorkspace: need 3*M, prefer 2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq],
&work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U and computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, n, m, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, cdum, &
c__1, &rwork[irwork], info);
}
}
} else if (wntva) {
if (wntun) {
/* Path 7t(N much larger than M, JOBU_='N', JOBVT='A')
N right singular vectors to be computed in VT and
no left singular vectors to be computed
Computing MAX */
i__2 = *n + *m, i__3 = *m * 3;
if (*lwork >= *m * *m + max(i__2,i__3)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
if (*lwork >= wrkbl + *lda * *m) {
/* WORK(IR) is LDA by M */
ldwrkr = *lda;
} else {
/* WORK(IR) is M by M */
ldwrkr = *m;
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Copy L to WORK(IR), zeroing out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &work[ir], &
ldwrkr);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &work[ir +
ldwrkr], &ldwrkr);
/* Generate Q in VT
(CWorkspace: need M*M+M+N, prefer M*M+M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(n, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IR)
(CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &work[ir], &ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &work[ir], ldwrkr, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Generate right bidiagonalizing vectors in WORK(IR)
(CWorkspace: need M*M+3*M-1,
prefer M*M+2*M+(M-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, m, m, &work[ir], &ldwrkr, &work[itaup]
, &work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right
singular vectors of L in WORK(IR)
(CWorkspace: need M*M)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, m, &c__0, &c__0, &s[1], &rwork[ie], &
work[ir], &ldwrkr, cdum, &c__1, cdum, &c__1, &
rwork[irwork], info);
/* Multiply right singular vectors of L in WORK(IR) by
Q in VT, storing result in A
(CWorkspace: need M*M)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, m, &c_b2, &work[ir], &ldwrkr, &
vt[vt_offset], ldvt, &c_b1, &a[a_offset], lda);
/* Copy right singular vectors of A from A to VT */
lapackf77_zlacpy("F", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT
(CWorkspace: need M+N, prefer M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(n, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Zero out above L in A */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &a[(a_dim1 <<
1) + 1], lda);
/* Bidiagonalize L in A
(CWorkspace: need 3*M, prefer 2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &a[a_offset], *lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right bidiagonalizing vectors in A by Q
in VT
(CWorkspace: need 2*M+N, prefer 2*M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("P", "L", "C", m, n, m, &a[a_offset], lda, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, n, &c__0, &c__0, &s[1], &rwork[ie], &
vt[vt_offset], ldvt, cdum, &c__1, cdum, &c__1,
&rwork[irwork], info);
}
} else if (wntuo) {
/* Path 8t(N much larger than M, JOBU_='O', JOBVT='A')
N right singular vectors to be computed in VT and
M left singular vectors to be overwritten on A
Computing MAX */
i__2 = *n + *m, i__3 = *m * 3;
if (*lwork >= (*m << 1) * *m + max(i__2,i__3)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + (*lda << 1) * *m) {
/* WORK(IU) is LDA by M and WORK(IR) is LDA by M */
ldwrku = *lda;
ir = iu + ldwrku * *m;
ldwrkr = *lda;
} else if (*lwork >= wrkbl + (*lda + *m) * *m) {
/* WORK(IU) is LDA by M and WORK(IR) is M by M */
ldwrku = *lda;
ir = iu + ldwrku * *m;
ldwrkr = *m;
} else {
/* WORK(IU) is M by M and WORK(IR) is M by M */
ldwrku = *m;
ir = iu + ldwrku * *m;
ldwrkr = *m;
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT
(CWorkspace: need 2*M*M+2*M, prefer 2*M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT
(CWorkspace: need 2*M*M+M+N, prefer 2*M*M+M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(n, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
/* Copy L to WORK(IU), zeroing out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &work[iu +
ldwrku], &ldwrku);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IU), copying result to
WORK(IR)
(CWorkspace: need 2*M*M+3*M,
prefer 2*M*M+2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &work[iu], &ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &work[iu], ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_zlacpy("L", m, m, &work[iu], &ldwrku, &work[ir], &
ldwrkr);
/* Generate right bidiagonalizing vectors in WORK(IU)
(CWorkspace: need 2*M*M+3*M-1,
prefer 2*M*M+2*M+(M-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, m, m, &work[iu], &ldwrku, &work[itaup]
, &work[iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in WORK(IR)
(CWorkspace: need 2*M*M+3*M, prefer 2*M*M+2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &work[ir], &ldwrkr, &work[itauq]
, &work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of L in WORK(IR) and computing
right singular vectors of L in WORK(IU)
(CWorkspace: need 2*M*M)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, m, m, &c__0, &s[1], &rwork[ie], &work[
iu], &ldwrku, &work[ir], &ldwrkr, cdum, &c__1,
&rwork[irwork], info);
/* Multiply right singular vectors of L in WORK(IU) by
Q in VT, storing result in A
(CWorkspace: need M*M)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, m, &c_b2, &work[iu], &ldwrku, &
vt[vt_offset], ldvt, &c_b1, &a[a_offset], lda);
/* Copy right singular vectors of A from A to VT */
lapackf77_zlacpy("F", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Copy left singular vectors of A from WORK(IR) to A */
lapackf77_zlacpy("F", m, m, &work[ir], &ldwrkr, &a[a_offset],
lda);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT
(CWorkspace: need M+N, prefer M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(n, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Zero out above L in A */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &a[(a_dim1 <<
1) + 1], lda);
/* Bidiagonalize L in A
(CWorkspace: need 3*M, prefer 2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &a[a_offset], lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &a[a_offset], *lda, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right bidiagonalizing vectors in A by Q
in VT (CWorkspace: need 2*M+N, prefer 2*M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("P", "L", "C", m, n, m, &a[a_offset], lda, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in A
(CWorkspace: need 3*M, prefer 2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &a[a_offset], lda, &work[itauq],
&work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in A and computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, n, m, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, cdum, &
c__1, &rwork[irwork], info);
}
} else if (wntuas) {
/* Path 9t(N much larger than M, JOBU_='S' or 'A',
JOBVT='A')
N right singular vectors to be computed in VT and
M left singular vectors to be computed in U
Computing MAX */
i__2 = *n + *m, i__3 = *m * 3;
if (*lwork >= *m * *m + max(i__2,i__3)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + *lda * *m) {
/* WORK(IU) is LDA by M */
ldwrku = *lda;
} else {
/* WORK(IU) is M by M */
ldwrku = *m;
}
itau = iu + ldwrku * *m;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT
(CWorkspace: need M*M+2*M, prefer M*M+M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT
(CWorkspace: need M*M+M+N, prefer M*M+M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(n, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
/* Copy L to WORK(IU), zeroing out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &work[iu +
ldwrku], &ldwrku);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IU), copying result to U
(CWorkspace: need M*M+3*M, prefer M*M+2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &work[iu], &ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &work[iu], ldwrku, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_zlacpy("L", m, m, &work[iu], &ldwrku, &u[u_offset],
ldu);
/* Generate right bidiagonalizing vectors in WORK(IU)
(CWorkspace: need M*M+3*M, prefer M*M+2*M+(M-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, m, m, &work[iu], &ldwrku, &work[itaup]
, &work[iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in U
(CWorkspace: need M*M+3*M, prefer M*M+2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq],
&work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of L in U and computing right
singular vectors of L in WORK(IU)
(CWorkspace: need M*M)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, m, m, &c__0, &s[1], &rwork[ie], &work[
iu], &ldwrku, &u[u_offset], ldu, cdum, &c__1,
&rwork[irwork], info);
/* Multiply right singular vectors of L in WORK(IU) by
Q in VT, storing result in A
(CWorkspace: need M*M)
(RWorkspace: 0) */
blasf77_zgemm("N", "N", m, n, m, &c_b2, &work[iu], &ldwrku, &
vt[vt_offset], ldvt, &c_b1, &a[a_offset], lda);
/* Copy right singular vectors of A from A to VT */
lapackf77_zlacpy("F", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT
(CWorkspace: need 2*M, prefer M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT
(CWorkspace: need M+N, prefer M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunglq(n, n, m, &vt[vt_offset], ldvt, &work[itau],
&work[iwork], &i__2, &ierr);
/* Copy L to U, zeroing out above it */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &u[u_offset],
ldu);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_zlaset("U", &i__2, &i__3, &c_b1, &c_b1, &u[(u_dim1 <<
1) + 1], ldu);
ie = 1;
itauq = itau;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in U
(CWorkspace: need 3*M, prefer 2*M+2*M*NB)
(RWorkspace: need M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, m, &u[u_offset], ldu, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork], &
i__2, &ierr);
*/
magma_zgebrd(*m, *m, &u[u_offset], *ldu, &s[1], &rwork[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right bidiagonalizing vectors in U by Q
in VT
(CWorkspace: need 2*M+N, prefer 2*M+N*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zunmbr("P", "L", "C", m, n, m, &u[u_offset], ldu, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in U
(CWorkspace: need 3*M, prefer 2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq],
&work[iwork], &i__2, &ierr);
irwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left
singular vectors of A in U and computing right
singular vectors of A in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("U", m, n, m, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, cdum, &
c__1, &rwork[irwork], info);
}
}
}
} else {
/* N .LT. MNTHR
Path 10t(N greater than M, but not much larger)
Reduce to bidiagonal form without LQ decomposition */
ie = 1;
itauq = 1;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize A
(CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
(RWorkspace: M) */
i__2 = *lwork - iwork + 1;
/*
lapackf77_zgebrd(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
&work[itaup], &work[iwork], &i__2, &ierr);
*/
magma_zgebrd(*m, *n, &a[a_offset], *lda, &s[1], &rwork[ie], &work[itauq],
&work[itaup], &work[iwork], i__2, &ierr);
if (wntuas) {
/* If left singular vectors desired in U, copy result to U
and generate left bidiagonalizing vectors in U
(CWorkspace: need 3*M-1, prefer 2*M+(M-1)*NB)
(RWorkspace: 0) */
lapackf77_zlacpy("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
iwork], &i__2, &ierr);
}
if (wntvas) {
/* If right singular vectors desired in VT, copy result to
VT and generate right bidiagonalizing vectors in VT
(CWorkspace: need 2*M+NRVT, prefer 2*M+NRVT*NB)
(RWorkspace: 0) */
lapackf77_zlacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
if (wntva) {
nrvt = *n;
}
if (wntvs) {
nrvt = *m;
}
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", &nrvt, n, m, &vt[vt_offset], ldvt, &work[itaup],
&work[iwork], &i__2, &ierr);
}
if (wntuo) {
/* If left singular vectors desired in A, generate left
bidiagonalizing vectors in A
(CWorkspace: need 3*M-1, prefer 2*M+(M-1)*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("Q", m, m, n, &a[a_offset], lda, &work[itauq], &work[
iwork], &i__2, &ierr);
}
if (wntvo) {
/* If right singular vectors desired in A, generate right
bidiagonalizing vectors in A
(CWorkspace: need 3*M, prefer 2*M+M*NB)
(RWorkspace: 0) */
i__2 = *lwork - iwork + 1;
lapackf77_zungbr("P", m, n, m, &a[a_offset], lda, &work[itaup], &work[
iwork], &i__2, &ierr);
}
irwork = ie + *m;
if (wntuas || wntuo) {
nru = *m;
}
if (wntun) {
nru = 0;
}
if (wntvas || wntvo) {
ncvt = *n;
}
if (wntvn) {
ncvt = 0;
}
if (! wntuo && ! wntvo) {
/* Perform bidiagonal QR iteration, if desired, computing
left singular vectors in U and computing right singular
vectors in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("L", m, &ncvt, &nru, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, cdum, &c__1, &
rwork[irwork], info);
} else if (! wntuo && wntvo) {
/* Perform bidiagonal QR iteration, if desired, computing
left singular vectors in U and computing right singular
vectors in A
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("L", m, &ncvt, &nru, &c__0, &s[1], &rwork[ie], &a[
a_offset], lda, &u[u_offset], ldu, cdum, &c__1, &
rwork[irwork], info);
} else {
/* Perform bidiagonal QR iteration, if desired, computing
left singular vectors in A and computing right singular
vectors in VT
(CWorkspace: 0)
(RWorkspace: need BDSPAC) */
lapackf77_zbdsqr("L", m, &ncvt, &nru, &c__0, &s[1], &rwork[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, cdum, &c__1, &
rwork[irwork], info);
}
}
}
/* Undo scaling if necessary */
if (iscl == 1) {
if (anrm > bignum) {
lapackf77_dlascl("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1,
&s[1], &minmn, &ierr);
}
if (*info != 0 && anrm > bignum) {
i__2 = minmn - 1;
lapackf77_dlascl("G", &c__0, &c__0, &bignum, &anrm, &i__2, &c__1,
&rwork[ie], &minmn, &ierr);
}
if (anrm < smlnum) {
lapackf77_dlascl("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1,
&s[1], &minmn, &ierr);
}
if (*info != 0 && anrm < smlnum) {
i__2 = minmn - 1;
lapackf77_dlascl("G", &c__0, &c__0, &smlnum, &anrm, &i__2, &c__1,
&rwork[ie], &minmn, &ierr);
}
}
return *info;
} /* magma_zgesvd */

Here is the caller graph for this function:

magma_int_t magma_zgetrf ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  A,
magma_int_t  lda,
magma_int_t ipiv,
magma_int_t info 
)

Definition at line 37 of file zgetrf.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGETRF computes an LU factorization of a general M-by-N matrix A
using partial pivoting with row interchanges. This version does not
require work space on the GPU passed as input. GPU memory is allocated
in the routine.
The factorization has the form
A = P * L * U
where P is a permutation matrix, L is lower triangular with unit
diagonal elements (lower trapezoidal if m > n), and U is upper
triangular (upper trapezoidal if m < n).
This is the right-looking Level 3 BLAS version of the algorithm.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array, dimension (LDA,N)
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
> 0: if INFO = i, U(i,i) is exactly zero. The factorization
has been completed, but the factor U is exactly
singular, and division by zero will occur if it is used
to solve a system of equations.
===================================================================== */
#define inAT(i,j) (dAT + (i)*nb*ldda + (j)*nb)
cuDoubleComplex *dAT, *dA, *da, *work;
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
magma_int_t iinfo, nb;
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0)
*info = -2;
else if (lda < max(1,m))
*info = -4;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0)
return *info;
if ( (nb <= 1) || (nb >= min(m,n)) ) {
/* Use CPU code. */
lapackf77_zgetrf(&m, &n, a, &lda, ipiv, info);
} else {
/* Use hybrid blocked code. */
magma_int_t maxm, maxn, ldda, maxdim;
magma_int_t i, rows, cols, s = min(m, n)/nb;
char * num_gpus_char = getenv("MAGMA_NUM_GPUS");
magma_int_t num_gpus = 1;
if( num_gpus_char != NULL ) num_gpus = atoi(num_gpus_char);
if( num_gpus_char != NULL && num_gpus >= 1 ) {
/* call multi-GPU non-GPU-resident interface */
magma_int_t rval = magma_zgetrf3_ooc(num_gpus, m, n, a, lda, ipiv, info);
if( *info >= 0 ) magma_zgetrf2_piv( num_gpus, m, n, a, lda, ipiv, info);
return *info;
//return magma_zgetrf3(num_gpus, m, n, a, lda, ipiv, info);
}
maxm = ((m + 31)/32)*32;
maxn = ((n + 31)/32)*32;
maxdim = max(maxm, maxn);
ldda = maxn;
work = a;
if (maxdim*maxdim < 2*maxm*maxn)
{
if (MAGMA_SUCCESS != magma_zmalloc( &dA, nb*maxm + maxdim*maxdim )) {
/* alloc failed so call non-GPU-resident version */
magma_int_t rval = magma_zgetrf3_ooc(num_gpus, m, n, a, lda, ipiv, info);
if( *info >= 0 ) magma_zgetrf2_piv( num_gpus, m, n, a, lda, ipiv, info);
return *info;
//magma_int_t rval = magma_zgetrf_ooc(m, n, a, lda, ipiv, info);
//if( *info == 0 ) magma_zgetrf_piv( m, n, a, lda, ipiv, info);
//return *info;
}
da = dA + nb*maxm;
ldda = maxdim;
magma_zsetmatrix( m, n, a, lda, da, ldda );
dAT = da;
magmablas_zinplace_transpose( dAT, ldda, ldda );
}
else
{
if (MAGMA_SUCCESS != magma_zmalloc( &dA, (nb + maxn)*maxm )) {
/* alloc failed so call non-GPU-resident version */
magma_int_t rval = magma_zgetrf3_ooc(num_gpus, m, n, a, lda, ipiv, info);
if( *info >= 0 ) magma_zgetrf2_piv( num_gpus, m, n, a, lda, ipiv, info);
return *info;
//magma_int_t rval = magma_zgetrf_ooc(m, n, a, lda, ipiv, info);
//if( *info == 0 )magma_zgetrf_piv( m, n, a, lda, ipiv, info);
//return *info;
}
da = dA + nb*maxm;
magma_zsetmatrix( m, n, a, lda, da, maxm );
if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) {
/* alloc failed so call non-GPU-resident version */
magma_free( dA );
magma_int_t rval = magma_zgetrf3_ooc(num_gpus, m, n, a, lda, ipiv, info);
if( *info >= 0 ) magma_zgetrf2_piv( num_gpus, m, n, a, lda, ipiv, info);
return *info;
//magma_int_t rval = magma_zgetrf_ooc(m, n, a, lda, ipiv, info);
//magma_zgetrf_piv( m, n, a, lda, ipiv, info);
//return *info;
}
magmablas_ztranspose2( dAT, ldda, da, maxm, m, n );
}
lapackf77_zgetrf( &m, &nb, work, &lda, ipiv, &iinfo);
for( i = 0; i < s; i++ )
{
// download i-th panel
cols = maxm - i*nb;
if (i>0){
magmablas_ztranspose( dA, cols, inAT(i,i), ldda, nb, cols );
magma_zgetmatrix( m-i*nb, nb, dA, cols, work, lda );
// make sure that gpu queue is empty
n - (i+1)*nb, nb,
c_one, inAT(i-1,i-1), ldda,
inAT(i-1,i+1), ldda );
n-(i+1)*nb, m-i*nb, nb,
c_neg_one, inAT(i-1,i+1), ldda,
inAT(i, i-1), ldda,
c_one, inAT(i, i+1), ldda );
// do the cpu part
rows = m - i*nb;
lapackf77_zgetrf( &rows, &nb, work, &lda, ipiv+i*nb, &iinfo);
}
if (*info == 0 && iinfo > 0)
*info = iinfo + i*nb;
magmablas_zpermute_long2( dAT, ldda, ipiv, nb, i*nb );
// upload i-th panel
magma_zsetmatrix( m-i*nb, nb, work, lda, dA, cols );
magmablas_ztranspose( inAT(i,i), ldda, dA, cols, cols, nb);
// do the small non-parallel computations
if (s > (i+1)){
nb, nb,
c_one, inAT(i, i ), ldda,
inAT(i, i+1), ldda);
nb, m-(i+1)*nb, nb,
c_neg_one, inAT(i, i+1), ldda,
inAT(i+1, i ), ldda,
c_one, inAT(i+1, i+1), ldda );
}
else{
n-s*nb, nb,
c_one, inAT(i, i ), ldda,
inAT(i, i+1), ldda);
n-(i+1)*nb, m-(i+1)*nb, nb,
c_neg_one, inAT(i, i+1), ldda,
inAT(i+1, i ), ldda,
c_one, inAT(i+1, i+1), ldda );
}
}
magma_int_t nb0 = min(m - s*nb, n - s*nb);
if ( nb0 > 0 ) {
rows = m - s*nb;
cols = maxm - s*nb;
magmablas_ztranspose2( dA, cols, inAT(s,s), ldda, nb0, rows);
magma_zgetmatrix( rows, nb0, dA, cols, work, lda );
// make sure that gpu queue is empty
// do the cpu part
lapackf77_zgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo);
if (*info == 0 && iinfo > 0)
*info = iinfo + s*nb;
magmablas_zpermute_long2( dAT, ldda, ipiv, nb0, s*nb );
magma_zsetmatrix( rows, nb0, work, lda, dA, cols );
magmablas_ztranspose2( inAT(s,s), ldda, dA, cols, rows, nb0);
n-s*nb-nb0, nb0,
c_one, inAT(s, s), ldda,
inAT(s, s)+nb0, ldda);
}
if (maxdim*maxdim< 2*maxm*maxn){
magmablas_zinplace_transpose( dAT, ldda, ldda );
magma_zgetmatrix( m, n, da, ldda, a, lda );
} else {
magmablas_ztranspose2( da, maxm, dAT, ldda, n, m );
magma_zgetmatrix( m, n, da, maxm, a, lda );
magma_free( dAT );
}
magma_free( dA );
}
return *info;
} /* magma_zgetrf */

Here is the caller graph for this function:

magma_int_t magma_zgetrf2 ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  a,
magma_int_t  lda,
magma_int_t ipiv,
magma_int_t info 
)
magma_int_t magma_zgetrf_gpu ( magma_int_t  m,
magma_int_t  n,
cuDoubleComplex *  dA,
magma_int_t  ldda,
magma_int_t ipiv,
magma_int_t info 
)

Definition at line 22 of file zgetrf_gpu.cpp.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZGETRF computes an LU factorization of a general M-by-N matrix A
using partial pivoting with row interchanges.
The factorization has the form
A = P * L * U
where P is a permutation matrix, L is lower triangular with unit
diagonal elements (lower trapezoidal if m > n), and U is upper
triangular (upper trapezoidal if m < n).
This is the right-looking Level 3 BLAS version of the algorithm.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
LDDA (input) INTEGER
The leading dimension of the array A. LDDA >= max(1,M).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
> 0: if INFO = i, U(i,i) is exactly zero. The factorization
has been completed, but the factor U is exactly
singular, and division by zero will occur if it is used
to solve a system of equations.
===================================================================== */
#define inAT(i,j) (dAT + (i)*nb*lddat + (j)*nb)
cuDoubleComplex c_one = MAGMA_Z_ONE;
cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
magma_int_t iinfo, nb;
magma_int_t maxm, maxn, mindim;
magma_int_t i, rows, cols, s, lddat, lddwork;
cuDoubleComplex *dAT, *dAP, *work;
/* Check arguments */
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0)
*info = -2;
else if (ldda <