MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
magma_s.h File Reference
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros

#define PRECISION_s

Functions

magma_int_t magma_sgebrd (magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *d, float *e, float *tauq, float *taup, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgehrd2 (magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t *lwork, magma_int_t *info)
magma_int_t magma_sgehrd (magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, float *d_T, magma_int_t *info)
magma_int_t magma_sgelqf (magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgeqlf (magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgeqrf (magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgeqrf4 (magma_int_t num_gpus, magma_int_t m, magma_int_t n, float *a, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgeqrf_ooc (magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgesv (magma_int_t n, magma_int_t nrhs, float *A, magma_int_t lda, magma_int_t *ipiv, float *B, magma_int_t ldb, magma_int_t *info)
magma_int_t magma_sgetrf (magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_sgetrf2 (magma_int_t m, magma_int_t n, float *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_slatrd (char uplo, magma_int_t n, magma_int_t nb, float *a, magma_int_t lda, float *e, float *tau, float *w, magma_int_t ldw, float *da, magma_int_t ldda, float *dw, magma_int_t lddw)
magma_int_t magma_slatrd2 (char uplo, magma_int_t n, magma_int_t nb, float *a, magma_int_t lda, float *e, float *tau, float *w, magma_int_t ldw, float *da, magma_int_t ldda, float *dw, magma_int_t lddw, float *dwork, magma_int_t ldwork)
magma_int_t magma_slahr2 (magma_int_t m, magma_int_t n, magma_int_t nb, float *da, float *dv, float *a, magma_int_t lda, float *tau, float *t, magma_int_t ldt, float *y, magma_int_t ldy)
magma_int_t magma_slahru (magma_int_t n, magma_int_t ihi, magma_int_t k, magma_int_t nb, float *a, magma_int_t lda, float *da, float *y, float *v, float *t, float *dwork)
magma_int_t magma_sposv (char uplo, magma_int_t n, magma_int_t nrhs, float *A, magma_int_t lda, float *B, magma_int_t ldb, magma_int_t *info)
magma_int_t magma_spotrf (char uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info)
magma_int_t magma_spotri (char uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info)
magma_int_t magma_slauum (char uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info)
magma_int_t magma_strtri (char uplo, char diag, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info)
magma_int_t magma_ssytrd (char uplo, magma_int_t n, float *A, magma_int_t lda, float *d, float *e, float *tau, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sorgqr (magma_int_t m, magma_int_t n, magma_int_t k, float *a, magma_int_t lda, float *tau, float *dwork, magma_int_t nb, magma_int_t *info)
magma_int_t magma_sormql (const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, float *a, magma_int_t lda, float *tau, float *c, magma_int_t ldc, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sormqr (char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, float *a, magma_int_t lda, float *tau, float *c, magma_int_t ldc, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sormtr (char side, char uplo, char trans, magma_int_t m, magma_int_t n, float *a, magma_int_t lda, float *tau, float *c, magma_int_t ldc, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sorghr (magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *a, magma_int_t lda, float *tau, float *dT, magma_int_t nb, magma_int_t *info)
magma_int_t magma_ssyev (char jobz, char uplo, magma_int_t n, float *a, magma_int_t lda, float *w, float *work, magma_int_t lwork, float *rwork, magma_int_t *info)
magma_int_t magma_ssyevx (char jobz, char range, char uplo, magma_int_t n, float *a, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, float *z, magma_int_t ldz, float *work, magma_int_t lwork, float *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info)
magma_int_t magma_sgeev (char jobvl, char jobvr, magma_int_t n, float *a, magma_int_t lda, float *wr, float *wi, float *vl, magma_int_t ldvl, float *vr, magma_int_t ldvr, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgeqp3 (magma_int_t *m, magma_int_t *n, float *a, magma_int_t *lda, magma_int_t *jpvt, float *tau, float *work, magma_int_t *lwork, magma_int_t *info)
magma_int_t magma_sgesvd (char jobu, char jobvt, magma_int_t m, magma_int_t n, float *a, magma_int_t lda, float *s, float *u, magma_int_t ldu, float *vt, magma_int_t ldvt, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_ssyevd (char jobz, char uplo, magma_int_t n, float *a, magma_int_t lda, float *w, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
magma_int_t magma_ssygvd (magma_int_t itype, char jobz, char uplo, magma_int_t n, float *a, magma_int_t lda, float *b, magma_int_t ldb, float *w, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
magma_int_t magma_sstedx (char range, magma_int_t n, float vl, float vu, magma_int_t il, magma_int_t iu, float *d, float *e, float *z, magma_int_t ldz, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, float *dwork, magma_int_t *info)
magma_int_t magma_slaex0 (magma_int_t n, float *d, float *e, float *q, magma_int_t ldq, float *work, magma_int_t *iwork, float *dwork, char range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *info)
magma_int_t magma_slaex1 (magma_int_t n, float *d, float *q, magma_int_t ldq, magma_int_t *indxq, float rho, magma_int_t cutpnt, float *work, magma_int_t *iwork, float *dwork, char range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *info)
magma_int_t magma_slaex3 (magma_int_t k, magma_int_t n, magma_int_t n1, float *d, float *q, magma_int_t ldq, float rho, float *dlamda, float *q2, magma_int_t *indx, magma_int_t *ctot, float *w, float *s, magma_int_t *indxq, float *dwork, char range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *info)
magma_int_t magma_ssygst (magma_int_t itype, char uplo, magma_int_t n, float *a, magma_int_t lda, float *b, magma_int_t ldb, magma_int_t *info)
magma_int_t magma_sgels_gpu (char trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgels3_gpu (char trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgelqf_gpu (magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, float *tau, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgeqrf_gpu (magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, float *tau, float *dT, magma_int_t *info)
magma_int_t magma_sgeqrf2_gpu (magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, float *tau, magma_int_t *info)
magma_int_t magma_sgeqrf2_mgpu (magma_int_t num_gpus, magma_int_t m, magma_int_t n, float **dlA, magma_int_t ldda, float *tau, magma_int_t *info)
magma_int_t magma_sgeqrf3_gpu (magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, float *tau, float *dT, magma_int_t *info)
magma_int_t magma_sgeqrs_gpu (magma_int_t m, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *tau, float *dT, float *dB, magma_int_t lddb, float *hwork, magma_int_t lhwork, magma_int_t *info)
magma_int_t magma_sgeqrs3_gpu (magma_int_t m, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *tau, float *dT, float *dB, magma_int_t lddb, float *hwork, magma_int_t lhwork, magma_int_t *info)
magma_int_t magma_sgessm_gpu (char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, float *dL1, magma_int_t lddl1, float *dL, magma_int_t lddl, float *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_sgesv_gpu (magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, magma_int_t *ipiv, float *dB, magma_int_t lddb, magma_int_t *info)
magma_int_t magma_sgetrl_gpu (char storev, magma_int_t m, magma_int_t n, magma_int_t ib, float *hA, magma_int_t ldha, float *dA, magma_int_t ldda, float *hL, magma_int_t ldhl, float *dL, magma_int_t lddl, magma_int_t *ipiv, float *dwork, magma_int_t lddwork, magma_int_t *info)
magma_int_t magma_sgetrf_gpu (magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_sgetrf_nopiv_gpu (magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_sgetri_gpu (magma_int_t n, float *dA, magma_int_t ldda, magma_int_t *ipiv, float *dwork, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_sgetrs_gpu (char trans, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, magma_int_t *ipiv, float *dB, magma_int_t lddb, magma_int_t *info)
magma_int_t magma_slabrd_gpu (magma_int_t m, magma_int_t n, magma_int_t nb, float *a, magma_int_t lda, float *da, magma_int_t ldda, float *d, float *e, float *tauq, float *taup, float *x, magma_int_t ldx, float *dx, magma_int_t lddx, float *y, magma_int_t ldy, float *dy, magma_int_t lddy)
magma_int_t magma_slarfb_gpu (char side, char trans, char direct, char storev, magma_int_t m, magma_int_t n, magma_int_t k, float *dv, magma_int_t ldv, float *dt, magma_int_t ldt, float *dc, magma_int_t ldc, float *dowrk, magma_int_t ldwork)
magma_int_t magma_sposv_gpu (char uplo, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *dB, magma_int_t lddb, magma_int_t *info)
magma_int_t magma_spotrf_gpu (char uplo, magma_int_t n, float *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_spotri_gpu (char uplo, magma_int_t n, float *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_slauum_gpu (char uplo, magma_int_t n, float *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_strtri_gpu (char uplo, char diag, magma_int_t n, float *dA, magma_int_t ldda, magma_int_t *info)
magma_int_t magma_ssytrd_gpu (char uplo, magma_int_t n, float *da, magma_int_t ldda, float *d, float *e, float *tau, float *wa, magma_int_t ldwa, float *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_ssytrd2_gpu (char uplo, magma_int_t n, float *da, magma_int_t ldda, float *d, float *e, float *tau, float *wa, magma_int_t ldwa, float *work, magma_int_t lwork, float *dwork, magma_int_t ldwork, magma_int_t *info)
magma_int_t magma_spotrs_gpu (char uplo, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *dB, magma_int_t lddb, magma_int_t *info)
magma_int_t magma_sssssm_gpu (char storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, float *dA1, magma_int_t ldda1, float *dA2, magma_int_t ldda2, float *dL1, magma_int_t lddl1, float *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info)
magma_int_t magma_ststrf_gpu (char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, float *hU, magma_int_t ldhu, float *dU, magma_int_t lddu, float *hA, magma_int_t ldha, float *dA, magma_int_t ldda, float *hL, magma_int_t ldhl, float *dL, magma_int_t lddl, magma_int_t *ipiv, float *hwork, magma_int_t ldhwork, float *dwork, magma_int_t lddwork, magma_int_t *info)
magma_int_t magma_sorgqr_gpu (magma_int_t m, magma_int_t n, magma_int_t k, float *da, magma_int_t ldda, float *tau, float *dwork, magma_int_t nb, magma_int_t *info)
magma_int_t magma_sormql2_gpu (const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, float *da, magma_int_t ldda, float *tau, float *dc, magma_int_t lddc, float *wa, magma_int_t ldwa, magma_int_t *info)
magma_int_t magma_sormqr_gpu (char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, float *a, magma_int_t lda, float *tau, float *c, magma_int_t ldc, float *work, magma_int_t lwork, float *td, magma_int_t nb, magma_int_t *info)
magma_int_t magma_sormqr2_gpu (const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, float *da, magma_int_t ldda, float *tau, float *dc, magma_int_t lddc, float *wa, magma_int_t ldwa, magma_int_t *info)
magma_int_t magma_sormtr_gpu (char side, char uplo, char trans, magma_int_t m, magma_int_t n, float *da, magma_int_t ldda, float *tau, float *dc, magma_int_t lddc, float *wa, magma_int_t ldwa, magma_int_t *info)
magma_int_t magma_ssyevd_gpu (char jobz, char uplo, magma_int_t n, float *da, magma_int_t ldda, float *w, float *wa, magma_int_t ldwa, float *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
magma_int_t magma_ssyevx_gpu (char jobz, char range, char uplo, magma_int_t n, float *da, magma_int_t ldda, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, float *dz, magma_int_t lddz, float *wa, magma_int_t ldwa, float *wz, magma_int_t ldwz, float *work, magma_int_t lwork, float *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info)
magma_int_t magma_ssygst_gpu (magma_int_t itype, char uplo, magma_int_t n, float *da, magma_int_t ldda, float *db, magma_int_t lddb, magma_int_t *info)
void magma_sprint (int m, int n, float *A, int lda)
void magma_sprint_gpu (int m, int n, float *dA, int ldda)
void spanel_to_q (char uplo, int ib, float *A, int lda, float *work)
void sq_to_panel (char uplo, int ib, float *A, int lda, float *work)

Macro Definition Documentation

#define PRECISION_s

Definition at line 13 of file magma_s.h.


Function Documentation

magma_int_t magma_sgebrd ( magma_int_t  m,
magma_int_t  n,
float *  A,
magma_int_t  lda,
float *  d,
float *  e,
float *  tauq,
float *  taup,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 24 of file sgebrd.cpp.

References __func__, A, dA, dwork, lapackf77_sgebrd(), MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_sgebrd_nb(), MAGMA_S_MAKE, MAGMA_S_NEG_ONE, MAGMA_S_ONE, MAGMA_S_SET2REAL, magma_sgemm, magma_sgetmatrix(), magma_slabrd_gpu(), magma_smalloc(), magma_ssetmatrix(), MAGMA_SUCCESS, magma_xerbla(), MagmaNoTrans, MagmaTrans, max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEBRD reduces a general real M-by-N matrix A to upper or lower
bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
Arguments
=========
M (input) INTEGER
The number of rows in the matrix A. M >= 0.
N (input) INTEGER
The number of columns in the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N general matrix to be reduced.
On exit,
if m >= n, the diagonal and the first superdiagonal are
overwritten with the upper bidiagonal matrix B; the
elements below the diagonal, with the array TAUQ, represent
the orthogonal matrix Q as a product of elementary
reflectors, and the elements above the first superdiagonal,
with the array TAUP, represent the orthogonal matrix P as
a product of elementary reflectors;
if m < n, the diagonal and the first subdiagonal are
overwritten with the lower bidiagonal matrix B; the
elements below the first subdiagonal, with the array TAUQ,
represent the orthogonal matrix Q as a product of
elementary reflectors, and the elements above the diagonal,
with the array TAUP, represent the orthogonal matrix P as
a product of elementary reflectors.
See Further Details.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
D (output) real array, dimension (min(M,N))
The diagonal elements of the bidiagonal matrix B:
D(i) = A(i,i).
E (output) real array, dimension (min(M,N)-1)
The off-diagonal elements of the bidiagonal matrix B:
if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.
TAUQ (output) REAL array dimension (min(M,N))
The scalar factors of the elementary reflectors which
represent the orthogonal matrix Q. See Further Details.
TAUP (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors which
represent the orthogonal matrix P. See Further Details.
WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The length of the array WORK. LWORK >= max(1,M,N).
For optimum performance LWORK >= (M+N)*NB, where NB
is the optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value.
Further Details
===============
The matrices Q and P are represented as products of elementary
reflectors:
If m >= n,
Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1)
Each H(i) and G(i) has the form:
H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
where tauq and taup are real scalars, and v and u are real vectors;
v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
tauq is stored in TAUQ(i) and taup in TAUP(i).
If m < n,
Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m)
Each H(i) and G(i) has the form:
H(i) = I - tauq * v * v' and G(i) = I - taup * u * u'
where tauq and taup are real scalars, and v and u are real vectors;
v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
tauq is stored in TAUQ(i) and taup in TAUP(i).
The contents of A on exit are illustrated by the following examples:
m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n):
( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 )
( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 )
( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 )
( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 )
( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 )
( v1 v2 v3 v4 v5 )
where d and e denote diagonal and off-diagonal elements of B, vi
denotes an element of the vector defining H(i), and ui an element of
the vector defining G(i).
===================================================================== */
float c_neg_one = MAGMA_S_NEG_ONE;
float c_one = MAGMA_S_ONE;
float *da, *dwork;
magma_int_t ncol, nrow, jmax, nb, ldda;
static magma_int_t i, j, nx;
static float ws;
static magma_int_t iinfo;
magma_int_t minmn;
magma_int_t ldwrkx, ldwrky, lwkopt;
magma_int_t lquery;
ldda = m;
lwkopt = (m + n) * nb;
work[0] = MAGMA_S_MAKE( lwkopt, 0. );
lquery = (lwork == -1);
/* Check arguments */
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if ( (lwork < max( max(1, m), n)) && (! lquery) ) {
*info = -10;
}
if (*info < 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Quick return if possible */
minmn = min(m,n);
if (minmn == 0) {
work[0] = c_one;
return *info;
}
if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda + (m + n)*nb )) {
fprintf (stderr, "!!!! device memory allocation error in sgebrd\n" );
return *info;
}
dwork = da + (n)*ldda;
MAGMA_S_SET2REAL( ws, max(m,n) );
ldwrkx = m;
ldwrky = n;
/* Set the block/unblock crossover point NX. */
nx = 128;
/* Copy the matrix to the GPU */
if (minmn-nx>=1)
magma_ssetmatrix( m, n, a, lda, da, ldda );
for (i=0; i< (minmn - nx); i += nb) {
/* Reduce rows and columns i:i+nb-1 to bidiagonal form and return
the matrices X and Y which are needed to update the unreduced
part of the matrix */
nrow = m - i;
ncol = n - i;
/* Get the current panel (no need for the 1st iteration) */
if ( i > 0 ) {
magma_sgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda );
magma_sgetmatrix( nb, ncol - nb,
dA(i, i+nb), ldda,
A( i, i+nb), lda );
}
magma_slabrd_gpu(nrow, ncol, nb,
A(i, i), lda, dA(i, i), ldda,
d+i, e+i, tauq+i, taup+i,
work, ldwrkx, dwork, ldwrkx, // x, dx
work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky); // y, dy
/* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update
of the form A := A - V*Y' - X*U' */
nrow = m - i - nb;
ncol = n - i - nb;
// Send Y back to the GPU
magma_ssetmatrix( nrow, nb, work + nb, ldwrkx, dwork + nb, ldwrkx );
magma_ssetmatrix( ncol, nb,
work + (ldwrkx+1)*nb, ldwrky,
dwork + (ldwrkx+1)*nb, ldwrky );
nrow, ncol, nb,
c_neg_one, dA(i+nb, i ), ldda,
dwork+(ldwrkx+1)*nb, ldwrky,
c_one, dA(i+nb, i+nb), ldda);
nrow, ncol, nb,
c_neg_one, dwork+nb, ldwrkx,
dA( i, i+nb ), ldda,
c_one, dA( i+nb, i+nb ), ldda);
/* Copy diagonal and off-diagonal elements of B back into A */
if (m >= n) {
jmax = i + nb;
for (j = i; j < jmax; ++j) {
*A(j, j ) = MAGMA_S_MAKE( d[j], 0. );
*A(j, j+1) = MAGMA_S_MAKE( e[j], 0. );
}
} else {
jmax = i + nb;
for (j = i; j < jmax; ++j) {
*A(j, j ) = MAGMA_S_MAKE( d[j], 0. );
*A(j+1, j ) = MAGMA_S_MAKE( e[j], 0. );
/* L20: */
}
}
}
/* Use unblocked code to reduce the remainder of the matrix */
nrow = m - i;
ncol = n - i;
if ( 0 < (minmn-nx) )
magma_sgetmatrix( nrow, ncol, dA(i, i), ldda, A( i, i), lda );
lapackf77_sgebrd( &nrow, &ncol,
A(i, i), &lda, d+i, e+i,
tauq+i, taup+i, work, &lwork, &iinfo);
work[0] = ws;
magma_free( da );
return *info;
} /* sgebrd_ */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeev ( char  jobvl,
char  jobvr,
magma_int_t  n,
float *  a,
magma_int_t  lda,
float *  wr,
float *  wi,
float *  vl,
magma_int_t  ldvl,
float *  vr,
magma_int_t  ldvr,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 25 of file sgeev.cpp.

References __func__, cblas_isamax(), cblas_snrm2(), cblas_srot(), cblas_sscal(), dT, lapackf77_lsame, lapackf77_sgebak(), lapackf77_sgebal(), lapackf77_sgehrd(), lapackf77_shseqr(), lapackf77_slabad, lapackf77_slacpy(), lapackf77_slamch, lapackf77_slange(), lapackf77_slapy2, lapackf77_slartg(), lapackf77_slascl(), lapackf77_sorghr(), lapackf77_strevc(), MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_sgehrd_nb(), magma_sgehrd(), magma_sgehrd2(), magma_smalloc(), magma_sorghr(), magma_ssqrt, MAGMA_SUCCESS, magma_xerbla(), MagmaLowerStr, max, side, and codegen::work.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEEV computes for an N-by-N real nonsymmetric matrix A, the
eigenvalues and, optionally, the left and/or right eigenvectors.
The right eigenvector v(j) of A satisfies
A * v(j) = lambda(j) * v(j)
where lambda(j) is its eigenvalue.
The left eigenvector u(j) of A satisfies
u(j)**T * A = lambda(j) * u(j)**T
where u(j)**T denotes the transpose of u(j).
The computed eigenvectors are normalized to have Euclidean norm
equal to 1 and largest component real.
Arguments
=========
JOBVL (input) CHARACTER*1
= 'N': left eigenvectors of A are not computed;
= 'V': left eigenvectors of are computed.
JOBVR (input) CHARACTER*1
= 'N': right eigenvectors of A are not computed;
= 'V': right eigenvectors of A are computed.
N (input) INTEGER
The order of the matrix A. N >= 0.
A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
On entry, the N-by-N matrix A.
On exit, A has been overwritten.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
WR (output) DOUBLE PRECISION array, dimension (N)
WI (output) DOUBLE PRECISION array, dimension (N)
WR and WI contain the real and imaginary parts,
respectively, of the computed eigenvalues. Complex
conjugate pairs of eigenvalues appear consecutively
with the eigenvalue having the positive imaginary part
first.
VL (output) DOUBLE PRECISION array, dimension (LDVL,N)
If JOBVL = 'V', the left eigenvectors u(j) are stored one
after another in the columns of VL, in the same order
as their eigenvalues.
If JOBVL = 'N', VL is not referenced.
u(j) = VL(:,j), the j-th column of VL.
LDVL (input) INTEGER
The leading dimension of the array VL. LDVL >= 1; if
JOBVL = 'V', LDVL >= N.
VR (output) DOUBLE PRECISION array, dimension (LDVR,N)
If JOBVR = 'V', the right eigenvectors v(j) are stored one
after another in the columns of VR, in the same order
as their eigenvalues.
If JOBVR = 'N', VR is not referenced.
v(j) = VR(:,j), the j-th column of VR.
LDVR (input) INTEGER
The leading dimension of the array VR. LDVR >= 1; if
JOBVR = 'V', LDVR >= N.
WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= (1+nb)*N.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value.
> 0: if INFO = i, the QR algorithm failed to compute all the
eigenvalues, and no eigenvectors have been computed;
elements and i+1:N of W contain eigenvalues which have
converged.
===================================================================== */
magma_int_t c__1 = 1;
magma_int_t c__0 = 0;
magma_int_t c_n1 = -1;
magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
i__2, i__3;
float d__1, d__2;
magma_int_t i__, k, ihi, ilo;
float r__, cs, sn, scl;
float dum[1], eps;
float anrm;
magma_int_t ierr, itau, iwrk, nout;
magma_int_t scalea;
float cscale;
float bignum;
magma_int_t minwrk;
magma_int_t wantvl;
float smlnum;
magma_int_t lquery, wantvr, select[1];
magma_int_t nb = 0;
float *dT = NULL;
//magma_timestr_t start, end;
char side[2] = {0, 0};
char jobvl_[2] = {jobvl, 0};
char jobvr_[2] = {jobvr, 0};
*info = 0;
lquery = lwork == -1;
wantvl = lapackf77_lsame(jobvl_, "V");
wantvr = lapackf77_lsame(jobvr_, "V");
if (! wantvl && ! lapackf77_lsame(jobvl_, "N")) {
*info = -1;
} else if (! wantvr && ! lapackf77_lsame(jobvr_, "N")) {
*info = -2;
} else if (n < 0) {
*info = -3;
} else if (lda < max(1,n)) {
*info = -5;
} else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
*info = -9;
} else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
*info = -11;
}
/* Compute workspace */
if (*info == 0) {
minwrk = (2+nb)*n;
work[0] = (float) minwrk;
if (lwork < minwrk && ! lquery) {
*info = -13;
}
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (n == 0) {
return *info;
}
// if eigenvectors are needed
#if defined(VERSION3)
if (MAGMA_SUCCESS != magma_smalloc( &dT, nb*n )) {
return *info;
}
#endif
// subtract row and col for 1-based indexing
a_dim1 = lda;
a_offset = 1 + a_dim1;
a -= a_offset;
vl_dim1 = ldvl;
vl_offset = 1 + vl_dim1;
vl -= vl_offset;
vr_dim1 = ldvr;
vr_offset = 1 + vr_dim1;
vr -= vr_offset;
--work;
/* Get machine constants */
eps = lapackf77_slamch("P");
smlnum = lapackf77_slamch("S");
bignum = 1. / smlnum;
lapackf77_slabad(&smlnum, &bignum);
smlnum = magma_ssqrt(smlnum) / eps;
bignum = 1. / smlnum;
/* Scale A if max element outside range [SMLNUM,BIGNUM] */
anrm = lapackf77_slange("M", &n, &n, &a[a_offset], &lda, dum);
scalea = 0;
if (anrm > 0. && anrm < smlnum) {
scalea = 1;
cscale = smlnum;
} else if (anrm > bignum) {
scalea = 1;
cscale = bignum;
}
if (scalea) {
lapackf77_slascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n,
&a[a_offset], &lda, &ierr);
}
/* Balance the matrix
(Workspace: need N) */
ibal = 1;
lapackf77_sgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &work[ibal], &ierr);
/* Reduce to upper Hessenberg form
(Workspace: need 3*N, prefer 2*N+N*NB) */
itau = ibal + n;
iwrk = itau + n;
i__1 = lwork - iwrk + 1;
//start = get_current_time();
#if defined(VERSION1)
/*
* Version 1 - LAPACK
*/
lapackf77_sgehrd(&n, &ilo, &ihi, &a[a_offset], &lda,
&work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION2)
/*
* Version 2 - LAPACK consistent HRD
*/
magma_sgehrd2(n, ilo, ihi, &a[a_offset], lda,
&work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
/*
* Version 3 - LAPACK consistent MAGMA HRD + matrices T stored,
*/
magma_sgehrd(n, ilo, ihi, &a[a_offset], lda,
&work[itau], &work[iwrk], i__1, dT, &ierr);
#endif
//end = get_current_time();
//printf(" Time for sgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.);
if (wantvl) {
/* Want left eigenvectors
Copy Householder vectors to VL */
side[0] = 'L';
&a[a_offset], &lda, &vl[vl_offset], &ldvl);
/*
* Generate orthogonal matrix in VL
* (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
*/
i__1 = lwork - iwrk + 1;
//start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
/*
* Version 1 & 2 - LAPACK
*/
lapackf77_sorghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl,
&work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
/*
* Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
*/
magma_sorghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau],
dT, nb, &ierr);
#endif
//end = get_current_time();
//printf(" Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);
/*
* Perform QR iteration, accumulating Schur vectors in VL
* (Workspace: need N+1, prefer N+HSWORK (see comments) )
*/
iwrk = itau;
i__1 = lwork - iwrk + 1;
lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
&vl[vl_offset], &ldvl, &work[iwrk], &i__1, info);
if (wantvr) {
/* Want left and right eigenvectors
Copy Schur vectors to VR */
side[0] = 'B';
lapackf77_slacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr);
}
} else if (wantvr) {
/* Want right eigenvectors
Copy Householder vectors to VR */
side[0] = 'R';
lapackf77_slacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr);
/*
* Generate orthogonal matrix in VR
* (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
*/
i__1 = lwork - iwrk + 1;
//start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
/*
* Version 1 & 2 - LAPACK
*/
lapackf77_sorghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr,
&work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
/*
* Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
*/
magma_sorghr(n, ilo, ihi, &vr[vr_offset], ldvr,
&work[itau], dT, nb, &ierr);
#endif
//end = get_current_time();
//printf(" Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);
/*
* Perform QR iteration, accumulating Schur vectors in VR
* (Workspace: need N+1, prefer N+HSWORK (see comments) )
*/
iwrk = itau;
i__1 = lwork - iwrk + 1;
lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
&vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
} else {
/*
* Compute eigenvalues only
* (Workspace: need N+1, prefer N+HSWORK (see comments) )
*/
iwrk = itau;
i__1 = lwork - iwrk + 1;
lapackf77_shseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
&vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
}
/* If INFO > 0 from ZHSEQR, then quit */
if (*info > 0) {
fprintf(stderr, "ZHSEQR returned with info = %d\n", *info);
goto L50;
}
if (wantvl || wantvr) {
/*
* Compute left and/or right eigenvectors
* (Workspace: need 4*N)
*/
lapackf77_strevc(side, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl,
&vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &ierr);
}
if (wantvl) {
/*
* Undo balancing of left eigenvectors
* (Workspace: need N)
*/
lapackf77_sgebak("B", "L", &n, &ilo, &ihi,
&work[ibal], &n, &vl[vl_offset], &ldvl, &ierr);
/* Normalize left eigenvectors and make largest component real */
for (i__ = 1; i__ <= n; ++i__) {
if ( WI[i__-1] == 0.) {
scl = cblas_snrm2(n, &vl[i__ * vl_dim1 + 1], 1);
scl = 1. / scl;
cblas_sscal(n, (scl), &vl[i__ * vl_dim1 + 1], 1);
} else if (WI[i__-1] > 0.) {
d__1 = cblas_snrm2(n, &vl[ i__ * vl_dim1 + 1], 1);
d__2 = cblas_snrm2(n, &vl[(i__ + 1) * vl_dim1 + 1], 1);
scl = lapackf77_slapy2(&d__1, &d__2);
scl = 1. / scl;
cblas_sscal(n, (scl), &vl[ i__ * vl_dim1 + 1], 1);
cblas_sscal(n, (scl), &vl[(i__ + 1) * vl_dim1 + 1], 1);
i__2 = n;
for (k = 1; k <= i__2; ++k) {
/* Computing 2nd power */
d__1 = vl[k + i__ * vl_dim1];
/* Computing 2nd power */
d__2 = vl[k + (i__ + 1) * vl_dim1];
work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
}
/* Comment:
Fortran BLAS does not have to add 1
C BLAS must add one to cblas_isamax */
k = cblas_isamax(n, &work[iwrk], 1)+1;
lapackf77_slartg(&vl[k + i__ * vl_dim1],
&vl[k + (i__ + 1) * vl_dim1], &cs, &sn, &r__);
cblas_srot(n, &vl[ i__ * vl_dim1 + 1], 1,
&vl[(i__ + 1) * vl_dim1 + 1], 1, cs, (sn));
vl[k + (i__ + 1) * vl_dim1] = 0.;
}
}
}
if (wantvr) {
/*
* Undo balancing of right eigenvectors
* (Workspace: need N)
*/
lapackf77_sgebak("B", "R", &n, &ilo, &ihi, &work[ibal], &n,
&vr[vr_offset], &ldvr, &ierr);
/* Normalize right eigenvectors and make largest component real */
for (i__ = 1; i__ <= n; ++i__) {
if (WI[i__-1] == 0.) {
scl = 1. / cblas_snrm2(n, &vr[i__ * vr_dim1 + 1], 1);
cblas_sscal(n, (scl), &vr[i__ * vr_dim1 + 1], 1);
} else if (WI[i__-1] > 0.) {
d__1 = cblas_snrm2(n, &vr[ i__ * vr_dim1 + 1], 1);
d__2 = cblas_snrm2(n, &vr[(i__ + 1) * vr_dim1 + 1], 1);
scl = lapackf77_slapy2(&d__1, &d__2);
scl = 1. / scl;
cblas_sscal(n, (scl), &vr[ i__ * vr_dim1 + 1], 1);
cblas_sscal(n, (scl), &vr[(i__ + 1) * vr_dim1 + 1], 1);
i__2 = n;
for (k = 1; k <= i__2; ++k) {
/* Computing 2nd power */
d__1 = vr[k + i__ * vr_dim1];
/* Computing 2nd power */
d__2 = vr[k + (i__ + 1) * vr_dim1];
work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
}
/* Comment:
Fortran BLAS does not have to add 1
C BLAS must add one to cblas_isamax */
k = cblas_isamax(n, &work[iwrk], 1)+1;
lapackf77_slartg(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1],
&cs, &sn, &r__);
cblas_srot(n, &vr[ i__ * vr_dim1 + 1], 1,
&vr[(i__ + 1) * vr_dim1 + 1], 1, cs, (sn));
vr[k + (i__ + 1) * vr_dim1] = 0.;
}
}
}
/* Undo scaling if necessary */
L50:
if (scalea) {
i__1 = n - *info;
/* Computing MAX */
i__3 = n - *info;
i__2 = max(i__3,1);
lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
WR + (*info), &i__2, &ierr);
i__1 = n - *info;
/* Computing MAX */
i__3 = n - *info;
i__2 = max(i__3,1);
lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
WI + (*info), &i__2, &ierr);
if (*info > 0) {
i__1 = ilo - 1;
lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
WR, &n, &ierr);
i__1 = ilo - 1;
lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
WI, &n, &ierr);
}
}
#if defined(VERSION3)
magma_free( dT );
#endif
return *info;
} /* magma_sgeev */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgehrd ( magma_int_t  n,
magma_int_t  ilo,
magma_int_t  ihi,
float *  A,
magma_int_t  lda,
float *  tau,
float *  work,
magma_int_t  lwork,
float *  d_T,
magma_int_t info 
)

Definition at line 14 of file sgehrd.cpp.

References __func__, lapackf77_sgehd2(), MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_free(), magma_get_sgehrd_nb(), MAGMA_S_ONE, MAGMA_S_SET2REAL, MAGMA_S_ZERO, magma_sgetmatrix(), magma_slahr2(), magma_slahru(), magma_smalloc(), magma_ssetmatrix(), MAGMA_SUCCESS, magma_xerbla(), max, min, gbstrct_blg::N, and szero_nbxnb_block().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEHRD reduces a REAL general matrix A to upper Hessenberg form H by
an orthogonal similarity transformation: Q' * A * Q = H . This version
stores the triangular matrices used in the factorization so that they can
be applied directly (i.e., without being recomputed) later. As a result,
the application of Q is much faster.
Arguments
=========
N (input) INTEGER
The order of the matrix A. N >= 0.
ILO (input) INTEGER
IHI (input) INTEGER
It is assumed that A is already upper triangular in rows
and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
set by a previous call to DGEBAL; otherwise they should be
set to 1 and N respectively. See Further Details.
1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the N-by-N general matrix to be reduced.
On exit, the upper triangle and the first subdiagonal of A
are overwritten with the upper Hessenberg matrix H, and the
elements below the first subdiagonal, with the array TAU,
represent the orthogonal matrix Q as a product of elementary
reflectors. See Further Details.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
TAU (output) REAL array, dimension (N-1)
The scalar factors of the elementary reflectors (see Further
Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
zero.
WORK (workspace/output) REAL array, dimension (LWORK)
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The length of the array WORK. LWORK >= max(1,N).
For optimum performance LWORK >= N*NB, where NB is the
optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
dT (output) REAL array on the GPU, dimension N*NB,
where NB is the optimal blocksize. It stores the NB*NB blocks
of the triangular T matrices, used the the reduction.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value.
Further Details
===============
The matrix Q is represented as a product of (ihi-ilo) elementary
reflectors
Q = H(ilo) H(ilo+1) . . . H(ihi-1).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
exit in A(i+2:ihi,i), and tau in TAU(i).
The contents of A are illustrated by the following example, with
n = 7, ilo = 2 and ihi = 6:
on entry, on exit,
( a a a a a a a ) ( a a h h h h a )
( a a a a a a ) ( a h h h h a )
( a a a a a a ) ( h h h h h h )
( a a a a a a ) ( v2 h h h h h )
( a a a a a a ) ( v2 v3 h h h h )
( a a a a a a ) ( v2 v3 v4 h h h )
( a ) ( a )
where a denotes an element of the original matrix A, h denotes a
modified element of the upper Hessenberg matrix H, and vi denotes an
element of the vector defining H(i).
This implementation follows the hybrid algorithm and notations described in
S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
form through hybrid GPU-based computing," University of Tennessee Computer
Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
May 24, 2009.
===================================================================== */
float c_one = MAGMA_S_ONE;
float c_zero = MAGMA_S_ZERO;
magma_int_t N = n, ldda = n;
magma_int_t nh, iws;
magma_int_t nbmin, iinfo;
magma_int_t ldwork;
magma_int_t lquery;
--tau;
*info = 0;
MAGMA_S_SET2REAL( work[0], (float) n * nb );
lquery = lwork == -1;
if (n < 0) {
*info = -1;
} else if (ilo < 1 || ilo > max(1,n)) {
*info = -2;
} else if (ihi < min(ilo,n) || ihi > n) {
*info = -3;
} else if (lda < max(1,n)) {
*info = -5;
} else if (lwork < max(1,n) && ! lquery) {
*info = -8;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Quick return if possible */
nh = ihi - ilo + 1;
if (nh <= 1) {
work[0] = c_one;
return *info;
}
float *da;
if (MAGMA_SUCCESS != magma_smalloc( &da, N*ldda + 2*N*nb + nb*nb )) {
return *info;
}
float *d_A = da;
float *d_work = da + (N+nb)*ldda;
float *t, *d_t;
t = (float*) malloc(nb*nb*sizeof(float));
if ( t == NULL ) {
magma_free( da );
return *info;
}
d_t = d_work + nb * ldda;
szero_nbxnb_block(nb, d_A+N*ldda, ldda);
/* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
for (i__ = 1; i__ < ilo; ++i__)
tau[i__] = c_zero;
for (i__ = max(1,ihi); i__ < n; ++i__)
tau[i__] = c_zero;
for(i__=0; i__< nb*nb; i__+=4)
t[i__] = t[i__+1] = t[i__+2] = t[i__+3] = c_zero;
nbmin = 2;
iws = 1;
if (nb > 1 && nb < nh) {
/* Determine when to cross over from blocked to unblocked code
(last block is always handled by unblocked code) */
if (nb < nh) {
/* Determine if workspace is large enough for blocked code */
iws = n * nb;
if (lwork < iws) {
/* Not enough workspace to use optimal NB: determine the
minimum value of NB, and reduce NB or force use of
unblocked code */
nbmin = nb;
if (lwork >= n * nbmin)
nb = lwork / n;
else
nb = 1;
}
}
}
ldwork = n;
if (nb < nbmin || nb >= nh) {
/* Use unblocked code below */
i__ = ilo;
} else {
/* Use blocked code */
/* Copy the matrix to the GPU */
magma_ssetmatrix( N, N-ilo+1, a+(ilo-1)*(lda), lda, d_A, ldda );
for (i__ = ilo; i__ < ihi - nb; i__ += nb) {
/* Computing MIN */
ib = min(nb, ihi - i__);
/* Reduce columns i:i+ib-1 to Hessenberg form, returning the
matrices V and T of the block reflector H = I - V*T*V'
which performs the reduction, and also the matrix Y = A*V*T */
/* Get the current panel (no need for the 1st iteration) */
magma_sgetmatrix( ihi-i__+1, ib,
d_A + (i__ - ilo)*ldda + i__ - 1, ldda,
a + (i__ - 1 )*lda + i__ - 1, lda );
magma_slahr2(ihi, i__, ib,
d_A + (i__ - ilo)*ldda,
d_A + N*ldda + 1,
a + (i__ - 1 )*(lda) , lda,
&tau[i__], t, nb, work, ldwork);
/* Copy T from the CPU to D_T on the GPU */
d_t = dT + (i__ - ilo)*nb;
magma_ssetmatrix( nb, nb, t, nb, d_t, nb );
magma_slahru(n, ihi, i__ - 1, ib,
a + (i__ - 1 )*(lda), lda,
d_A + (i__ - ilo)*ldda,
d_A + (i__ - ilo)*ldda + i__ - 1,
d_A + N*ldda, d_t, d_work);
}
}
/* Use unblocked code to reduce the rest of the matrix */
if (!(nb < nbmin || nb >= nh))
magma_sgetmatrix( n, n-i__+1,
d_A+ (i__-ilo)*ldda, ldda,
a + (i__-1)*(lda), lda );
lapackf77_sgehd2(&n, &i__, &ihi, a, &lda, &tau[1], work, &iinfo);
MAGMA_S_SET2REAL( work[0], (float) iws );
magma_free( da );
free(t);
return *info;
} /* magma_sgehrd */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgehrd2 ( magma_int_t  n,
magma_int_t  ilo,
magma_int_t  ihi,
float *  A,
magma_int_t  lda,
float *  tau,
float *  work,
magma_int_t lwork,
magma_int_t info 
)

Definition at line 14 of file sgehrd2.cpp.

References __func__, lapackf77_sgehd2(), MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_free(), magma_get_sgehrd_nb(), MAGMA_S_ONE, MAGMA_S_SET2REAL, MAGMA_S_ZERO, magma_sgetmatrix(), magma_slahr2(), magma_slahru(), magma_smalloc(), magma_ssetmatrix(), MAGMA_SUCCESS, magma_xerbla(), max, min, gbstrct_blg::N, and szero_nbxnb_block().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEHRD2 reduces a REAL general matrix A to upper Hessenberg form H by
an orthogonal similarity transformation: Q' * A * Q = H .
Arguments
=========
N (input) INTEGER
The order of the matrix A. N >= 0.
ILO (input) INTEGER
IHI (input) INTEGER
It is assumed that A is already upper triangular in rows
and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
set by a previous call to DGEBAL; otherwise they should be
set to 1 and N respectively. See Further Details.
1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the N-by-N general matrix to be reduced.
On exit, the upper triangle and the first subdiagonal of A
are overwritten with the upper Hessenberg matrix H, and the
elements below the first subdiagonal, with the array TAU,
represent the orthogonal matrix Q as a product of elementary
reflectors. See Further Details.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
TAU (output) REAL array, dimension (N-1)
The scalar factors of the elementary reflectors (see Further
Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
zero.
WORK (workspace/output) REAL array, dimension (LWORK)
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The length of the array WORK. LWORK >= max(1,N).
For optimum performance LWORK >= N*NB, where NB is the
optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value.
Further Details
===============
The matrix Q is represented as a product of (ihi-ilo) elementary
reflectors
Q = H(ilo) H(ilo+1) . . . H(ihi-1).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
exit in A(i+2:ihi,i), and tau in TAU(i).
The contents of A are illustrated by the following example, with
n = 7, ilo = 2 and ihi = 6:
on entry, on exit,
( a a a a a a a ) ( a a h h h h a )
( a a a a a a ) ( a h h h h a )
( a a a a a a ) ( h h h h h h )
( a a a a a a ) ( v2 h h h h h )
( a a a a a a ) ( v2 v3 h h h h )
( a a a a a a ) ( v2 v3 v4 h h h )
( a ) ( a )
where a denotes an element of the original matrix A, h denotes a
modified element of the upper Hessenberg matrix H, and vi denotes an
element of the vector defining H(i).
This implementation follows the hybrid algorithm and notations described in
S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
form through hybrid GPU-based computing," University of Tennessee Computer
Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
May 24, 2009.
===================================================================== */
float c_one = MAGMA_S_ONE;
float c_zero = MAGMA_S_ZERO;
magma_int_t N = n, ldda = n;
magma_int_t nh, iws;
magma_int_t nbmin, iinfo;
magma_int_t ldwork;
magma_int_t lquery;
--tau;
*info = 0;
MAGMA_S_SET2REAL( work[0], (float) n * nb );
lquery = *lwork == -1;
if (n < 0) {
*info = -1;
} else if (ilo < 1 || ilo > max(1,n)) {
*info = -2;
} else if (ihi < min(ilo,n) || ihi > n) {
*info = -3;
} else if (lda < max(1,n)) {
*info = -5;
} else if (*lwork < max(1,n) && ! lquery) {
*info = -8;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Quick return if possible */
nh = ihi - ilo + 1;
if (nh <= 1) {
work[0] = c_one;
return *info;
}
float *da;
if (MAGMA_SUCCESS != magma_smalloc( &da, N*ldda + 2*N*nb + nb*nb )) {
return *info;
}
float *d_A = da;
float *d_work = da + (N+nb)*ldda;
float *t, *d_t;
t = (float*) malloc( nb*nb * sizeof(float));
if ( t == NULL ) {
magma_free( da );
return *info;
}
d_t = d_work + nb * ldda;
szero_nbxnb_block(nb, d_A+N*ldda, ldda);
/* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
for (i__ = 1; i__ < ilo; ++i__)
tau[i__] = c_zero;
for (i__ = max(1,ihi); i__ < n; ++i__)
tau[i__] = c_zero;
for(i__=0; i__< nb*nb; i__+=4)
t[i__] = t[i__+1] = t[i__+2] = t[i__+3] = c_zero;
nbmin = 2;
iws = 1;
if (nb > 1 && nb < nh) {
/* Determine when to cross over from blocked to unblocked code
(last block is always handled by unblocked code) */
if (nb < nh) {
/* Determine if workspace is large enough for blocked code */
iws = n * nb;
if (*lwork < iws) {
/* Not enough workspace to use optimal NB: determine the
minimum value of NB, and reduce NB or force use of
unblocked code */
nbmin = nb;
if (*lwork >= n * nbmin)
nb = *lwork / n;
else
nb = 1;
}
}
}
ldwork = n;
if (nb < nbmin || nb >= nh) {
/* Use unblocked code below */
i__ = ilo;
} else {
/* Use blocked code */
/* Copy the matrix to the GPU */
magma_ssetmatrix( N, N-ilo+1, a+(ilo-1)*(lda), lda, d_A, ldda );
for (i__ = ilo; i__ < ihi - nb; i__ += nb) {
/* Computing MIN */
ib = min(nb, ihi - i__);
/* Reduce columns i:i+ib-1 to Hessenberg form, returning the
matrices V and T of the block reflector H = I - V*T*V'
which performs the reduction, and also the matrix Y = A*V*T */
/* Get the current panel (no need for the 1st iteration) */
magma_sgetmatrix( ihi-i__+1, ib,
d_A + (i__ - ilo)*ldda + i__ - 1, ldda,
a + (i__ - 1 )*lda + i__ - 1, lda );
magma_slahr2(ihi, i__, ib,
d_A + (i__ - ilo)*ldda,
d_A + N*ldda + 1,
a + (i__ - 1 )*(lda) , lda,
&tau[i__], t, nb, work, ldwork);
/* Copy T from the CPU to D_T on the GPU */
magma_ssetmatrix( nb, nb, t, nb, d_t, nb );
magma_slahru(n, ihi, i__ - 1, ib,
a + (i__ - 1 )*(lda), lda,
d_A + (i__ - ilo)*ldda,
d_A + (i__ - ilo)*ldda + i__ - 1,
d_A + N*ldda, d_t, d_work);
}
}
/* Use unblocked code to reduce the rest of the matrix */
if (!(nb < nbmin || nb >= nh))
magma_sgetmatrix( n, n-i__+1,
d_A+ (i__-ilo)*ldda, ldda,
a + (i__-1)*(lda), lda );
lapackf77_sgehd2(&n, &i__, &ihi, a, &lda, &tau[1], work, &iinfo);
MAGMA_S_SET2REAL( work[0], (float) iws );
magma_free( da );
free(t);
return *info;
} /* magma_sgehrd2 */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgelqf ( magma_int_t  m,
magma_int_t  n,
float *  A,
magma_int_t  lda,
float *  tau,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgelqf.cpp.

References __func__, dA, MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_sgelqf_nb(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_sgeqrf2_gpu(), magma_sgetmatrix(), magma_smalloc(), magma_ssetmatrix(), MAGMA_SUCCESS, magma_xerbla(), magmablas_sinplace_transpose(), magmablas_stranspose2(), max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGELQF computes an LQ factorization of a REAL M-by-N matrix A:
A = L * Q.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and below the diagonal of the array
contain the m-by-min(m,n) lower trapezoidal matrix L (L is
lower triangular if m <= n); the elements above the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of elementary reflectors (see Further Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= max(1,M).
For optimum performance LWORK >= M*NB, where NB is the
optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
if INFO = -10 internal GPU memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(k) . . . H(2) H(1), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
float *dA, *dAT;
float c_one = MAGMA_S_ONE;
magma_int_t maxm, maxn, maxdim, nb;
magma_int_t iinfo, ldda;
long int lquery;
/* Function Body */
*info = 0;
work[0] = MAGMA_S_MAKE( (float)(m*nb), 0 );
lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,m) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (min(m, n) == 0) {
work[0] = c_one;
return *info;
}
maxm = ((m + 31)/32)*32;
maxn = ((n + 31)/32)*32;
maxdim = max(maxm, maxn);
if (maxdim*maxdim < 2*maxm*maxn)
{
ldda = maxdim;
if (MAGMA_SUCCESS != magma_smalloc( &dA, maxdim*maxdim )) {
return *info;
}
magma_ssetmatrix( m, n, a, lda, dA, ldda );
dAT = dA;
magmablas_sinplace_transpose( dAT, ldda, ldda );
}
else
{
ldda = maxn;
if (MAGMA_SUCCESS != magma_smalloc( &dA, 2*maxn*maxm )) {
return *info;
}
magma_ssetmatrix( m, n, a, lda, dA, maxm );
dAT = dA + maxn * maxm;
magmablas_stranspose2( dAT, ldda, dA, maxm, m, n );
}
magma_sgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo);
if (maxdim*maxdim< 2*maxm*maxn){
magmablas_sinplace_transpose( dAT, ldda, ldda );
magma_sgetmatrix( m, n, dA, ldda, a, lda );
} else {
magmablas_stranspose2( dA, maxm, dAT, ldda, n, m );
magma_sgetmatrix( m, n, dA, maxm, a, lda );
}
magma_free( dA );
return *info;
} /* magma_sgelqf */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgelqf_gpu ( magma_int_t  m,
magma_int_t  n,
float *  dA,
magma_int_t  ldda,
float *  tau,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgelqf_gpu.cpp.

References __func__, dA, MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_sgelqf_nb(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_sgeqrf2_gpu(), magma_smalloc(), MAGMA_SUCCESS, magma_xerbla(), magmablas_sinplace_transpose(), magmablas_stranspose2(), max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGELQF computes an LQ factorization of a REAL M-by-N matrix dA:
dA = L * Q.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
dA (input/output) REAL array on the GPU, dimension (LDA,N)
On entry, the M-by-N matrix dA.
On exit, the elements on and below the diagonal of the array
contain the m-by-min(m,n) lower trapezoidal matrix L (L is
lower triangular if m <= n); the elements above the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of elementary reflectors (see Further Details).
LDA (input) INTEGER
The leading dimension of the array dA. LDA >= max(1,M).
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= max(1,M).
For optimum performance LWORK >= M*NB, where NB is the
optimal blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
if INFO = -10 internal GPU memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(k) . . . H(2) H(1), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( dA+(a_2)*(lda) + (a_1))
float *dAT;
float c_one = MAGMA_S_ONE;
magma_int_t maxm, maxn, maxdim, nb;
magma_int_t iinfo;
long int lquery;
*info = 0;
work[0] = MAGMA_S_MAKE( (float)(m*nb), 0 );
lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,m) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (min(m, n) == 0) {
work[0] = c_one;
return *info;
}
maxm = ((m + 31)/32)*32;
maxn = ((n + 31)/32)*32;
maxdim = max(maxm, maxn);
int ldat = maxn;
dAT = dA;
if ((m == n) && (m % 32 == 0) && (lda%32 == 0)){
ldat = lda;
magmablas_sinplace_transpose( dAT, lda, maxm );
}
else {
if (MAGMA_SUCCESS != magma_smalloc( &dAT, maxm*maxn ) ){
return *info;
}
magmablas_stranspose2( dAT, ldat, dA, lda, m, n );
}
magma_sgeqrf2_gpu(n, m, dAT, ldat, tau, &iinfo);
if ((m == n) && (m % 32 == 0) && (lda%32 == 0)){
magmablas_sinplace_transpose( dAT, ldat, maxm );
}
else {
magmablas_stranspose2( dA, lda, dAT, ldat, n, m );
magma_free( dAT );
}
return *info;
} /* magma_sgelqf_gpu */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgels3_gpu ( char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  nrhs,
float *  dA,
magma_int_t  ldda,
float *  dB,
magma_int_t  lddb,
float *  hwork,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgels3_gpu.cpp.

References __func__, dT, MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_free(), magma_get_sgeqrf_nb(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_sgeqrf3_gpu(), magma_sgeqrs3_gpu(), magma_smalloc(), MAGMA_SUCCESS, magma_xerbla(), max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves the overdetermined, least squares problem
min || A*X - C ||
using the QR factorization A.
The underdetermined problem (m < n) is not currently handled.
Arguments
=========
TRANS (input) CHARACTER*1
= 'N': the linear system involves A.
Only trans='N' is currently handled.
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of columns of the matrix C. NRHS >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, A is overwritten by details of its QR
factorization as returned by SGEQRF3.
LDDA (input) INTEGER
The leading dimension of the array A, LDDA >= M.
DB (input/output) REAL array on the GPU, dimension (LDDB,NRHS)
On entry, the M-by-NRHS matrix C.
On exit, the N-by-NRHS solution matrix X.
LDDB (input) INTEGER
The leading dimension of the array DB. LDDB >= M.
HWORK (workspace/output) REAL array, dimension MAX(1,LWORK).
On exit, if INFO = 0, HWORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array HWORK, LWORK >= max(1,NRHS).
For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where
NB is the blocksize given by magma_get_sgeqrf_nb( M ).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the HWORK array, returns
this value as the first entry of the HWORK array.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
float *dT, *tau;
magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
long int lquery = (lwork == -1);
hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. );
*info = 0;
/* For now, N is the only case working */
if ( (trans != 'N') && (trans != 'n' ) )
*info = -1;
else if (m < 0)
*info = -2;
else if (n < 0 || m < n) /* LQ is not handle for now*/
*info = -3;
else if (nrhs < 0)
*info = -4;
else if (ldda < max(1,m))
*info = -6;
else if (lddb < max(1,m))
*info = -8;
else if (lwork < lwkopt && ! lquery)
*info = -10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
return *info;
}
/*
* Allocate temporary buffers
*/
int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb;
if (nb < nrhs)
ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs;
if (MAGMA_SUCCESS != magma_smalloc( &dT, ldtwork )) {
return *info;
}
tau = (float*) malloc( k * sizeof(float) );
if( tau == NULL ) {
magma_free( dT );
return *info;
}
magma_sgeqrf3_gpu( m, n, dA, ldda, tau, dT, info );
if ( *info == 0 ) {
magma_sgeqrs3_gpu( m, n, nrhs,
dA, ldda, tau, dT,
dB, lddb, hwork, lwork, info );
}
magma_free( dT );
free(tau);
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgels_gpu ( char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  nrhs,
float *  dA,
magma_int_t  ldda,
float *  dB,
magma_int_t  lddb,
float *  hwork,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgels_gpu.cpp.

References __func__, dT, MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_free(), magma_get_sgeqrf_nb(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_sgeqrf_gpu(), magma_sgeqrs_gpu(), magma_smalloc(), MAGMA_SUCCESS, magma_xerbla(), max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves the overdetermined, least squares problem
min || A*X - C ||
using the QR factorization A.
The underdetermined problem (m < n) is not currently handled.
Arguments
=========
TRANS (input) CHARACTER*1
= 'N': the linear system involves A.
Only trans='N' is currently handled.
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of columns of the matrix C. NRHS >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, A is overwritten by details of its QR
factorization as returned by SGEQRF.
LDDA (input) INTEGER
The leading dimension of the array A, LDDA >= M.
DB (input/output) REAL array on the GPU, dimension (LDDB,NRHS)
On entry, the M-by-NRHS matrix C.
On exit, the N-by-NRHS solution matrix X.
LDDB (input) INTEGER
The leading dimension of the array DB. LDDB >= M.
HWORK (workspace/output) REAL array, dimension MAX(1,LWORK).
On exit, if INFO = 0, HWORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array HWORK, LWORK >= max(1,NRHS).
For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where
NB is the blocksize given by magma_get_sgeqrf_nb( M ).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the HWORK array, returns
this value as the first entry of the HWORK array.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
float *dT, *tau;
magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
long int lquery = (lwork == -1);
hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. );
*info = 0;
/* For now, N is the only case working */
if ( (trans != 'N') && (trans != 'n' ) )
*info = -1;
else if (m < 0)
*info = -2;
else if (n < 0 || m < n) /* LQ is not handle for now*/
*info = -3;
else if (nrhs < 0)
*info = -4;
else if (ldda < max(1,m))
*info = -6;
else if (lddb < max(1,m))
*info = -8;
else if (lwork < lwkopt && ! lquery)
*info = -10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
return *info;
}
/*
* Allocate temporary buffers
*/
int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb;
if (nb < nrhs)
ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs;
if (MAGMA_SUCCESS != magma_smalloc( &dT, ldtwork )) {
return *info;
}
tau = (float*) malloc( k * sizeof(float) );
if( tau == NULL ) {
magma_free( dT );
return *info;
}
magma_sgeqrf_gpu( m, n, dA, ldda, tau, dT, info );
if ( *info == 0 ) {
magma_sgeqrs_gpu( m, n, nrhs,
dA, ldda, tau, dT,
dB, lddb, hwork, lwork, info );
}
magma_free( dT );
free(tau);
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqlf ( magma_int_t  m,
magma_int_t  n,
float *  A,
magma_int_t  lda,
float *  tau,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgeqlf.cpp.

References __func__, a_ref, da_ref, dwork, lapackf77_sgeqlf(), lapackf77_slarft(), MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_sgeqlf_nb(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_sgetmatrix(), magma_sgetmatrix_async(), magma_slarfb_gpu(), magma_smalloc(), magma_ssetmatrix(), magma_ssetmatrix_async(), MAGMA_SUCCESS, magma_xerbla(), MagmaBackward, MagmaBackwardStr, MagmaColumnwise, MagmaColumnwiseStr, MagmaLeft, MagmaLower, MagmaTrans, max, min, spanel_to_q(), and sq_to_panel().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQLF computes a QL factorization of a REAL M-by-N matrix A:
A = Q * L.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, if m >= n, the lower triangle of the subarray
A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L;
if m <= n, the elements on and below the (n-m)-th
superdiagonal contain the M-by-N lower trapezoidal matrix L;
the remaining elements, with the array TAU, represent the
orthogonal matrix Q as a product of elementary reflectors
(see Further Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= max(1,N).
For optimum performance LWORK >= N*NB, where NB can be obtained
through magma_get_sgeqlf_nb(M).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(k) . . . H(2) H(1), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in
A(1:m-k+i-1,n-k+i), and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
#define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))
float *da, *dwork;
float c_one = MAGMA_S_ONE;
magma_int_t i, k, lddwork, old_i, old_ib, nb;
magma_int_t rows, cols;
magma_int_t ib, ki, kk, mu, nu, iinfo, ldda;
long int lquery;
*info = 0;
lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
}
if (*info == 0) {
k = min(m,n);
if (k == 0)
work[0] = c_one;
else {
work[0] = MAGMA_S_MAKE( n*nb, 0 );
}
if (lwork < max(1,n) && ! lquery)
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Quick return if possible */
if (k == 0)
return *info;
lddwork = ((n+31)/32)*32;
ldda = ((m+31)/32)*32;
if (MAGMA_SUCCESS != magma_smalloc( &da, (n)*ldda + nb*lddwork )) {
return *info;
}
dwork = da + ldda*(n);
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially.
The last kk columns are handled by the block method.
First, copy the matrix on the GPU except the last kk columns */
magma_ssetmatrix_async( (m), (n-nb),
a_ref(0, 0), lda,
da_ref(0, 0), ldda, stream[0] );
ki = ((k - nb - 1) / nb) * nb;
kk = min(k, ki + nb);
for (i = k - kk + ki; i >= k -kk; i -= nb) {
ib = min(k-i,nb);
if (i< k - kk + ki){
/* 1. Copy asynchronously the current panel to the CPU.
2. Copy asynchronously the submatrix below the panel
to the CPU) */
rows = m - k + i + ib;
da_ref(0, n-k+i), ldda,
a_ref(0, n-k+i), lda, stream[1] );
magma_sgetmatrix_async( (m-rows), ib,
da_ref(rows, n-k+i), ldda,
a_ref(rows, n-k+i), lda, stream[0] );
/* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in
two steps - implementing the lookahead techniques.
This is the main update from the lookahead techniques. */
rows = m - k + old_i + old_ib;
cols = n - k + old_i - old_ib;
rows, cols, old_ib,
da_ref(0, cols+old_ib), ldda, dwork, lddwork,
da_ref(0, 0 ), ldda, dwork+old_ib, lddwork);
}
magma_queue_sync( stream[1] );
/* Compute the QL factorization of the current block
A(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1) */
rows = m - k + i + ib;
cols = n - k + i;
lapackf77_sgeqlf(&rows,&ib, a_ref(0,cols), &lda, tau+i, work, &lwork, &iinfo);
if (cols > 0) {
/* Form the triangular factor of the block reflector
H = H(i+ib-1) . . . H(i+1) H(i) */
&rows, &ib,
a_ref(0, cols), &lda, tau + i, work, &ib);
spanel_to_q( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib);
magma_ssetmatrix( rows, ib,
a_ref(0,cols), lda,
da_ref(0,cols), ldda );
sq_to_panel( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib);
// Send the triangular part on the GPU
magma_ssetmatrix( ib, ib, work, ib, dwork, lddwork );
/* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in
two steps - implementing the lookahead techniques.
This is the update of first ib columns. */
if (i-ib >= k -kk)
rows, ib, ib,
da_ref(0, cols), ldda, dwork, lddwork,
da_ref(0,cols-ib), ldda, dwork+ib, lddwork);
else{
rows, cols, ib,
da_ref(0, cols), ldda, dwork, lddwork,
da_ref(0, 0 ), ldda, dwork+ib, lddwork);
}
old_i = i;
old_ib = ib;
}
}
mu = m - k + i + nb;
nu = n - k + i + nb;
magma_sgetmatrix( m, nu, da_ref(0,0), ldda, a_ref(0,0), lda );
} else {
mu = m;
nu = n;
}
/* Use unblocked code to factor the last or only block */
if (mu > 0 && nu > 0)
lapackf77_sgeqlf(&mu, &nu, a_ref(0,0), &lda, tau, work, &lwork, &iinfo);
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free( da );
return *info;
} /* magma_sgeqlf */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqp3 ( magma_int_t m,
magma_int_t n,
float *  a,
magma_int_t lda,
magma_int_t jpvt,
float *  tau,
float *  work,
magma_int_t lwork,
magma_int_t info 
)
magma_int_t magma_sgeqrf ( magma_int_t  m,
magma_int_t  n,
float *  A,
magma_int_t  lda,
float *  tau,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgeqrf.cpp.

References __func__, a_ref, da_ref, dwork, lapackf77_sgeqrf(), lapackf77_slarft(), magma_free(), magma_get_sgeqrf_nb(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_sgeqrf4(), magma_sgeqrf_ooc(), magma_sgetmatrix(), magma_sgetmatrix_async(), magma_slarfb_gpu(), magma_smalloc(), magma_ssetmatrix(), magma_ssetmatrix_async(), MAGMA_SUCCESS, magma_xerbla(), MagmaColumnwise, MagmaColumnwiseStr, MagmaForward, MagmaForwardStr, MagmaLeft, MagmaTrans, MagmaUpper, max, min, spanel_to_q(), and sq_to_panel().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQRF computes a QR factorization of a REAL M-by-N matrix A:
A = Q * R. This version does not require work space on the GPU
passed as input. GPU memory is allocated in the routine.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= N*NB,
where NB can be obtained through magma_get_sgeqrf_nb(M).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
#define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))
float *da, *dwork;
float c_one = MAGMA_S_ONE;
int i, k, lddwork, old_i, old_ib;
int ib, ldda;
/* Function Body */
*info = 0;
int nb = magma_get_sgeqrf_nb(min(m, n));
int lwkopt = n * nb;
work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 );
long int lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,n) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
work[0] = c_one;
return *info;
}
lddwork = ((n+31)/32)*32;
ldda = ((m+31)/32)*32;
magma_int_t num_gpus = 1;
char * num_gpus_char = getenv("MAGMA_NUM_GPUS");
if (num_gpus_char != NULL ) num_gpus = atoi(num_gpus_char);
if( num_gpus > 1 ) {
/* call multiple-GPU interface */
return magma_sgeqrf4(num_gpus, m, n, a, lda, tau, work, lwork, info);
}
if (MAGMA_SUCCESS != magma_smalloc( &da, (n)*ldda + nb*lddwork )) {
/* Switch to the "out-of-core" (out of GPU-memory) version */
return magma_sgeqrf_ooc(m, n, a, lda, tau, work, lwork, info);
}
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
dwork = da + ldda*(n);
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially */
magma_ssetmatrix_async( (m), (n-nb),
a_ref(0,nb), lda,
da_ref(0,nb), ldda, stream[0] );
old_i = 0; old_ib = nb;
for (i = 0; i < k-nb; i += nb) {
ib = min(k-i, nb);
if (i>0){
da_ref(i,i), ldda,
a_ref(i,i), lda, stream[1] );
da_ref(0,i), ldda,
a_ref(0,i), lda, stream[0] );
/* Apply H' to A(i:m,i+2*ib:n) from the left */
m-old_i, n-old_i-2*old_ib, old_ib,
da_ref(old_i, old_i), ldda, dwork, lddwork,
da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);
}
magma_queue_sync( stream[1] );
int rows = m-i;
lapackf77_sgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib, a_ref(i,i), &lda, tau+i, work, &ib);
spanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib);
magma_ssetmatrix( rows, ib, a_ref(i,i), lda, da_ref(i,i), ldda );
sq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib);
if (i + ib < n) {
magma_ssetmatrix( ib, ib, work, ib, dwork, lddwork );
if (i+ib < k-nb)
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
da_ref(i, i ), ldda, dwork, lddwork,
da_ref(i, i+ib), ldda, dwork+ib, lddwork);
else
rows, n-i-ib, ib,
da_ref(i, i ), ldda, dwork, lddwork,
da_ref(i, i+ib), ldda, dwork+ib, lddwork);
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
if (i!=0)
magma_sgetmatrix( m, ib, da_ref(0,i), ldda, a_ref(0,i), lda );
int rows = m-i;
lapackf77_sgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);
}
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free( da );
return *info;
} /* magma_sgeqrf */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqrf2_gpu ( magma_int_t  m,
magma_int_t  n,
float *  dA,
magma_int_t  ldda,
float *  tau,
magma_int_t info 
)

Definition at line 14 of file sgeqrf2_gpu.cpp.

References __func__, dwork, hwork, lapackf77_sgeqrf(), lapackf77_slarft(), MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_free(), magma_free_host(), magma_get_sgeqrf_nb(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_sgetmatrix(), magma_sgetmatrix_async(), magma_slarfb_gpu(), magma_smalloc(), magma_smalloc_host(), magma_ssetmatrix(), magma_ssetmatrix_async(), MAGMA_SUCCESS, magma_xerbla(), MagmaColumnwise, MagmaColumnwiseStr, MagmaForward, MagmaForwardStr, MagmaLeft, MagmaTrans, MagmaUpper, max, min, spanel_to_q(), sq_to_panel(), codegen::work, and work_ref.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQRF computes a QR factorization of a real M-by-N matrix A:
A = Q * R.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
dA (input/output) REAL array on the GPU, dimension (LDDA,N)
On entry, the M-by-N matrix dA.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDDA (input) INTEGER
The leading dimension of the array dA. LDDA >= max(1,M).
To benefit from coalescent memory accesses LDDA must be
dividable by 16.
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1))
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
float *dwork;
float *work;
int i, k, ldwork, lddwork, old_i, old_ib, rows;
int nbmin, nx, ib, nb;
int lhwork, lwork;
/* Function Body */
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = min(m,n);
if (k == 0)
return *info;
lwork = (m+n) * nb;
lhwork = lwork - (m)*nb;
if (MAGMA_SUCCESS != magma_smalloc( &dwork, (n)*nb )) {
return *info;
}
if (MAGMA_SUCCESS != magma_smalloc_host( &work, lwork )) {
magma_free( dwork );
return *info;
}
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
nbmin = 2;
nx = nb;
ldwork = m;
lddwork= n;
if (nb >= nbmin && nb < k && nx < k) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nx; i += nb) {
ib = min(k-i, nb);
rows = m -i;
dA(i,i), ldda,
work_ref(i), ldwork, stream[1] );
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left */
m-old_i, n-old_i-2*old_ib, old_ib,
dA(old_i, old_i ), ldda, dwork, lddwork,
dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);
magma_ssetmatrix_async( old_ib, old_ib,
work_ref(old_i), ldwork,
dA(old_i, old_i), ldda, stream[0] );
}
magma_queue_sync( stream[1] );
lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib,
work_ref(i), &ldwork, tau+i, hwork, &ib);
spanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
magma_ssetmatrix( rows, ib, work_ref(i), ldwork, dA(i,i), ldda );
sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
if (i + ib < n) {
magma_ssetmatrix( ib, ib, hwork, ib, dwork, lddwork );
if (i+nb < k-nx)
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
dA(i, i ), ldda, dwork, lddwork,
dA(i, i+ib), ldda, dwork+ib, lddwork);
else {
rows, n-i-ib, ib,
dA(i, i ), ldda, dwork, lddwork,
dA(i, i+ib), ldda, dwork+ib, lddwork);
work_ref(i), ldwork,
dA(i,i), ldda );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
magma_free( dwork );
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
magma_sgetmatrix( rows, ib, dA(i, i), ldda, work, rows );
lhwork = lwork - rows*ib;
lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
magma_ssetmatrix( rows, ib, work, rows, dA(i, i), ldda );
}
magma_free_host( work );
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
return *info;
} /* magma_sgeqrf2_gpu */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqrf2_mgpu ( magma_int_t  num_gpus,
magma_int_t  m,
magma_int_t  n,
float **  dlA,
magma_int_t  ldda,
float *  tau,
magma_int_t info 
)

Definition at line 125 of file sgeqrf_mgpu-trace.cpp.

References __func__, core_cpu_event_end, core_cpu_event_start, core_gpu_event_end, core_gpu_event_start, core_log_event, dump_trace(), dwork, get_current_cpu_time(), hwrk_ref, lapackf77_sgeqrf(), lapackf77_slarft(), lhwrk, magma_device_sync(), MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_event_create(), magma_event_record(), magma_free(), magma_get_sgeqrf_nb(), magma_getdevice(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_setdevice(), magma_sgetmatrix(), magma_sgetmatrix_async(), magma_slarfb_gpu(), magma_smalloc(), magma_smalloc_host(), magma_ssetmatrix(), magma_ssetmatrix_async(), MAGMA_SUCCESS, magma_xerbla(), magmablas_sgetmatrix_1D_bcyclic(), magmablas_ssetmatrix_1D_bcyclic(), MagmaColumnwise, MagmaColumnwiseStr, MagmaForward, MagmaForwardStr, MagmaLeft, MagmaTrans, MagmaUpper, max, min, spanel_to_q(), and sq_to_panel().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A:
A = Q * R. This is a GPU interface of the routine.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
dA (input/output) REAL array on the GPU, dimension (LDDA,N)
On entry, the M-by-N matrix dA.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDDA (input) INTEGER
The leading dimension of the array dA. LDDA >= max(1,M).
To benefit from coalescent memory accesses LDDA must be
dividable by 16.
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define dlA(gpu,a_1,a_2) ( dlA[gpu]+(a_2)*(ldda) + (a_1))
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
#define hwrk_ref(a_1) ( local_work + (a_1))
#define lhwrk ( local_work + (nb)*(m))
float *dwork[4], *panel[4], *local_work;
magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows;
magma_int_t nbmin, nx, ib, nb;
magma_int_t lhwork, lwork;
magma_int_t cdevice;
magma_getdevice(&cdevice);
float ctime, dtime;
int panel_gpunum=-1, i_local, n_local[4], la_gpu, displacement;
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = min(m,n);
if (k == 0)
return *info;
displacement = n * nb;
lwork = max((m+n+64) * nb,n*m+n*nb);
lhwork = lwork - (m)*nb;
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
if (MAGMA_SUCCESS != magma_smalloc( &(dwork[i]), (n + ldda)*nb )) {
return *info;
}
}
/* Set the number of local n for each GPU */
for(i=0; i<num_gpus; i++){
n_local[i] = ((n/nb)/num_gpus)*nb;
if (i < (n/nb)%num_gpus)
n_local[i] += nb;
else if (i == (n/nb)%num_gpus)
n_local[i] += n%nb;
}
if (MAGMA_SUCCESS != magma_smalloc_host( &local_work, lwork )) {
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
magma_free( dwork[i] );
}
return *info;
}
static cudaStream_t streaml[4][2];
cudaEvent_t start[4], stop[4][10];
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
magma_queue_create( &streaml[i][0] );
magma_queue_create( &streaml[i][1] );
magma_event_create( &start[i] );
for(j=0; j<10; j++)
magma_event_create( &stop[i][j] );
magma_event_record( start[i], 0 );
}
core_cpu_event_end(num_gpus);
core_log_event(0x666666, num_gpus);
for(j=0; j<num_gpus; j++){
magma_event_record(stop[j][0], 0);
magma_event_record(stop[j][1], 0);
core_gpu_event_start(j, start[j], stop[j][0]);
core_gpu_event_end(j, start[j], stop[j][1]);
core_log_event(0x666666, j);
}
nbmin = 2;
nx = nb;
ldwork = m;
lddwork= n;
// magmablasSetKernelStream(streaml[0][0]);
if (nb >= nbmin && nb < k && nx < k) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nx; i += nb)
{
/* Set the GPU number that holds the current panel */
panel_gpunum = (i/nb)%num_gpus;
/* Set the local index where the current panel is */
i_local = i/(nb*num_gpus)*nb;
ib = min(k-i, nb);
rows = m -i;
/* Send current panel to the CPU */
#ifdef MultiGPUs
magma_setdevice(panel_gpunum);
#endif
dlA(panel_gpunum, i, i_local), ldda,
hwrk_ref(i), ldwork, streaml[panel_gpunum][1] );
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead
application to the trailing matrix */
la_gpu = panel_gpunum;
/* only the GPU that has next panel is done look-ahead */
#ifdef MultiGPUs
magma_setdevice(la_gpu);
#endif
magma_event_record(stop[la_gpu][0], 0);
m-old_i, n_local[la_gpu]-i_local-old_ib, old_ib,
panel[la_gpu], ldda, dwork[la_gpu], lddwork,
dlA(la_gpu, old_i, i_local+old_ib), ldda,
dwork[la_gpu]+old_ib, lddwork);
magma_event_record(stop[la_gpu][1], 0);
la_gpu = ((i-nb)/nb)%num_gpus;
#ifdef MultiGPUs
magma_setdevice(la_gpu);
#endif
magma_ssetmatrix_async( old_ib, old_ib,
hwrk_ref(old_i), ldwork,
panel[la_gpu], ldda, streaml[la_gpu][0] );
}
#ifdef MultiGPUs
magma_setdevice(panel_gpunum);
#endif
magma_queue_sync( streaml[panel_gpunum][1] );
lapackf77_sgeqrf(&rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &lhwork, info);
// Form the triangular factor of the block reflector
// H = H(i) H(i+1) . . . H(i+ib-1)
&rows, &ib,
hwrk_ref(i), &ldwork, tau+i, lhwrk, &ib);
spanel_to_q( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib );
core_cpu_event_end(num_gpus);
core_log_event(0x006680, num_gpus);
// Send the current panel back to the GPUs
// Has to be done with asynchronous copies
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
if (j == panel_gpunum)
panel[j] = dlA(j, i, i_local);
else
panel[j] = dwork[j]+displacement;
hwrk_ref(i), ldwork,
panel[j], ldda, streaml[j][0] );
}
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
magma_queue_sync( streaml[j][0] );
}
core_cpu_event_end(num_gpus);
core_log_event(0xDD0000, num_gpus);
//=================== take the values of all counters
/* Restore the panel */
sq_to_panel( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib );
core_cpu_event_end(num_gpus);
core_log_event(0x006680, num_gpus);
if (i>0){
for(j=0; j<num_gpus; j++){
core_gpu_event_start(j, start[j], stop[j][2]);
core_gpu_event_end(j, start[j], stop[j][3]);
core_log_event(0x880000, j);
core_gpu_event_start(j, start[j], stop[j][3]);
core_gpu_event_end(j, start[j], stop[j][0]);
core_log_event(0xDD0000, j);
}
}
magma_setdevice(panel_gpunum);
core_gpu_event_start(panel_gpunum, start[panel_gpunum], stop[panel_gpunum][0]);
core_gpu_event_end(panel_gpunum, start[panel_gpunum], stop[panel_gpunum][1]);
core_log_event(0x660000, panel_gpunum);
if (i + ib < n)
{
/* Send the T matrix to the GPU.
Has to be done with asynchronous copies */
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
lhwrk, ib,
dwork[j], lddwork, streaml[j][0] );
}
if (i+nb < k-nx)
{
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left;
This is update for the next panel; part of the look-ahead */
la_gpu = (panel_gpunum+1)%num_gpus;
int i_loc = (i+nb)/(nb*num_gpus)*nb;
for(j=0; j<num_gpus; j++){
#ifdef MultiGPUs
#endif
//magma_queue_sync( streaml[j][0] );
magma_event_record(stop[j][2], 0);
if (j==la_gpu)
rows, ib, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_loc), ldda, dwork[j]+ib, lddwork);
else if (j<=panel_gpunum)
rows, n_local[j]-i_local-ib, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_local+ib), ldda, dwork[j]+ib, lddwork);
else
rows, n_local[j]-i_local, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_local), ldda, dwork[j]+ib, lddwork);
magma_event_record(stop[j][3], 0);
}
}
else {
/* do the entire update as we exit and there would be no lookahead */
la_gpu = (panel_gpunum+1)%num_gpus;
int i_loc = (i+nb)/(nb*num_gpus)*nb;
#ifdef MultiGPUs
magma_setdevice(la_gpu);
#endif
rows, n_local[la_gpu]-i_loc, ib,
panel[la_gpu], ldda, dwork[la_gpu], lddwork,
dlA(la_gpu, i, i_loc), ldda, dwork[la_gpu]+ib, lddwork);
#ifdef MultiGPUs
magma_setdevice(panel_gpunum);
#endif
hwrk_ref(i), ldwork,
dlA(panel_gpunum, i, i_local), ldda );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
for(j=0; j<num_gpus; j++){
#ifdef MultiGPUs
#endif
magma_free( dwork[j] );
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
lhwork = lwork - rows*ib;
panel_gpunum = (panel_gpunum+1)%num_gpus;
int i_loc = (i)/(nb*num_gpus)*nb;
#ifdef MultiGPUs
magma_setdevice(panel_gpunum);
#endif
if (i == 0) {
magmablas_sgetmatrix_1D_bcyclic(m, n, dlA, ldda, lhwrk, m, num_gpus, nb);
} else {
magma_sgetmatrix( rows, ib,
dlA(panel_gpunum, i, i_loc), ldda,
lhwrk, rows );
}
lhwork = lwork - rows*ib;
lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info);
if (i == 0) {
magmablas_ssetmatrix_1D_bcyclic(m, n, lhwrk, m, dlA, ldda, num_gpus, nb);
} else {
magma_ssetmatrix( rows, ib,
lhwrk, rows,
dlA(panel_gpunum, i, i_loc), ldda );
}
}
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
magma_queue_destroy( streaml[i][0] );
magma_queue_destroy( streaml[i][1] );
}
magma_setdevice(cdevice);
dump_trace(num_gpus+1);
return *info;
} /* magma_sgeqrf2_mgpu */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqrf3_gpu ( magma_int_t  m,
magma_int_t  n,
float *  dA,
magma_int_t  ldda,
float *  tau,
float *  dT,
magma_int_t info 
)

Definition at line 38 of file sgeqrf3_gpu.cpp.

References __func__, a_ref, d_ref, dd_ref, hwork, lapackf77_sgeqrf(), lapackf77_slarft(), MAGMA_ERR_HOST_ALLOC, magma_free_host(), magma_get_sgeqrf_nb(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_sgetmatrix(), magma_sgetmatrix_async(), magma_slarfb_gpu(), magma_smalloc_host(), magma_ssetmatrix(), magma_ssetmatrix_async(), MAGMA_SUCCESS, magma_xerbla(), MagmaColumnwise, MagmaColumnwiseStr, MagmaForward, MagmaForwardStr, MagmaLeft, MagmaTrans, max, min, ssplit_diag_block3(), t_ref, codegen::work, and work_ref.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQRF3 computes a QR factorization of a REAL M-by-N matrix A:
A = Q * R. This version stores the triangular matrices T used in
the block QR factorization so that Q can be applied directly (i.e.,
without being recomputed) later. As a result, the application
of Q is much faster. The upper triangular matrices for V have 0s
in them and the corresponding parts of the upper triangular R are
stored separately.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array on the GPU, dimension (LDDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDDA (input) INTEGER
The leading dimension of the array A. LDDA >= max(1,M).
To benefit from coalescent memory accesses LDDA must be
dividable by 16.
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
dT (workspace/output) REAL array on the GPU,
dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
where NB can be obtained through magma_get_sgeqrf_nb(M).
It starts with MIN(M,N)*NB block that store the triangular T
matrices, followed by the MIN(M,N)*NB block of the diagonal
matrices for the R matrix. The rest of the array is used as workspace.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
#define t_ref(a_1) (dT+(a_1)*nb)
#define d_ref(a_1) (dT+(minmn+(a_1))*nb)
#define dd_ref(a_1) (dT+(2*minmn+(a_1))*nb)
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
magma_int_t ib, nb;
magma_int_t ldwork, lddwork, lwork, lhwork;
float *work, *ut;
/* check arguments */
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = minmn = min(m,n);
if (k == 0)
return *info;
lwork = (m + n + nb)*nb;
lhwork = lwork - m*nb;
if (MAGMA_SUCCESS != magma_smalloc_host( &work, lwork )) {
return *info;
}
ut = hwork+nb*(n);
memset( ut, 0, nb*nb*sizeof(float));
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
ldwork = m;
lddwork= n;
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nb; i += nb) {
ib = min(k-i, nb);
rows = m -i;
a_ref(i,i), ldda,
work_ref(i), ldwork, stream[1] );
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left */
cols = n-old_i-2*old_ib;
m-old_i, cols, old_ib,
a_ref(old_i, old_i ), ldda, t_ref(old_i), nb,
a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork);
/* store the diagonal */
magma_ssetmatrix_async( old_ib, old_ib,
ut, old_ib,
d_ref(old_i), old_ib, stream[0] );
}
magma_queue_sync( stream[1] );
lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib,
work_ref(i), &ldwork, tau+i, hwork, &ib);
/* Put 0s in the upper triangular part of a panel (and 1s on the
diagonal); copy the upper triangular in ut. */
magma_queue_sync( stream[0] );
ssplit_diag_block3(ib, work_ref(i), ldwork, ut);
magma_ssetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda );
if (i + ib < n) {
/* Send the triangular factor T to the GPU */
magma_ssetmatrix( ib, ib, hwork, ib, t_ref(i), nb );
if (i+nb < k-nb){
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
a_ref(i, i ), ldda, t_ref(i), nb,
a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
}
else {
cols = n-i-ib;
rows, cols, ib,
a_ref(i, i ), ldda, t_ref(i), nb,
a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
/* Fix the diagonal block */
magma_ssetmatrix( ib, ib, ut, ib, d_ref(i), ib );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
magma_sgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows );
lhwork = lwork - rows*ib;
lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
magma_ssetmatrix( rows, ib, work, rows, a_ref(i, i), ldda );
}
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free_host( work );
return *info;
/* End of MAGMA_SGEQRF */
} /* magma_sgeqrf */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqrf4 ( magma_int_t  num_gpus,
magma_int_t  m,
magma_int_t  n,
float *  a,
magma_int_t  lda,
float *  tau,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgeqrf-v4.cpp.

References __func__, lapackf77_sgeqrf(), MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_sgeqrf_nb(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_setdevice(), magma_sgeqrf2_mgpu(), magma_smalloc(), MAGMA_SUCCESS, magma_xerbla(), magmablas_sgetmatrix_1D_bcyclic(), magmablas_ssetmatrix_1D_bcyclic(), max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQRF4 computes a QR factorization of a REAL M-by-N matrix A:
A = Q * R using multiple GPUs. This version does not require work space on the GPU
passed as input. GPU memory is allocated in the routine.
Arguments
=========
NUM_GPUS
(input) INTEGER
The number of GPUs to be used for the factorization.
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= N*NB,
where NB can be obtained through magma_get_sgeqrf_nb(M).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
float *da[4];
float c_one = MAGMA_S_ONE;
int i, k, ldda;
*info = 0;
int nb = magma_get_sgeqrf_nb(min(m, n));
int lwkopt = n * nb;
work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 );
long int lquery = (lwork == -1);
if (num_gpus <0 || num_gpus > 4) {
*info = -1;
} else if (m < 0) {
*info = -2;
} else if (n < 0) {
*info = -3;
} else if (lda < max(1,m)) {
*info = -5;
} else if (lwork < max(1,n) && ! lquery) {
*info = -8;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
work[0] = c_one;
return *info;
}
ldda = ((m+31)/32)*32;
magma_int_t n_local[4];
for(i=0; i<num_gpus; i++){
n_local[i] = ((n/nb)/num_gpus)*nb;
if (i < (n/nb)%num_gpus)
n_local[i] += nb;
else if (i == (n/nb)%num_gpus)
n_local[i] += n%nb;
// TODO on failure, free previously allocated memory
if (MAGMA_SUCCESS != magma_smalloc( &da[i], ldda*n_local[i] )) {
return *info;
}
}
if (m > nb && n > nb) {
/* Copy the matrix to the GPUs in 1D block cyclic distribution */
magmablas_ssetmatrix_1D_bcyclic(m, n, a, lda, da, ldda, num_gpus, nb);
/* Factor using the GPU interface */
magma_sgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info);
/* Copy the matrix back from the GPUs to the CPU */
magmablas_sgetmatrix_1D_bcyclic(m, n, da, ldda, a, lda, num_gpus, nb);
} else {
lapackf77_sgeqrf(&m, &n, a, &lda, tau, work, &lwork, info);
}
/* Free the allocated GPU memory */
for(i=0; i<num_gpus; i++){
magma_free( da[i] );
}
return *info;
} /* magma_sgeqrf4 */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqrf_gpu ( magma_int_t  m,
magma_int_t  n,
float *  dA,
magma_int_t  ldda,
float *  tau,
float *  dT,
magma_int_t info 
)

Definition at line 41 of file sgeqrf_gpu.cpp.

References __func__, a_ref, d_ref, dd_ref, hwork, lapackf77_sgeqrf(), lapackf77_slarft(), MAGMA_ERR_HOST_ALLOC, magma_free_host(), magma_get_sgeqrf_nb(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_sgetmatrix(), magma_sgetmatrix_async(), magma_slarfb_gpu(), magma_smalloc_host(), magma_ssetmatrix(), magma_ssetmatrix_async(), MAGMA_SUCCESS, magma_xerbla(), MagmaColumnwise, MagmaColumnwiseStr, MagmaForward, MagmaForwardStr, MagmaLeft, MagmaTrans, max, min, ssplit_diag_block(), t_ref, codegen::work, and work_ref.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQRF computes a QR factorization of a REAL M-by-N matrix A:
A = Q * R. This version stores the triangular matrices used in
the factorization so that they can be applied directly (i.e.,
without being recomputed) later. As a result, the application
of Q is much faster.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array on the GPU, dimension (LDDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDDA (input) INTEGER
The leading dimension of the array A. LDDA >= max(1,M).
To benefit from coalescent memory accesses LDDA must be
dividable by 16.
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
dT (workspace/output) REAL array on the GPU,
dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
where NB can be obtained through magma_get_sgeqrf_nb(M).
It starts with MIN(M,N)*NB block that store the triangular T
matrices, followed by the MIN(M,N)*NB block of the diagonal
inverses for the R matrix. The rest of the array is used as workspace.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
#define t_ref(a_1) (dT+(a_1)*nb)
#define d_ref(a_1) (dT+(minmn+(a_1))*nb)
#define dd_ref(a_1) (dT+(2*minmn+(a_1))*nb)
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
magma_int_t ib, nb;
magma_int_t ldwork, lddwork, lwork, lhwork;
float *work, *ut;
/* check arguments */
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = minmn = min(m,n);
if (k == 0)
return *info;
lwork = (m + n + nb)*nb;
lhwork = lwork - m*nb;
if (MAGMA_SUCCESS != magma_smalloc_host( &work, lwork )) {
return *info;
}
ut = hwork+nb*(n);
memset( ut, 0, nb*nb*sizeof(float));
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
ldwork = m;
lddwork= n;
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nb; i += nb) {
ib = min(k-i, nb);
rows = m -i;
a_ref(i,i), ldda,
work_ref(i), ldwork, stream[1] );
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left */
cols = n-old_i-2*old_ib;
m-old_i, cols, old_ib,
a_ref(old_i, old_i ), ldda, t_ref(old_i), nb,
a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork);
/* store the diagonal */
magma_ssetmatrix_async( old_ib, old_ib,
ut, old_ib,
d_ref(old_i), old_ib, stream[0] );
}
magma_queue_sync( stream[1] );
lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
&rows, &ib,
work_ref(i), &ldwork, tau+i, hwork, &ib);
/* Put 0s in the upper triangular part of a panel (and 1s on the
diagonal); copy the upper triangular in ut and invert it */
magma_queue_sync( stream[0] );
ssplit_diag_block(ib, work_ref(i), ldwork, ut);
magma_ssetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda );
if (i + ib < n) {
/* Send the triangular factor T to the GPU */
magma_ssetmatrix( ib, ib, hwork, ib, t_ref(i), nb );
if (i+nb < k-nb){
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
rows, ib, ib,
a_ref(i, i ), ldda, t_ref(i), nb,
a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
}
else {
cols = n-i-ib;
rows, cols, ib,
a_ref(i, i ), ldda, t_ref(i), nb,
a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
/* Fix the diagonal block */
magma_ssetmatrix( ib, ib, ut, ib, d_ref(i), ib );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
magma_sgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows );
lhwork = lwork - rows*ib;
lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
magma_ssetmatrix( rows, ib, work, rows, a_ref(i, i), ldda );
}
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free_host( work );
return *info;
/* End of MAGMA_SGEQRF */
} /* magma_sgeqrf */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqrf_ooc ( magma_int_t  m,
magma_int_t  n,
float *  A,
magma_int_t  lda,
float *  tau,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgeqrf_ooc.cpp.

References __func__, a_ref, da_ref, dwork, lapackf77_slarft(), MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_sgeqrf_nb(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_sgeqrf(), magma_sgeqrf2_gpu(), magma_sgetmatrix_async(), magma_slarfb_gpu(), magma_smalloc(), magma_ssetmatrix_async(), MAGMA_SUCCESS, magma_xerbla(), MagmaColumnwise, MagmaColumnwiseStr, MagmaForward, MagmaForwardStr, MagmaLeft, MagmaTrans, MagmaUpper, max, min, spanel_to_q(), and sq_to_panel().

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGEQRF_OOC computes a QR factorization of a REAL M-by-N matrix A:
A = Q * R. This version does not require work space on the GPU
passed as input. GPU memory is allocated in the routine.
This is an out-of-core (ooc) version that is similar to magma_sgeqrf but
the difference is that this version can use a GPU even if the matrix
does not fit into the GPU memory at once.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
TAU (output) REAL array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
WORK (workspace/output) REAL array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
Higher performance is achieved if WORK is in pinned memory, e.g.
allocated using magma_malloc_host.
LWORK (input) INTEGER
The dimension of the array WORK. LWORK >= N*NB,
where NB can be obtained through magma_get_sgeqrf_nb(M).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
#define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))
float *da, *dwork;
float c_one = MAGMA_S_ONE;
int k, lddwork, ldda;
*info = 0;
int nb = magma_get_sgeqrf_nb(min(m, n));
int lwkopt = n * nb;
work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 );
long int lquery = (lwork == -1);
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,m)) {
*info = -4;
} else if (lwork < max(1,n) && ! lquery) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
/* Check how much memory do we have */
#if CUDA_VERSION > 3010
size_t totalMem;
#else
unsigned int totalMem;
#endif
CUdevice dev;
cuDeviceGet( &dev, 0);
cuDeviceTotalMem( &totalMem, dev );
totalMem /= sizeof(float);
magma_int_t IB, NB = (magma_int_t)(0.8*totalMem/m);
NB = (NB / nb) * nb;
if (NB >= n)
return magma_sgeqrf(m, n, a, lda, tau, work, lwork, info);
k = min(m,n);
if (k == 0) {
work[0] = c_one;
return *info;
}
lddwork = ((NB+31)/32)*32+nb;
ldda = ((m+31)/32)*32;
if (MAGMA_SUCCESS != magma_smalloc( &da, (NB + nb)*ldda + nb*lddwork )) {
return *info;
}
static cudaStream_t stream[2];
magma_queue_create( &stream[0] );
magma_queue_create( &stream[1] );
// magmablasSetKernelStream(stream[1]);
float *ptr = da + ldda * NB;
dwork = da + ldda*(NB + nb);
/* start the main loop over the blocks that fit in the GPU memory */
for(int i=0; i<n; i+=NB)
{
IB = min(n-i, NB);
//printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB);
/* 1. Copy the next part of the matrix to the GPU */
a_ref(0,i), lda,
da_ref(0,0), ldda, stream[0] );
magma_queue_sync( stream[0] );
/* 2. Update it with the previous transformations */
for(int j=0; j<min(i,k); j+=nb)
{
magma_int_t ib = min(k-j, nb);
/* Get a panel in ptr. */
// 1. Form the triangular factor of the block reflector
// 2. Send it to the GPU.
// 3. Put 0s in the upper triangular part of V.
// 4. Send V to the GPU in ptr.
// 5. Update the matrix.
// 6. Restore the upper part of V.
int rows = m-j;
&rows, &ib, a_ref(j,j), &lda, tau+j, work, &ib);
work, ib,
dwork, lddwork, stream[1] );
spanel_to_q(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib);
a_ref(j,j), lda,
ptr, rows, stream[1] );
magma_queue_sync( stream[1] );
rows, IB, ib,
ptr, rows, dwork, lddwork,
da_ref(j, 0), ldda, dwork+ib, lddwork);
sq_to_panel(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib);
}
/* 3. Do a QR on the current part */
if (i<k)
magma_sgeqrf2_gpu(m-i, IB, da_ref(i,0), ldda, tau+i, info);
/* 4. Copy the current part back to the CPU */
da_ref(0,0), ldda,
a_ref(0,i), lda, stream[0] );
}
magma_queue_sync( stream[0] );
magma_queue_destroy( stream[0] );
magma_queue_destroy( stream[1] );
magma_free( da );
return *info;
} /* magma_sgeqrf_ooc */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqrs3_gpu ( magma_int_t  m,
magma_int_t  n,
magma_int_t  nrhs,
float *  dA,
magma_int_t  ldda,
float *  tau,
float *  dT,
float *  dB,
magma_int_t  lddb,
float *  hwork,
magma_int_t  lhwork,
magma_int_t info 
)

Definition at line 14 of file sgeqrs3_gpu.cpp.

References __func__, a_ref, d_ref, magma_get_sgeqrf_nb(), MAGMA_S_MAKE, MAGMA_S_ONE, magma_sormqr_gpu(), magma_strsm(), magma_strsv(), magma_xerbla(), magmablas_sswapdblk(), MagmaLeft, MagmaNonUnit, MagmaNoTrans, MagmaTrans, MagmaUpper, max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves the least squares problem
min || A*X - C ||
using the QR factorization A = Q*R computed by SGEQRF3_GPU.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of columns of the matrix C. NRHS >= 0.
A (input) REAL array on the GPU, dimension (LDDA,N)
The i-th column must contain the vector which defines the
elementary reflector H(i), for i = 1,2,...,n, as returned by
SGEQRF3_GPU in the first n columns of its array argument A.
LDDA (input) INTEGER
The leading dimension of the array A, LDDA >= M.
TAU (input) REAL array, dimension (N)
TAU(i) must contain the scalar factor of the elementary
reflector H(i), as returned by MAGMA_SGEQRF_GPU.
DB (input/output) REAL array on the GPU, dimension (LDDB,NRHS)
On entry, the M-by-NRHS matrix C.
On exit, the N-by-NRHS solution matrix X.
DT (input) REAL array that is the output (the 6th argument)
of magma_sgeqrf_gpu of size
2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS).
The array starts with a block of size MIN(M,N)*NB that stores
the triangular T matrices used in the QR factorization,
followed by MIN(M,N)*NB block storing the diagonal block
matrices for the R matrix, followed by work space of size
((N+31)/32*32 )* MAX(NB, NRHS).
LDDB (input) INTEGER
The leading dimension of the array DB. LDDB >= M.
HWORK (workspace/output) REAL array, dimension (LWORK)
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK, LWORK >= max(1,NRHS).
For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where
NB is the blocksize given by magma_get_sgeqrf_nb( M ).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the HWORK array, returns
this value as the first entry of the WORK array.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
#define d_ref(a_1) (dT+(lddwork+(a_1))*nb)
float c_one = MAGMA_S_ONE;
magma_int_t k, lddwork;
magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
long int lquery = (lwork == -1);
hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. );
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0 || m < n)
*info = -2;
else if (nrhs < 0)
*info = -3;
else if (ldda < max(1,m))
*info = -5;
else if (lddb < max(1,m))
*info = -8;
else if (lwork < lwkopt && ! lquery)
*info = -10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
hwork[0] = c_one;
return *info;
}
lddwork= k;
/* B := Q' * B */
m, nrhs, n,
a_ref(0,0), ldda, tau,
dB, lddb, hwork, lwork, dT, nb, info );
if ( *info != 0 ) {
return *info;
}
/* Solve R*X = B(1:n,:)
1. Move the block diagonal submatrices from d_ref to R
2. Solve
3. Restore the data format moving data from R back to d_ref
*/
magmablas_sswapdblk(k, nb, a_ref(0,0), ldda, 1, d_ref(0), nb, 0);
if ( nrhs == 1 ) {
n, a_ref(0,0), ldda, dB, 1);
} else {
n, nrhs, c_one, a_ref(0,0), ldda, dB, lddb);
}
magmablas_sswapdblk(k, nb, d_ref(0), nb, 0, a_ref(0,0), ldda, 1);
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgeqrs_gpu ( magma_int_t  m,
magma_int_t  n,
magma_int_t  nrhs,
float *  dA,
magma_int_t  ldda,
float *  tau,
float *  dT,
float *  dB,
magma_int_t  lddb,
float *  hwork,
magma_int_t  lhwork,
magma_int_t info 
)

Definition at line 14 of file sgeqrs_gpu.cpp.

References __func__, a_ref, blasf77_strsm(), blasf77_strsv(), d_ref, dwork, magma_get_sgeqrf_nb(), MAGMA_S_MAKE, MAGMA_S_NEG_ONE, MAGMA_S_ONE, MAGMA_S_ZERO, magma_scopymatrix(), magma_sgemm(), magma_sgemv(), magma_sormqr_gpu(), magma_ssetmatrix(), magma_xerbla(), MagmaLeft, MagmaLeftStr, MagmaNonUnitStr, MagmaNoTrans, MagmaNoTransStr, MagmaTrans, MagmaUpperStr, max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves the least squares problem
min || A*X - C ||
using the QR factorization A = Q*R computed by SGEQRF_GPU.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. M >= N >= 0.
NRHS (input) INTEGER
The number of columns of the matrix C. NRHS >= 0.
A (input) REAL array on the GPU, dimension (LDDA,N)
The i-th column must contain the vector which defines the
elementary reflector H(i), for i = 1,2,...,n, as returned by
SGEQRF_GPU in the first n columns of its array argument A.
LDDA (input) INTEGER
The leading dimension of the array A, LDDA >= M.
TAU (input) REAL array, dimension (N)
TAU(i) must contain the scalar factor of the elementary
reflector H(i), as returned by MAGMA_SGEQRF_GPU.
DB (input/output) REAL array on the GPU, dimension (LDDB,NRHS)
On entry, the M-by-NRHS matrix C.
On exit, the N-by-NRHS solution matrix X.
DT (input) REAL array that is the output (the 6th argument)
of magma_sgeqrf_gpu of size
2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS).
The array starts with a block of size MIN(M,N)*NB that stores
the triangular T matrices used in the QR factorization,
followed by MIN(M,N)*NB block storing the diagonal block
inverses for the R matrix, followed by work space of size
((N+31)/32*32 )* MAX(NB, NRHS).
LDDB (input) INTEGER
The leading dimension of the array DB. LDDB >= M.
HWORK (workspace/output) REAL array, dimension (LWORK)
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK, LWORK >= max(1,NRHS).
For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where
NB is the blocksize given by magma_get_sgeqrf_nb( M ).
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the HWORK array, returns
this value as the first entry of the WORK array.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
#define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
#define d_ref(a_1) (dT+(lddwork+(a_1))*nb)
float c_zero = MAGMA_S_ZERO;
float c_one = MAGMA_S_ONE;
float c_neg_one = MAGMA_S_NEG_ONE;
float *dwork;
magma_int_t i, k, lddwork, rows, ib;
magma_int_t ione = 1;
magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
long int lquery = (lwork == -1);
hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. );
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0 || m < n)
*info = -2;
else if (nrhs < 0)
*info = -3;
else if (ldda < max(1,m))
*info = -5;
else if (lddb < max(1,m))
*info = -8;
else if (lwork < lwkopt && ! lquery)
*info = -10;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery)
return *info;
k = min(m,n);
if (k == 0) {
hwork[0] = c_one;
return *info;
}
/* B := Q' * B */
m, nrhs, n,
a_ref(0,0), ldda, tau,
dB, lddb, hwork, lwork, dT, nb, info );
if ( *info != 0 ) {
return *info;
}
/* Solve R*X = B(1:n,:) */
lddwork= k;
if (nb < k)
dwork = dT+2*lddwork*nb;
// To do: Why did we have this line originally; seems to be a bug (Stan)?
// dwork = dT;
i = (k-1)/nb * nb;
ib = n-i;
rows = m-i;
if ( nrhs == 1 ) {
&ib, hwork, &rows,
hwork+rows*ib, &ione);
} else {
&ib, &nrhs,
&c_one, hwork, &rows,
hwork+rows*ib, &rows);
}
// update the solution vector
magma_ssetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork );
// update c
if (nrhs == 1)
c_neg_one, a_ref(0, i), ldda,
dwork + i, 1,
c_one, dB, 1);
else
i, nrhs, ib,
c_neg_one, a_ref(0, i), ldda,
dwork + i, lddwork,
c_one, dB, lddb);
int start = i-nb;
if (nb < k) {
for (i = start; i >=0; i -= nb) {
ib = min(k-i, nb);
rows = m -i;
if (i + ib < n) {
if (nrhs == 1)
{
c_one, d_ref(i), ib,
dB+i, 1,
c_zero, dwork+i, 1);
c_neg_one, a_ref(0, i), ldda,
dwork + i, 1,
c_one, dB, 1);
}
else
{
ib, nrhs, ib,
c_one, d_ref(i), ib,
dB+i, lddb,
c_zero, dwork+i, lddwork);
i, nrhs, ib,
c_neg_one, a_ref(0, i), ldda,
dwork + i, lddwork,
c_one, dB, lddb);
}
}
}
}
magma_scopymatrix( (n), nrhs,
dwork, lddwork,
dB, lddb );
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgessm_gpu ( char  storev,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magma_int_t  ib,
magma_int_t ipiv,
float *  dL1,
magma_int_t  lddl1,
float *  dL,
magma_int_t  lddl,
float *  dA,
magma_int_t  ldda,
magma_int_t info 
)

Definition at line 21 of file sgessm_gpu.cpp.

References __func__, AT, dA, L, MAGMA_S_NEG_ONE, MAGMA_S_ONE, magma_sgemm, magma_strmm(), magma_strsm(), magma_xerbla(), magmablas_sgetmo_in, magmablas_slaswp(), MagmaLower, MagmaNoTrans, MagmaRight, MagmaTrans, MagmaUnit, max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGETRF computes an LU factorization of a general M-by-N matrix A
using partial pivoting with row interchanges.
The factorization has the form
A = P * L * U
where P is a permutation matrix, L is lower triangular with unit
diagonal elements (lower trapezoidal if m > n), and U is upper
triangular (upper trapezoidal if m < n).
This is the right-looking Level 3 BLAS version of the algorithm.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array on the GPU, dimension (LDA,N).
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
> 0: if INFO = i, U(i,i) is exactly zero. The factorization
has been completed, but the factor U is exactly
singular, and division by zero will occur if it is used
to solve a system of equations.
===================================================================== */
#define AT(i,j) (dAT + (i)*ldda + (j) )
#define L(i,j) (dL + (i) + (j)*lddl )
#define dL1(j) (dL1 + (j)*lddl1)
float c_one = MAGMA_S_ONE;
float c_neg_one = MAGMA_S_NEG_ONE;
int i, s, sb;
float *dAT;
/* Check arguments */
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0)
*info = -2;
else if (ldda < max(1,m))
*info = -4;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0)
return *info;
if ( (storev == 'C') || (storev == 'c') ) {
magmablas_sgetmo_in( dA, dAT, ldda, m, n );
} else {
dAT = dA;
}
s = k / ib;
for(i = 0; i < k; i += ib) {
sb = min(ib, k-i);
magmablas_slaswp( n, dAT, ldda, i+1, i+sb, ipiv, 1 );
#ifndef WITHOUTTRTRI
n, sb,
c_one, dL1(i), lddl1,
AT(i, 0), ldda);
#else
n, sb,
c_one, L( i, i), lddl,
AT(i, 0), ldda);
#endif
if ( (i+sb) < m) {
n, m-(i+sb), sb,
c_neg_one, AT(i, 0), ldda,
L( i+sb, i), lddl,
c_one, AT(i+sb, 0), ldda );
}
}
if ( (storev == 'C') || (storev == 'c') ) {
magmablas_sgetmo_in( dA, dAT, ldda, m, n );
}
return *info;
/* End of MAGMA_SGETRF_GPU */
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgesv ( magma_int_t  n,
magma_int_t  nrhs,
float *  A,
magma_int_t  lda,
magma_int_t ipiv,
float *  B,
magma_int_t  ldb,
magma_int_t info 
)

Definition at line 14 of file sgesv.cpp.

References __func__, lapackf77_sgetrs(), magma_sgetrf(), magma_xerbla(), MagmaNoTransStr, and max.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves a system of linear equations
A * X = B
where A is a general N-by-N matrix and X and B are N-by-NRHS matrices.
The LU decomposition with partial pivoting and row interchanges is
used to factor A as
A = P * L * U,
where P is a permutation matrix, L is unit lower triangular, and U is
upper triangular. The factored form of A is then used to solve the
system of equations A * X = B.
Arguments
=========
N (input) INTEGER
The order of the matrix A. N >= 0.
NRHS (input) INTEGER
The number of right hand sides, i.e., the number of columns
of the matrix B. NRHS >= 0.
A (input/output) REAL array on the GPU, dimension (LDA,N).
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
B (input/output) REAL array on the GPU, dimension (LDB,NRHS)
On entry, the right hand side matrix B.
On exit, the solution matrix X.
LDB (input) INTEGER
The leading dimension of the array B. LDB >= max(1,N).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
*info = 0;
if (n < 0) {
*info = -1;
} else if (nrhs < 0) {
*info = -2;
} else if (lda < max(1,n)) {
*info = -4;
} else if (ldb < max(1,n)) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (n == 0 || nrhs == 0) {
return *info;
}
magma_sgetrf( n, n, A, lda, ipiv, info );
if ( *info == 0 ) {
lapackf77_sgetrs( MagmaNoTransStr, &n, &nrhs, A, &lda, ipiv, B, &ldb, info );
}
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgesv_gpu ( magma_int_t  n,
magma_int_t  nrhs,
float *  dA,
magma_int_t  ldda,
magma_int_t ipiv,
float *  dB,
magma_int_t  lddb,
magma_int_t info 
)

Definition at line 21 of file sgesv_gpu.cpp.

References __func__, magma_sgetrf_gpu(), magma_sgetrs_gpu(), magma_xerbla(), MagmaNoTrans, and max.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
Solves a system of linear equations
A * X = B
where A is a general N-by-N matrix and X and B are N-by-NRHS matrices.
The LU decomposition with partial pivoting and row interchanges is
used to factor A as
A = P * L * U,
where P is a permutation matrix, L is unit lower triangular, and U is
upper triangular. The factored form of A is then used to solve the
system of equations A * X = B.
Arguments
=========
N (input) INTEGER
The order of the matrix A. N >= 0.
NRHS (input) INTEGER
The number of right hand sides, i.e., the number of columns
of the matrix B. NRHS >= 0.
A (input/output) REAL array on the GPU, dimension (LDDA,N).
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
B (input/output) REAL array on the GPU, dimension (LDB,NRHS)
On entry, the right hand side matrix B.
On exit, the solution matrix X.
LDB (input) INTEGER
The leading dimension of the array B. LDB >= max(1,N).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
*info = 0;
if (n < 0) {
*info = -1;
} else if (nrhs < 0) {
*info = -2;
} else if (ldda < max(1,n)) {
*info = -4;
} else if (lddb < max(1,n)) {
*info = -7;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (n == 0 || nrhs == 0) {
return *info;
}
magma_sgetrf_gpu( n, n, dA, ldda, ipiv, info );
if ( *info == 0 ) {
magma_sgetrs_gpu( MagmaNoTrans, n, nrhs, dA, ldda, ipiv, dB, lddb, info );
}
return *info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgesvd ( char  jobu,
char  jobvt,
magma_int_t  m,
magma_int_t  n,
float *  a,
magma_int_t  lda,
float *  s,
float *  u,
magma_int_t  ldu,
float *  vt,
magma_int_t  ldvt,
float *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 14 of file sgesvd.cpp.

References __func__, blasf77_sgemm(), lapackf77_sbdsqr(), lapackf77_sgelqf(), lapackf77_sgeqrf(), lapackf77_sgesvd(), lapackf77_slacpy(), lapackf77_slamch, lapackf77_slange(), lapackf77_slascl(), lapackf77_slaset(), lapackf77_sorgbr(), lapackf77_sorglq(), lapackf77_sorgqr(), lapackf77_sormbr(), magma_get_sgebrd_nb(), magma_sgebrd(), magma_ssqrt, magma_xerbla(), max, min, and codegen::work.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGESVD computes the singular value decomposition (SVD) of a complex
M-by-N matrix A, optionally computing the left and/or right singular
vectors. The SVD is written
A = U * SIGMA * conjugate-transpose(V)
where SIGMA is an M-by-N matrix which is zero except for its
min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
V is an N-by-N unitary matrix. The diagonal elements of SIGMA
are the singular values of A; they are real and non-negative, and
are returned in descending order. The first min(m,n) columns of
U and V are the left and right singular vectors of A.
Note that the routine returns V**H, not V.
Arguments
=========
JOBU (input) CHARACTER*1
Specifies options for computing all or part of the matrix U:
= 'A': all M columns of U are returned in array U:
= 'S': the first min(m,n) columns of U (the left singular
vectors) are returned in the array U;
= 'O': the first min(m,n) columns of U (the left singular
vectors) are overwritten on the array A;
= 'N': no columns of U (no left singular vectors) are
computed.
JOBVT (input) CHARACTER*1
Specifies options for computing all or part of the matrix
V**H:
= 'A': all N rows of V**H are returned in the array VT;
= 'S': the first min(m,n) rows of V**H (the right singular
vectors) are returned in the array VT;
= 'O': the first min(m,n) rows of V**H (the right singular
vectors) are overwritten on the array A;
= 'N': no rows of V**H (no right singular vectors) are
computed.
JOBVT and JOBU cannot both be 'O'.
M (input) INTEGER
The number of rows of the input matrix A. M >= 0.
N (input) INTEGER
The number of columns of the input matrix A. N >= 0.
A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
On entry, the M-by-N matrix A.
On exit,
if JOBU = 'O', A is overwritten with the first min(m,n)
columns of U (the left singular vectors,
stored columnwise);
if JOBVT = 'O', A is overwritten with the first min(m,n)
rows of V**H (the right singular vectors,
stored rowwise);
if JOBU .ne. 'O' and JOBVT .ne. 'O', the contents of A
are destroyed.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
S (output) DOUBLE PRECISION array, dimension (min(M,N))
The singular values of A, sorted so that S(i) >= S(i+1).
U (output) DOUBLE PRECISION array, dimension (LDU,UCOL)
(LDU,M) if JOBU = 'A' or (LDU,min(M,N)) if JOBU = 'S'.
If JOBU = 'A', U contains the M-by-M unitary matrix U;
if JOBU = 'S', U contains the first min(m,n) columns of U
(the left singular vectors, stored columnwise);
if JOBU = 'N' or 'O', U is not referenced.
LDU (input) INTEGER
The leading dimension of the array U. LDU >= 1; if
JOBU = 'S' or 'A', LDU >= M.
VT (output) DOUBLE PRECISION array, dimension (LDVT,N)
If JOBVT = 'A', VT contains the N-by-N unitary matrix
V**H;
if JOBVT = 'S', VT contains the first min(m,n) rows of
V**H (the right singular vectors, stored rowwise);
if JOBVT = 'N' or 'O', VT is not referenced.
LDVT (input) INTEGER
The leading dimension of the array VT. LDVT >= 1; if
JOBVT = 'A', LDVT >= N; if JOBVT = 'S', LDVT >= min(M,N).
WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
if INFO > 0, WORK(2:MIN(M,N)) contains the unconverged
superdiagonal elements of an upper bidiagonal matrix B
whose diagonal is in S (not necessarily sorted). B
satisfies A = U * B * VT, so it has the same singular values
as A, and singular vectors related by U and VT.
LWORK (input) INTEGER
The dimension of the array WORK.
LWORK >= (M+N)*nb+N.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit.
< 0: if INFO = -i, the i-th argument had an illegal value.
> 0: if DBDSQR did not converge, INFO specifies how many
superdiagonals of an intermediate bidiagonal form B
did not converge to zero. See the description of RWORK
above for details.
===================================================================== */
/* Table of constant values */
char jobu_[2] = {jobu, 0};
char jobvt_[2] = {jobvt, 0};
magma_int_t *m = &m_;
magma_int_t *n = &n_;
magma_int_t *lda = &lda_;
magma_int_t *ldu = &ldu_;
magma_int_t *ldvt = &ldvt_;
magma_int_t *lwork = &lwork_;
static magma_int_t c__0 = 0;
static magma_int_t c__1 = 1;
static magma_int_t c_n1 = -1;
static float c_b421 = 0.;
static float c_b443 = 1.;
/* System generated locals */
magma_int_t a_dim1, a_offset, u_dim1, u_offset, vt_dim1, vt_offset,
i__2, i__3, i__4;
/* Local variables */
magma_int_t i__, ie, ir, iu, blk, ncu;
float dum[1], eps;
magma_int_t nru, iscl;
float anrm;
magma_int_t ierr, itau, ncvt, nrvt;
magma_int_t chunk, minmn, wrkbl, itaup, itauq, mnthr, iwork;
magma_int_t wntua, wntva, wntun, wntuo, wntvn, wntvo, wntus, wntvs;
magma_int_t bdspac;
float bignum;
magma_int_t ldwrkr, ldwrku, maxwrk, minwrk;
float smlnum;
magma_int_t lquery, wntuas, wntvas;
/* Function Body */
*info = 0;
mnthr = (magma_int_t)( (float)(min( m_, n_ )) * 1.6 );
bdspac = 5*n_;
minmn = min(*m,*n);
wntua = lsame_(jobu_, "A");
wntus = lsame_(jobu_, "S");
wntuas = wntua || wntus;
wntuo = lsame_(jobu_, "O");
wntun = lsame_(jobu_, "N");
wntva = lsame_(jobvt_, "A");
wntvs = lsame_(jobvt_, "S");
wntvas = wntva || wntvs;
wntvo = lsame_(jobvt_, "O");
wntvn = lsame_(jobvt_, "N");
lquery = *lwork == -1;
/* Test the input arguments */
if (! (wntua || wntus || wntuo || wntun)) {
*info = -1;
} else if (! (wntva || wntvs || wntvo || wntvn) || (wntvo && wntuo) ) {
*info = -2;
} else if (*m < 0) {
*info = -3;
} else if (*n < 0) {
*info = -4;
} else if (*lda < max(1,*m)) {
*info = -6;
} else if ((*ldu < 1) || (wntuas && (*ldu < *m)) ) {
*info = -9;
} else if ((*ldvt < 1) || (wntva && (*ldvt < *n)) || (wntvs && (*ldvt < minmn)) ) {
*info = -11;
}
/* Compute workspace */
lapackf77_sgesvd(jobu_, jobvt_, m, n, a, lda, s, u, ldu,
vt, ldvt, work, &c_n1, info );
maxwrk = (magma_int_t)work[0];
if (*info == 0) {
/* Return optimal workspace in WORK(1) */
minwrk = ((*m)+(*n))*nb+(*n);
work[0] = (float)minwrk;
if ( !lquery && (lwork_ < minwrk) ) {
*info = -13;
}
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (*m == 0 || *n == 0) {
return *info;
}
wrkbl = maxwrk; /* Not optimal */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--s;
u_dim1 = *ldu;
u_offset = 1 + u_dim1;
u -= u_offset;
vt_dim1 = *ldvt;
vt_offset = 1 + vt_dim1;
vt -= vt_offset;
--work;
/* Get machine constants */
eps = lapackf77_slamch("P");
smlnum = magma_ssqrt(lapackf77_slamch("S")) / eps;
bignum = 1. / smlnum;
/* Scale A if max element outside range [SMLNUM,BIGNUM] */
anrm = lapackf77_slange("M", m, n, &a[a_offset], lda, dum);
iscl = 0;
if (anrm > 0. && anrm < smlnum) {
iscl = 1;
lapackf77_slascl("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, &
ierr);
} else if (anrm > bignum) {
iscl = 1;
lapackf77_slascl("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, &
ierr);
}
if (*m >= *n) {
/* A has at least as many rows as columns. If A has sufficiently */
/* more rows than columns, first reduce using the QR */
/* decomposition (if sufficient workspace available) */
if (*m >= mnthr) {
if (wntun) {
/* Path 1 (M much larger than N, JOBU='N') */
/* No left singular vectors to be computed */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[iwork], &
i__2, &ierr);
/* Zero out below R */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &a[a_dim1 + 2],
lda);
ie = 1;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in A */
/* (Workspace: need 4*N, prefer 3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &a[a_offset], *lda, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
ncvt = 0;
if (wntvo || wntvas) {
/* If right singular vectors desired, generate P'. */
/* (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &a[a_offset], lda, &work[itaup], &
work[iwork], &i__2, &ierr);
ncvt = *n;
}
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing right */
/* singular vectors of A in A if desired */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, &ncvt, &c__0, &c__0, &s[1], &work[ie],
&a[a_offset], lda, dum, &c__1, dum, &c__1, &work[iwork],
info);
/* If right singular vectors desired in VT, copy them there */
if (wntvas) {
lapackf77_slacpy("F", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
}
} else if (wntuo && wntvn) {
/* Path 2 (M much larger than N, JOBU='O', JOBVT='N') */
/* N left singular vectors to be overwritten on A and */
/* no right singular vectors to be computed */
/* Computing MAX */
i__2 = *n << 2;
if (*lwork >= *n * *n + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
/* Computing MAX */
i__2 = wrkbl, i__3 = *lda * *n + *n;
if (*lwork >= max(i__2,i__3) + *lda * *n) {
/* WORK(IU) is LDA by N, WORK(IR) is LDA by N */
ldwrku = *lda;
ldwrkr = *lda;
} else /* if(complicated condition) */ {
/* Computing MAX */
i__2 = wrkbl, i__3 = *lda * *n + *n;
if (*lwork >= max(i__2,i__3) + *n * *n) {
/* WORK(IU) is LDA by N, WORK(IR) is N by N */
ldwrku = *lda;
ldwrkr = *n;
} else {
/* WORK(IU) is LDWRKU by N, WORK(IR) is N by N */
ldwrku = (*lwork - *n * *n - *n) / *n;
ldwrkr = *n;
}
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__2, &ierr);
/* Copy R to WORK(IR) and zero out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &work[ir], &ldwrkr);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &work[ir + 1]
, &ldwrkr);
/* Generate Q in A */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IR) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &work[ir], ldwrkr, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Generate left vectors bidiagonalizing R */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", n, n, n, &work[ir], &ldwrkr, &work[itauq], &
work[iwork], &i__2, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of R in WORK(IR) */
/* (Workspace: need N*N+BDSPAC) */
lapackf77_sbdsqr("U", n, &c__0, n, &c__0, &s[1], &work[ie], dum, &
c__1, &work[ir], &ldwrkr, dum, &c__1, &work[iwork]
, info);
iu = ie + *n;
/* Multiply Q in A by left singular vectors of R in */
/* WORK(IR), storing result in WORK(IU) and copying to A */
/* (Workspace: need N*N+2*N, prefer N*N+M*N+N) */
i__2 = *m;
i__3 = ldwrku;
for (i__ = 1; i__3 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
i__3) {
/* Computing MIN */
i__4 = *m - i__ + 1;
chunk = min(i__4,ldwrku);
blasf77_sgemm("N", "N", &chunk, n, n, &c_b443, &a[i__ +
a_dim1], lda, &work[ir], &ldwrkr, &c_b421, &
work[iu], &ldwrku);
lapackf77_slacpy("F", &chunk, n, &work[iu], &ldwrku, &a[i__ +
a_dim1], lda);
/* L10: */
}
} else {
/* Insufficient workspace for a fast algorithm */
ie = 1;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize A */
/* (Workspace: need 3*N+M, prefer 3*N+(M+N)*NB) */
i__3 = *lwork - iwork + 1;
magma_sgebrd(*m, *n, &a[a_offset], *lda, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__3, &ierr);
/* Generate left vectors bidiagonalizing A */
/* (Workspace: need 4*N, prefer 3*N+N*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, n, n, &a[a_offset], lda, &work[itauq], &
work[iwork], &i__3, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in A */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, &c__0, m, &c__0, &s[1], &work[ie], dum, &
c__1, &a[a_offset], lda, dum, &c__1, &work[iwork],
info);
}
} else if (wntuo && wntvas) {
/* Path 3 (M much larger than N, JOBU='O', JOBVT='S' or 'A') */
/* N left singular vectors to be overwritten on A and */
/* N right singular vectors to be computed in VT */
/* Computing MAX */
i__3 = *n << 2;
if (*lwork >= *n * *n + max(i__3,bdspac)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
/* Computing MAX */
i__3 = wrkbl, i__2 = *lda * *n + *n;
if (*lwork >= max(i__3,i__2) + *lda * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by N */
ldwrku = *lda;
ldwrkr = *lda;
} else /* if(complicated condition) */ {
/* Computing MAX */
i__3 = wrkbl, i__2 = *lda * *n + *n;
if (*lwork >= max(i__3,i__2) + *n * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is N by N */
ldwrku = *lda;
ldwrkr = *n;
} else {
/* WORK(IU) is LDWRKU by N and WORK(IR) is N by N */
ldwrku = (*lwork - *n * *n - *n) / *n;
ldwrkr = *n;
}
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__3, &ierr);
/* Copy R to VT, zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
if (*n > 1) {
i__3 = *n - 1;
i__2 = *n - 1;
lapackf77_slaset("L", &i__3, &i__2, &c_b421, &c_b421, &vt[
vt_dim1 + 2], ldvt);
}
/* Generate Q in A */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__3, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in VT, copying result to WORK(IR) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB) */
i__3 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &vt[vt_offset], *ldvt, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__3, &ierr);
lapackf77_slacpy("L", n, n, &vt[vt_offset], ldvt, &work[ir], &
ldwrkr);
/* Generate left vectors bidiagonalizing R in WORK(IR) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", n, n, n, &work[ir], &ldwrkr, &work[itauq], &
work[iwork], &i__3, &ierr);
/* Generate right vectors bidiagonalizing R in VT */
/* (Workspace: need N*N+4*N-1, prefer N*N+3*N+(N-1)*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup],
&work[iwork], &i__3, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of R in WORK(IR) and computing right */
/* singular vectors of R in VT */
/* (Workspace: need N*N+BDSPAC) */
lapackf77_sbdsqr("U", n, n, n, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &work[ir], &ldwrkr, dum, &c__1,
&work[iwork], info);
iu = ie + *n;
/* Multiply Q in A by left singular vectors of R in */
/* WORK(IR), storing result in WORK(IU) and copying to A */
/* (Workspace: need N*N+2*N, prefer N*N+M*N+N) */
i__3 = *m;
i__2 = ldwrku;
for (i__ = 1; i__2 < 0 ? i__ >= i__3 : i__ <= i__3; i__ +=
i__2) {
/* Computing MIN */
i__4 = *m - i__ + 1;
chunk = min(i__4,ldwrku);
blasf77_sgemm("N", "N", &chunk, n, n, &c_b443, &a[i__ +
a_dim1], lda, &work[ir], &ldwrkr, &c_b421, &
work[iu], &ldwrku);
lapackf77_slacpy("F", &chunk, n, &work[iu], &ldwrku, &a[i__ +
a_dim1], lda);
/* L20: */
}
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__2, &ierr);
/* Copy R to VT, zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
if (*n > 1) {
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &vt[
vt_dim1 + 2], ldvt);
}
/* Generate Q in A */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in VT */
/* (Workspace: need 4*N, prefer 3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &vt[vt_offset], *ldvt, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Multiply Q in A by left vectors bidiagonalizing R */
/* (Workspace: need 3*N+M, prefer 3*N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("Q", "R", "N", m, n, n, &vt[vt_offset], ldvt, &
work[itauq], &a[a_offset], lda, &work[iwork], &
i__2, &ierr);
/* Generate right vectors bidiagonalizing R in VT */
/* (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup],
&work[iwork], &i__2, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in A and computing right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, n, m, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, dum, &c__1, &
work[iwork], info);
}
} else if (wntus) {
if (wntvn) {
/* Path 4 (M much larger than N, JOBU='S', JOBVT='N') */
/* N left singular vectors to be computed in U and */
/* no right singular vectors to be computed */
/* Computing MAX */
i__2 = *n << 2;
if (*lwork >= *n * *n + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
if (*lwork >= wrkbl + *lda * *n) {
/* WORK(IR) is LDA by N */
ldwrkr = *lda;
} else {
/* WORK(IR) is N by N */
ldwrkr = *n;
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy R to WORK(IR), zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &work[ir], &
ldwrkr);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &work[ir
+ 1], &ldwrkr);
/* Generate Q in A */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IR) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &work[ir], ldwrkr, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Generate left vectors bidiagonalizing R in WORK(IR) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", n, n, n, &work[ir], &ldwrkr, &work[itauq]
, &work[iwork], &i__2, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of R in WORK(IR) */
/* (Workspace: need N*N+BDSPAC) */
lapackf77_sbdsqr("U", n, &c__0, n, &c__0, &s[1], &work[ie],
dum, &c__1, &work[ir], &ldwrkr, dum, &c__1, &
work[iwork], info);
/* Multiply Q in A by left singular vectors of R in */
/* WORK(IR), storing result in U */
/* (Workspace: need N*N) */
blasf77_sgemm("N", "N", m, n, n, &c_b443, &a[a_offset], lda,
&work[ir], &ldwrkr, &c_b421, &u[u_offset],
ldu);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Zero out below R in A */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &a[
a_dim1 + 2], lda);
/* Bidiagonalize R in A */
/* (Workspace: need 4*N, prefer 3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &a[a_offset], *lda, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Multiply Q in U by left vectors bidiagonalizing R */
/* (Workspace: need 3*N+M, prefer 3*N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("Q", "R", "N", m, n, n, &a[a_offset], lda, &
work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr)
;
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, &c__0, m, &c__0, &s[1], &work[ie],
dum, &c__1, &u[u_offset], ldu, dum, &c__1, &
work[iwork], info);
}
} else if (wntvo) {
/* Path 5 (M much larger than N, JOBU='S', JOBVT='O') */
/* N left singular vectors to be computed in U and */
/* N right singular vectors to be overwritten on A */
/* Computing MAX */
i__2 = *n << 2;
if (*lwork >= (*n << 1) * *n + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + (*lda << 1) * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by N */
ldwrku = *lda;
ir = iu + ldwrku * *n;
ldwrkr = *lda;
} else if (*lwork >= wrkbl + (*lda + *n) * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is N by N */
ldwrku = *lda;
ir = iu + ldwrku * *n;
ldwrkr = *n;
} else {
/* WORK(IU) is N by N and WORK(IR) is N by N */
ldwrku = *n;
ir = iu + ldwrku * *n;
ldwrkr = *n;
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R */
/* (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy R to WORK(IU), zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &work[iu
+ 1], &ldwrku);
/* Generate Q in A */
/* (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IU), copying result to */
/* WORK(IR) */
/* (Workspace: need 2*N*N+4*N, */
/* prefer 2*N*N+3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &work[iu], ldwrku, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
lapackf77_slacpy("U", n, n, &work[iu], &ldwrku, &work[ir], &
ldwrkr);
/* Generate left bidiagonalizing vectors in WORK(IU) */
/* (Workspace: need 2*N*N+4*N, prefer 2*N*N+3*N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", n, n, n, &work[iu], &ldwrku, &work[itauq]
, &work[iwork], &i__2, &ierr);
/* Generate right bidiagonalizing vectors in WORK(IR) */
/* (Workspace: need 2*N*N+4*N-1, */
/* prefer 2*N*N+3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &work[ir], &ldwrkr, &work[itaup]
, &work[iwork], &i__2, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of R in WORK(IU) and computing */
/* right singular vectors of R in WORK(IR) */
/* (Workspace: need 2*N*N+BDSPAC) */
lapackf77_sbdsqr("U", n, n, n, &c__0, &s[1], &work[ie], &work[
ir], &ldwrkr, &work[iu], &ldwrku, dum, &c__1,
&work[iwork], info);
/* Multiply Q in A by left singular vectors of R in */
/* WORK(IU), storing result in U */
/* (Workspace: need N*N) */
blasf77_sgemm("N", "N", m, n, n, &c_b443, &a[a_offset], lda,
&work[iu], &ldwrku, &c_b421, &u[u_offset],
ldu);
/* Copy right singular vectors of R to A */
/* (Workspace: need N*N) */
lapackf77_slacpy("F", n, n, &work[ir], &ldwrkr, &a[a_offset],
lda);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Zero out below R in A */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &a[
a_dim1 + 2], lda);
/* Bidiagonalize R in A */
/* (Workspace: need 4*N, prefer 3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &a[a_offset], *lda, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Multiply Q in U by left vectors bidiagonalizing R */
/* (Workspace: need 3*N+M, prefer 3*N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("Q", "R", "N", m, n, n, &a[a_offset], lda, &
work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr)
;
/* Generate right vectors bidiagonalizing R in A */
/* (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &a[a_offset], lda, &work[itaup],
&work[iwork], &i__2, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U and computing right */
/* singular vectors of A in A */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, n, m, &c__0, &s[1], &work[ie], &a[
a_offset], lda, &u[u_offset], ldu, dum, &c__1,
&work[iwork], info);
}
} else if (wntvas) {
/* Path 6 (M much larger than N, JOBU='S', JOBVT='S' */
/* or 'A') */
/* N left singular vectors to be computed in U and */
/* N right singular vectors to be computed in VT */
/* Computing MAX */
i__2 = *n << 2;
if (*lwork >= *n * *n + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + *lda * *n) {
/* WORK(IU) is LDA by N */
ldwrku = *lda;
} else {
/* WORK(IU) is N by N */
ldwrku = *n;
}
itau = iu + ldwrku * *n;
iwork = itau + *n;
/* Compute A=Q*R */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy R to WORK(IU), zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &work[iu
+ 1], &ldwrku);
/* Generate Q in A */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IU), copying result to VT */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &work[iu], ldwrku, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
lapackf77_slacpy("U", n, n, &work[iu], &ldwrku, &vt[vt_offset],
ldvt);
/* Generate left bidiagonalizing vectors in WORK(IU) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", n, n, n, &work[iu], &ldwrku, &work[itauq]
, &work[iwork], &i__2, &ierr);
/* Generate right bidiagonalizing vectors in VT */
/* (Workspace: need N*N+4*N-1, */
/* prefer N*N+3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &vt[vt_offset], ldvt, &work[
itaup], &work[iwork], &i__2, &ierr)
;
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of R in WORK(IU) and computing */
/* right singular vectors of R in VT */
/* (Workspace: need N*N+BDSPAC) */
lapackf77_sbdsqr("U", n, n, n, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &work[iu], &ldwrku, dum, &
c__1, &work[iwork], info);
/* Multiply Q in A by left singular vectors of R in */
/* WORK(IU), storing result in U */
/* (Workspace: need N*N) */
blasf77_sgemm("N", "N", m, n, n, &c_b443, &a[a_offset], lda,
&work[iu], &ldwrku, &c_b421, &u[u_offset],
ldu);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, n, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy R to VT, zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
if (*n > 1) {
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &vt[
vt_dim1 + 2], ldvt);
}
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in VT */
/* (Workspace: need 4*N, prefer 3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &vt[vt_offset], *ldvt, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Multiply Q in U by left bidiagonalizing vectors */
/* in VT */
/* (Workspace: need 3*N+M, prefer 3*N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("Q", "R", "N", m, n, n, &vt[vt_offset], ldvt,
&work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr);
/* Generate right bidiagonalizing vectors in VT */
/* (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &vt[vt_offset], ldvt, &work[
itaup], &work[iwork], &i__2, &ierr)
;
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U and computing right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, n, m, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, dum, &
c__1, &work[iwork], info);
}
}
} else if (wntua) {
if (wntvn) {
/* Path 7 (M much larger than N, JOBU='A', JOBVT='N') */
/* M left singular vectors to be computed in U and */
/* no right singular vectors to be computed */
/* Computing MAX */
i__2 = *n + *m, i__3 = *n << 2, i__2 = max(i__2,i__3);
if (*lwork >= *n * *n + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
if (*lwork >= wrkbl + *lda * *n) {
/* WORK(IR) is LDA by N */
ldwrkr = *lda;
} else {
/* WORK(IR) is N by N */
ldwrkr = *n;
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Copy R to WORK(IR), zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &work[ir], &
ldwrkr);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &work[ir
+ 1], &ldwrkr);
/* Generate Q in U */
/* (Workspace: need N*N+N+M, prefer N*N+N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IR) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &work[ir], ldwrkr, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Generate left bidiagonalizing vectors in WORK(IR) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", n, n, n, &work[ir], &ldwrkr, &work[itauq]
, &work[iwork], &i__2, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of R in WORK(IR) */
/* (Workspace: need N*N+BDSPAC) */
lapackf77_sbdsqr("U", n, &c__0, n, &c__0, &s[1], &work[ie],
dum, &c__1, &work[ir], &ldwrkr, dum, &c__1, &
work[iwork], info);
/* Multiply Q in U by left singular vectors of R in */
/* WORK(IR), storing result in A */
/* (Workspace: need N*N) */
blasf77_sgemm("N", "N", m, n, n, &c_b443, &u[u_offset], ldu,
&work[ir], &ldwrkr, &c_b421, &a[a_offset],
lda);
/* Copy left singular vectors of A from A to U */
lapackf77_slacpy("F", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U */
/* (Workspace: need N+M, prefer N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Zero out below R in A */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &a[
a_dim1 + 2], lda);
/* Bidiagonalize R in A */
/* (Workspace: need 4*N, prefer 3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &a[a_offset], *lda, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Multiply Q in U by left bidiagonalizing vectors */
/* in A */
/* (Workspace: need 3*N+M, prefer 3*N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("Q", "R", "N", m, n, n, &a[a_offset], lda, &
work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr)
;
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, &c__0, m, &c__0, &s[1], &work[ie],
dum, &c__1, &u[u_offset], ldu, dum, &c__1, &
work[iwork], info);
}
} else if (wntvo) {
/* Path 8 (M much larger than N, JOBU='A', JOBVT='O') */
/* M left singular vectors to be computed in U and */
/* N right singular vectors to be overwritten on A */
/* Computing MAX */
i__2 = *n + *m, i__3 = *n << 2, i__2 = max(i__2,i__3);
if (*lwork >= (*n << 1) * *n + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + (*lda << 1) * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by N */
ldwrku = *lda;
ir = iu + ldwrku * *n;
ldwrkr = *lda;
} else if (*lwork >= wrkbl + (*lda + *n) * *n) {
/* WORK(IU) is LDA by N and WORK(IR) is N by N */
ldwrku = *lda;
ir = iu + ldwrku * *n;
ldwrkr = *n;
} else {
/* WORK(IU) is N by N and WORK(IR) is N by N */
ldwrku = *n;
ir = iu + ldwrku * *n;
ldwrkr = *n;
}
itau = ir + ldwrkr * *n;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U */
/* (Workspace: need 2*N*N+N+M, prefer 2*N*N+N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy R to WORK(IU), zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &work[iu
+ 1], &ldwrku);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IU), copying result to */
/* WORK(IR) */
/* (Workspace: need 2*N*N+4*N, */
/* prefer 2*N*N+3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &work[iu], ldwrku, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
lapackf77_slacpy("U", n, n, &work[iu], &ldwrku, &work[ir], &
ldwrkr);
/* Generate left bidiagonalizing vectors in WORK(IU) */
/* (Workspace: need 2*N*N+4*N, prefer 2*N*N+3*N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", n, n, n, &work[iu], &ldwrku, &work[itauq]
, &work[iwork], &i__2, &ierr);
/* Generate right bidiagonalizing vectors in WORK(IR) */
/* (Workspace: need 2*N*N+4*N-1, */
/* prefer 2*N*N+3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &work[ir], &ldwrkr, &work[itaup]
, &work[iwork], &i__2, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of R in WORK(IU) and computing */
/* right singular vectors of R in WORK(IR) */
/* (Workspace: need 2*N*N+BDSPAC) */
lapackf77_sbdsqr("U", n, n, n, &c__0, &s[1], &work[ie], &work[
ir], &ldwrkr, &work[iu], &ldwrku, dum, &c__1,
&work[iwork], info);
/* Multiply Q in U by left singular vectors of R in */
/* WORK(IU), storing result in A */
/* (Workspace: need N*N) */
blasf77_sgemm("N", "N", m, n, n, &c_b443, &u[u_offset], ldu,
&work[iu], &ldwrku, &c_b421, &a[a_offset],
lda);
/* Copy left singular vectors of A from A to U */
lapackf77_slacpy("F", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Copy right singular vectors of R from WORK(IR) to A */
lapackf77_slacpy("F", n, n, &work[ir], &ldwrkr, &a[a_offset],
lda);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U */
/* (Workspace: need N+M, prefer N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Zero out below R in A */
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &a[
a_dim1 + 2], lda);
/* Bidiagonalize R in A */
/* (Workspace: need 4*N, prefer 3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &a[a_offset], *lda, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Multiply Q in U by left bidiagonalizing vectors */
/* in A */
/* (Workspace: need 3*N+M, prefer 3*N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("Q", "R", "N", m, n, n, &a[a_offset], lda, &
work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr)
;
/* Generate right bidiagonalizing vectors in A */
/* (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &a[a_offset], lda, &work[itaup],
&work[iwork], &i__2, &ierr);
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U and computing right */
/* singular vectors of A in A */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, n, m, &c__0, &s[1], &work[ie], &a[
a_offset], lda, &u[u_offset], ldu, dum, &c__1,
&work[iwork], info);
}
} else if (wntvas) {
/* Path 9 (M much larger than N, JOBU='A', JOBVT='S' */
/* or 'A') */
/* M left singular vectors to be computed in U and */
/* N right singular vectors to be computed in VT */
/* Computing MAX */
i__2 = *n + *m, i__3 = *n << 2, i__2 = max(i__2,i__3);
if (*lwork >= *n * *n + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + *lda * *n) {
/* WORK(IU) is LDA by N */
ldwrku = *lda;
} else {
/* WORK(IU) is N by N */
ldwrku = *n;
}
itau = iu + ldwrku * *n;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need N*N+2*N, prefer N*N+N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U */
/* (Workspace: need N*N+N+M, prefer N*N+N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy R to WORK(IU), zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &work[iu
+ 1], &ldwrku);
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in WORK(IU), copying result to VT */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &work[iu], ldwrku, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
lapackf77_slacpy("U", n, n, &work[iu], &ldwrku, &vt[vt_offset],
ldvt);
/* Generate left bidiagonalizing vectors in WORK(IU) */
/* (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", n, n, n, &work[iu], &ldwrku, &work[itauq]
, &work[iwork], &i__2, &ierr);
/* Generate right bidiagonalizing vectors in VT */
/* (Workspace: need N*N+4*N-1, */
/* prefer N*N+3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &vt[vt_offset], ldvt, &work[
itaup], &work[iwork], &i__2, &ierr)
;
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of R in WORK(IU) and computing */
/* right singular vectors of R in VT */
/* (Workspace: need N*N+BDSPAC) */
lapackf77_sbdsqr("U", n, n, n, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &work[iu], &ldwrku, dum, &
c__1, &work[iwork], info);
/* Multiply Q in U by left singular vectors of R in */
/* WORK(IU), storing result in A */
/* (Workspace: need N*N) */
blasf77_sgemm("N", "N", m, n, n, &c_b443, &u[u_offset], ldu,
&work[iu], &ldwrku, &c_b421, &a[a_offset],
lda);
/* Copy left singular vectors of A from A to U */
lapackf77_slacpy("F", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *n;
/* Compute A=Q*R, copying result to U */
/* (Workspace: need 2*N, prefer N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgeqrf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset],
ldu);
/* Generate Q in U */
/* (Workspace: need N+M, prefer N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgqr(m, m, n, &u[u_offset], ldu, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy R from A to VT, zeroing out below it */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
if (*n > 1) {
i__2 = *n - 1;
i__3 = *n - 1;
lapackf77_slaset("L", &i__2, &i__3, &c_b421, &c_b421, &vt[
vt_dim1 + 2], ldvt);
}
ie = itau;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize R in VT */
/* (Workspace: need 4*N, prefer 3*N+2*N*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*n, *n, &vt[vt_offset], *ldvt, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
/* Multiply Q in U by left bidiagonalizing vectors */
/* in VT */
/* (Workspace: need 3*N+M, prefer 3*N+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("Q", "R", "N", m, n, n, &vt[vt_offset], ldvt,
&work[itauq], &u[u_offset], ldu, &work[iwork],
&i__2, &ierr);
/* Generate right bidiagonalizing vectors in VT */
/* (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &vt[vt_offset], ldvt, &work[
itaup], &work[iwork], &i__2, &ierr)
;
iwork = ie + *n;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U and computing right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, n, m, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, dum, &
c__1, &work[iwork], info);
}
}
}
} else {
/* M .LT. MNTHR */
/* Path 10 (M at least N, but not much larger) */
/* Reduce to bidiagonal form without QR decomposition */
ie = 1;
itauq = ie + *n;
itaup = itauq + *n;
iwork = itaup + *n;
/* Bidiagonalize A */
/* (Workspace: need 3*N+M, prefer 3*N+(M+N)*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *n, &a[a_offset], *lda, &s[1],
&work[ie], &work[itauq], &work[itaup],
&work[iwork], i__2, &ierr);
if (wntuas) {
/* If left singular vectors desired in U, copy result to U */
/* and generate left bidiagonalizing vectors in U */
/* (Workspace: need 3*N+NCU, prefer 3*N+NCU*NB) */
lapackf77_slacpy("L", m, n, &a[a_offset], lda, &u[u_offset], ldu);
if (wntus) {
ncu = *n;
}
if (wntua) {
ncu = *m;
}
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, &ncu, n, &u[u_offset], ldu, &work[itauq], &
work[iwork], &i__2, &ierr);
}
if (wntvas) {
/* If right singular vectors desired in VT, copy result to */
/* VT and generate right bidiagonalizing vectors in VT */
/* (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB) */
lapackf77_slacpy("U", n, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &vt[vt_offset], ldvt, &work[itaup], &
work[iwork], &i__2, &ierr);
}
if (wntuo) {
/* If left singular vectors desired in A, generate left */
/* bidiagonalizing vectors in A */
/* (Workspace: need 4*N, prefer 3*N+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, n, n, &a[a_offset], lda, &work[itauq], &work[
iwork], &i__2, &ierr);
}
if (wntvo) {
/* If right singular vectors desired in A, generate right */
/* bidiagonalizing vectors in A */
/* (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", n, n, n, &a[a_offset], lda, &work[itaup], &work[
iwork], &i__2, &ierr);
}
iwork = ie + *n;
if (wntuas || wntuo) {
nru = *m;
}
if (wntun) {
nru = 0;
}
if (wntvas || wntvo) {
ncvt = *n;
}
if (wntvn) {
ncvt = 0;
}
if (! wntuo && ! wntvo) {
/* Perform bidiagonal QR iteration, if desired, computing */
/* left singular vectors in U and computing right singular */
/* vectors in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, &ncvt, &nru, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, dum, &c__1, &
work[iwork], info);
} else if (! wntuo && wntvo) {
/* Perform bidiagonal QR iteration, if desired, computing */
/* left singular vectors in U and computing right singular */
/* vectors in A */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, &ncvt, &nru, &c__0, &s[1], &work[ie], &a[
a_offset], lda, &u[u_offset], ldu, dum, &c__1, &work[
iwork], info);
} else {
/* Perform bidiagonal QR iteration, if desired, computing */
/* left singular vectors in A and computing right singular */
/* vectors in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", n, &ncvt, &nru, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, dum, &c__1, &
work[iwork], info);
}
}
} else {
/* A has more columns than rows. If A has sufficiently more */
/* columns than rows, first reduce using the LQ decomposition (if */
/* sufficient workspace available) */
if (*n >= mnthr) {
if (wntvn) {
/* Path 1t(N much larger than M, JOBVT='N') */
/* No right singular vectors to be computed */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[iwork], &
i__2, &ierr);
/* Zero out above L */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &a[(a_dim1 << 1)
+ 1], lda);
ie = 1;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in A */
/* (Workspace: need 4*M, prefer 3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &a[a_offset], *lda, &s[1], &work[ie], &work[
itauq], &work[itaup], &work[iwork], i__2, &ierr);
if (wntuo || wntuas) {
/* If left singular vectors desired, generate Q */
/* (Workspace: need 4*M, prefer 3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &a[a_offset], lda, &work[itauq], &
work[iwork], &i__2, &ierr);
}
iwork = ie + *m;
nru = 0;
if (wntuo || wntuas) {
nru = *m;
}
/* Perform bidiagonal QR iteration, computing left singular */
/* vectors of A in A if desired */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", m, &c__0, &nru, &c__0, &s[1], &work[ie], dum, &
c__1, &a[a_offset], lda, dum, &c__1, &work[iwork],
info);
/* If left singular vectors desired in U, copy them there */
if (wntuas) {
lapackf77_slacpy("F", m, m, &a[a_offset], lda, &u[u_offset], ldu);
}
} else if (wntvo && wntun) {
/* Path 2t(N much larger than M, JOBU='N', JOBVT='O') */
/* M right singular vectors to be overwritten on A and */
/* no left singular vectors to be computed */
/* Computing MAX */
i__2 = *m << 2;
if (*lwork >= *m * *m + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
/* Computing MAX */
i__2 = wrkbl, i__3 = *lda * *n + *m;
if (*lwork >= max(i__2,i__3) + *lda * *m) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by M */
ldwrku = *lda;
chunk = *n;
ldwrkr = *lda;
} else /* if(complicated condition) */ {
/* Computing MAX */
i__2 = wrkbl, i__3 = *lda * *n + *m;
if (*lwork >= max(i__2,i__3) + *m * *m) {
/* WORK(IU) is LDA by N and WORK(IR) is M by M */
ldwrku = *lda;
chunk = *n;
ldwrkr = *m;
} else {
/* WORK(IU) is M by CHUNK and WORK(IR) is M by M */
ldwrku = *m;
chunk = (*lwork - *m * *m - *m) / *m;
ldwrkr = *m;
}
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__2, &ierr);
/* Copy L to WORK(IR) and zero out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &work[ir], &ldwrkr);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &work[ir +
ldwrkr], &ldwrkr);
/* Generate Q in A */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IR) */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &work[ir], ldwrkr, &s[1], &work[ie], &work[
itauq], &work[itaup], &work[iwork], i__2, &ierr);
/* Generate right vectors bidiagonalizing L */
/* (Workspace: need M*M+4*M-1, prefer M*M+3*M+(M-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, m, m, &work[ir], &ldwrkr, &work[itaup], &
work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right */
/* singular vectors of L in WORK(IR) */
/* (Workspace: need M*M+BDSPAC) */
lapackf77_sbdsqr("U", m, m, &c__0, &c__0, &s[1], &work[ie], &work[
ir], &ldwrkr, dum, &c__1, dum, &c__1, &work[iwork]
, info);
iu = ie + *m;
/* Multiply right singular vectors of L in WORK(IR) by Q */
/* in A, storing result in WORK(IU) and copying to A */
/* (Workspace: need M*M+2*M, prefer M*M+M*N+M) */
i__2 = *n;
i__3 = chunk;
for (i__ = 1; i__3 < 0 ? i__ >= i__2 : i__ <= i__2; i__ +=
i__3) {
/* Computing MIN */
i__4 = *n - i__ + 1;
blk = min(i__4,chunk);
blasf77_sgemm("N", "N", m, &blk, m, &c_b443, &work[ir], &
ldwrkr, &a[i__ * a_dim1 + 1], lda, &c_b421, &
work[iu], &ldwrku);
lapackf77_slacpy("F", m, &blk, &work[iu], &ldwrku, &a[i__ *
a_dim1 + 1], lda);
/* L30: */
}
} else {
/* Insufficient workspace for a fast algorithm */
ie = 1;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize A */
/* (Workspace: need 3*M+N, prefer 3*M+(M+N)*NB) */
i__3 = *lwork - iwork + 1;
magma_sgebrd(*m, *n, &a[a_offset], *lda, &s[1], &work[ie], &work[
itauq], &work[itaup], &work[iwork], i__3, &ierr);
/* Generate right vectors bidiagonalizing A */
/* (Workspace: need 4*M, prefer 3*M+M*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, n, m, &a[a_offset], lda, &work[itaup], &
work[iwork], &i__3, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right */
/* singular vectors of A in A */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("L", m, n, &c__0, &c__0, &s[1], &work[ie], &a[
a_offset], lda, dum, &c__1, dum, &c__1, &work[
iwork], info);
}
} else if (wntvo && wntuas) {
/* Path 3t(N much larger than M, JOBU='S' or 'A', JOBVT='O') */
/* M right singular vectors to be overwritten on A and */
/* M left singular vectors to be computed in U */
/* Computing MAX */
i__3 = *m << 2;
if (*lwork >= *m * *m + max(i__3,bdspac)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
/* Computing MAX */
i__3 = wrkbl, i__2 = *lda * *n + *m;
if (*lwork >= max(i__3,i__2) + *lda * *m) {
/* WORK(IU) is LDA by N and WORK(IR) is LDA by M */
ldwrku = *lda;
chunk = *n;
ldwrkr = *lda;
} else /* if(complicated condition) */ {
/* Computing MAX */
i__3 = wrkbl, i__2 = *lda * *n + *m;
if (*lwork >= max(i__3,i__2) + *m * *m) {
/* WORK(IU) is LDA by N and WORK(IR) is M by M */
ldwrku = *lda;
chunk = *n;
ldwrkr = *m;
} else {
/* WORK(IU) is M by CHUNK and WORK(IR) is M by M */
ldwrku = *m;
chunk = (*lwork - *m * *m - *m) / *m;
ldwrkr = *m;
}
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__3, &ierr);
/* Copy L to U, zeroing about above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
i__3 = *m - 1;
i__2 = *m - 1;
lapackf77_slaset("U", &i__3, &i__2, &c_b421, &c_b421, &u[(u_dim1 <<
1) + 1], ldu);
/* Generate Q in A */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &a[a_offset], lda, &work[itau], &work[
iwork], &i__3, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in U, copying result to WORK(IR) */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB) */
i__3 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &u[u_offset], *ldu, &s[1], &work[ie], &work[
itauq], &work[itaup], &work[iwork], i__3, &ierr);
lapackf77_slacpy("U", m, m, &u[u_offset], ldu, &work[ir], &ldwrkr);
/* Generate right vectors bidiagonalizing L in WORK(IR) */
/* (Workspace: need M*M+4*M-1, prefer M*M+3*M+(M-1)*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, m, m, &work[ir], &ldwrkr, &work[itaup], &
work[iwork], &i__3, &ierr);
/* Generate left vectors bidiagonalizing L in U */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+M*NB) */
i__3 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq], &
work[iwork], &i__3, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of L in U, and computing right */
/* singular vectors of L in WORK(IR) */
/* (Workspace: need M*M+BDSPAC) */
lapackf77_sbdsqr("U", m, m, m, &c__0, &s[1], &work[ie], &work[ir],
&ldwrkr, &u[u_offset], ldu, dum, &c__1, &work[
iwork], info);
iu = ie + *m;
/* Multiply right singular vectors of L in WORK(IR) by Q */
/* in A, storing result in WORK(IU) and copying to A */
/* (Workspace: need M*M+2*M, prefer M*M+M*N+M)) */
i__3 = *n;
i__2 = chunk;
for (i__ = 1; i__2 < 0 ? i__ >= i__3 : i__ <= i__3; i__ +=
i__2) {
/* Computing MIN */
i__4 = *n - i__ + 1;
blk = min(i__4,chunk);
blasf77_sgemm("N", "N", m, &blk, m, &c_b443, &work[ir], &
ldwrkr, &a[i__ * a_dim1 + 1], lda, &c_b421, &
work[iu], &ldwrku);
lapackf77_slacpy("F", m, &blk, &work[iu], &ldwrku, &a[i__ *
a_dim1 + 1], lda);
/* L40: */
}
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[iwork]
, &i__2, &ierr);
/* Copy L to U, zeroing out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &u[(u_dim1 <<
1) + 1], ldu);
/* Generate Q in A */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in U */
/* (Workspace: need 4*M, prefer 3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &u[u_offset], *ldu, &s[1], &work[ie], &work[
itauq], &work[itaup], &work[iwork], i__2, &ierr);
/* Multiply right vectors bidiagonalizing L by Q in A */
/* (Workspace: need 3*M+N, prefer 3*M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("P", "L", "T", m, n, m, &u[u_offset], ldu, &work[
itaup], &a[a_offset], lda, &work[iwork], &i__2, &
ierr);
/* Generate left vectors bidiagonalizing L in U */
/* (Workspace: need 4*M, prefer 3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq], &
work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U and computing right */
/* singular vectors of A in A */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", m, n, m, &c__0, &s[1], &work[ie], &a[
a_offset], lda, &u[u_offset], ldu, dum, &c__1, &
work[iwork], info);
}
} else if (wntvs) {
if (wntun) {
/* Path 4t(N much larger than M, JOBU='N', JOBVT='S') */
/* M right singular vectors to be computed in VT and */
/* no left singular vectors to be computed */
/* Computing MAX */
i__2 = *m << 2;
if (*lwork >= *m * *m + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
if (*lwork >= wrkbl + *lda * *m) {
/* WORK(IR) is LDA by M */
ldwrkr = *lda;
} else {
/* WORK(IR) is M by M */
ldwrkr = *m;
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy L to WORK(IR), zeroing out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &work[ir], &
ldwrkr);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &work[ir
+ ldwrkr], &ldwrkr);
/* Generate Q in A */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IR) */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &work[ir], ldwrkr, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Generate right vectors bidiagonalizing L in */
/* WORK(IR) */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+(M-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, m, m, &work[ir], &ldwrkr, &work[itaup]
, &work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right */
/* singular vectors of L in WORK(IR) */
/* (Workspace: need M*M+BDSPAC) */
lapackf77_sbdsqr("U", m, m, &c__0, &c__0, &s[1], &work[ie], &
work[ir], &ldwrkr, dum, &c__1, dum, &c__1, &
work[iwork], info);
/* Multiply right singular vectors of L in WORK(IR) by */
/* Q in A, storing result in VT */
/* (Workspace: need M*M) */
blasf77_sgemm("N", "N", m, n, m, &c_b443, &work[ir], &ldwrkr,
&a[a_offset], lda, &c_b421, &vt[vt_offset],
ldvt);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy result to VT */
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Zero out above L in A */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &a[(
a_dim1 << 1) + 1], lda);
/* Bidiagonalize L in A */
/* (Workspace: need 4*M, prefer 3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &a[a_offset], *lda, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right vectors bidiagonalizing L by Q in VT */
/* (Workspace: need 3*M+N, prefer 3*M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("P", "L", "T", m, n, m, &a[a_offset], lda, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", m, n, &c__0, &c__0, &s[1], &work[ie], &
vt[vt_offset], ldvt, dum, &c__1, dum, &c__1, &
work[iwork], info);
}
} else if (wntuo) {
/* Path 5t(N much larger than M, JOBU='O', JOBVT='S') */
/* M right singular vectors to be computed in VT and */
/* M left singular vectors to be overwritten on A */
/* Computing MAX */
i__2 = *m << 2;
if (*lwork >= (*m << 1) * *m + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + (*lda << 1) * *m) {
/* WORK(IU) is LDA by M and WORK(IR) is LDA by M */
ldwrku = *lda;
ir = iu + ldwrku * *m;
ldwrkr = *lda;
} else if (*lwork >= wrkbl + (*lda + *m) * *m) {
/* WORK(IU) is LDA by M and WORK(IR) is M by M */
ldwrku = *lda;
ir = iu + ldwrku * *m;
ldwrkr = *m;
} else {
/* WORK(IU) is M by M and WORK(IR) is M by M */
ldwrku = *m;
ir = iu + ldwrku * *m;
ldwrkr = *m;
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q */
/* (Workspace: need 2*M*M+2*M, prefer 2*M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy L to WORK(IU), zeroing out below it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &work[iu
+ ldwrku], &ldwrku);
/* Generate Q in A */
/* (Workspace: need 2*M*M+2*M, prefer 2*M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IU), copying result to */
/* WORK(IR) */
/* (Workspace: need 2*M*M+4*M, */
/* prefer 2*M*M+3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &work[iu], ldwrku, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_slacpy("L", m, m, &work[iu], &ldwrku, &work[ir], &
ldwrkr);
/* Generate right bidiagonalizing vectors in WORK(IU) */
/* (Workspace: need 2*M*M+4*M-1, */
/* prefer 2*M*M+3*M+(M-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, m, m, &work[iu], &ldwrku, &work[itaup]
, &work[iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in WORK(IR) */
/* (Workspace: need 2*M*M+4*M, prefer 2*M*M+3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &work[ir], &ldwrkr, &work[itauq]
, &work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of L in WORK(IR) and computing */
/* right singular vectors of L in WORK(IU) */
/* (Workspace: need 2*M*M+BDSPAC) */
lapackf77_sbdsqr("U", m, m, m, &c__0, &s[1], &work[ie], &work[
iu], &ldwrku, &work[ir], &ldwrkr, dum, &c__1,
&work[iwork], info);
/* Multiply right singular vectors of L in WORK(IU) by */
/* Q in A, storing result in VT */
/* (Workspace: need M*M) */
blasf77_sgemm("N", "N", m, n, m, &c_b443, &work[iu], &ldwrku,
&a[a_offset], lda, &c_b421, &vt[vt_offset],
ldvt);
/* Copy left singular vectors of L to A */
/* (Workspace: need M*M) */
lapackf77_slacpy("F", m, m, &work[ir], &ldwrkr, &a[a_offset],
lda);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Zero out above L in A */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &a[(
a_dim1 << 1) + 1], lda);
/* Bidiagonalize L in A */
/* (Workspace: need 4*M, prefer 3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &a[a_offset], *lda, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right vectors bidiagonalizing L by Q in VT */
/* (Workspace: need 3*M+N, prefer 3*M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("P", "L", "T", m, n, m, &a[a_offset], lda, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors of L in A */
/* (Workspace: need 4*M, prefer 3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &a[a_offset], lda, &work[itauq],
&work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, compute left */
/* singular vectors of A in A and compute right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", m, n, m, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, dum, &
c__1, &work[iwork], info);
}
} else if (wntuas) {
/* Path 6t(N much larger than M, JOBU='S' or 'A', */
/* JOBVT='S') */
/* M right singular vectors to be computed in VT and */
/* M left singular vectors to be computed in U */
/* Computing MAX */
i__2 = *m << 2;
if (*lwork >= *m * *m + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + *lda * *m) {
/* WORK(IU) is LDA by N */
ldwrku = *lda;
} else {
/* WORK(IU) is LDA by M */
ldwrku = *m;
}
itau = iu + ldwrku * *m;
iwork = itau + *m;
/* Compute A=L*Q */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
/* Copy L to WORK(IU), zeroing out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &work[iu
+ ldwrku], &ldwrku);
/* Generate Q in A */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &a[a_offset], lda, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IU), copying result to U */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &work[iu], ldwrku, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_slacpy("L", m, m, &work[iu], &ldwrku, &u[u_offset],
ldu);
/* Generate right bidiagonalizing vectors in WORK(IU) */
/* (Workspace: need M*M+4*M-1, */
/* prefer M*M+3*M+(M-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, m, m, &work[iu], &ldwrku, &work[itaup]
, &work[iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in U */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq],
&work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of L in U and computing right */
/* singular vectors of L in WORK(IU) */
/* (Workspace: need M*M+BDSPAC) */
lapackf77_sbdsqr("U", m, m, m, &c__0, &s[1], &work[ie], &work[
iu], &ldwrku, &u[u_offset], ldu, dum, &c__1, &
work[iwork], info);
/* Multiply right singular vectors of L in WORK(IU) by */
/* Q in A, storing result in VT */
/* (Workspace: need M*M) */
blasf77_sgemm("N", "N", m, n, m, &c_b443, &work[iu], &ldwrku,
&a[a_offset], lda, &c_b421, &vt[vt_offset],
ldvt);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(m, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy L to U, zeroing out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &u[u_offset],
ldu);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &u[(
u_dim1 << 1) + 1], ldu);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in U */
/* (Workspace: need 4*M, prefer 3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &u[u_offset], *ldu, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right bidiagonalizing vectors in U by Q */
/* in VT */
/* (Workspace: need 3*M+N, prefer 3*M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("P", "L", "T", m, n, m, &u[u_offset], ldu, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in U */
/* (Workspace: need 4*M, prefer 3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq],
&work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U and computing right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", m, n, m, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, dum, &
c__1, &work[iwork], info);
}
}
} else if (wntva) {
if (wntun) {
/* Path 7t(N much larger than M, JOBU='N', JOBVT='A') */
/* N right singular vectors to be computed in VT and */
/* no left singular vectors to be computed */
/* Computing MAX */
i__2 = *n + *m, i__3 = *m << 2, i__2 = max(i__2,i__3);
if (*lwork >= *m * *m + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
ir = 1;
if (*lwork >= wrkbl + *lda * *m) {
/* WORK(IR) is LDA by M */
ldwrkr = *lda;
} else {
/* WORK(IR) is M by M */
ldwrkr = *m;
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Copy L to WORK(IR), zeroing out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &work[ir], &
ldwrkr);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &work[ir
+ ldwrkr], &ldwrkr);
/* Generate Q in VT */
/* (Workspace: need M*M+M+N, prefer M*M+M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(n, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IR) */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &work[ir], ldwrkr, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Generate right bidiagonalizing vectors in WORK(IR) */
/* (Workspace: need M*M+4*M-1, */
/* prefer M*M+3*M+(M-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, m, m, &work[ir], &ldwrkr, &work[itaup]
, &work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right */
/* singular vectors of L in WORK(IR) */
/* (Workspace: need M*M+BDSPAC) */
lapackf77_sbdsqr("U", m, m, &c__0, &c__0, &s[1], &work[ie], &
work[ir], &ldwrkr, dum, &c__1, dum, &c__1, &
work[iwork], info);
/* Multiply right singular vectors of L in WORK(IR) by */
/* Q in VT, storing result in A */
/* (Workspace: need M*M) */
blasf77_sgemm("N", "N", m, n, m, &c_b443, &work[ir], &ldwrkr,
&vt[vt_offset], ldvt, &c_b421, &a[a_offset],
lda);
/* Copy right singular vectors of A from A to VT */
lapackf77_slacpy("F", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT */
/* (Workspace: need M+N, prefer M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(n, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Zero out above L in A */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &a[(
a_dim1 << 1) + 1], lda);
/* Bidiagonalize L in A */
/* (Workspace: need 4*M, prefer 3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &a[a_offset], *lda, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right bidiagonalizing vectors in A by Q */
/* in VT */
/* (Workspace: need 3*M+N, prefer 3*M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("P", "L", "T", m, n, m, &a[a_offset], lda, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", m, n, &c__0, &c__0, &s[1], &work[ie], &
vt[vt_offset], ldvt, dum, &c__1, dum, &c__1, &
work[iwork], info);
}
} else if (wntuo) {
/* Path 8t(N much larger than M, JOBU='O', JOBVT='A') */
/* N right singular vectors to be computed in VT and */
/* M left singular vectors to be overwritten on A */
/* Computing MAX */
i__2 = *n + *m, i__3 = *m << 2, i__2 = max(i__2,i__3);
if (*lwork >= (*m << 1) * *m + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + (*lda << 1) * *m) {
/* WORK(IU) is LDA by M and WORK(IR) is LDA by M */
ldwrku = *lda;
ir = iu + ldwrku * *m;
ldwrkr = *lda;
} else if (*lwork >= wrkbl + (*lda + *m) * *m) {
/* WORK(IU) is LDA by M and WORK(IR) is M by M */
ldwrku = *lda;
ir = iu + ldwrku * *m;
ldwrkr = *m;
} else {
/* WORK(IU) is M by M and WORK(IR) is M by M */
ldwrku = *m;
ir = iu + ldwrku * *m;
ldwrkr = *m;
}
itau = ir + ldwrkr * *m;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT */
/* (Workspace: need 2*M*M+2*M, prefer 2*M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT */
/* (Workspace: need 2*M*M+M+N, prefer 2*M*M+M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(n, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy L to WORK(IU), zeroing out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &work[iu
+ ldwrku], &ldwrku);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IU), copying result to */
/* WORK(IR) */
/* (Workspace: need 2*M*M+4*M, */
/* prefer 2*M*M+3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &work[iu], ldwrku, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_slacpy("L", m, m, &work[iu], &ldwrku, &work[ir], &
ldwrkr);
/* Generate right bidiagonalizing vectors in WORK(IU) */
/* (Workspace: need 2*M*M+4*M-1, */
/* prefer 2*M*M+3*M+(M-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, m, m, &work[iu], &ldwrku, &work[itaup]
, &work[iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in WORK(IR) */
/* (Workspace: need 2*M*M+4*M, prefer 2*M*M+3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &work[ir], &ldwrkr, &work[itauq]
, &work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of L in WORK(IR) and computing */
/* right singular vectors of L in WORK(IU) */
/* (Workspace: need 2*M*M+BDSPAC) */
lapackf77_sbdsqr("U", m, m, m, &c__0, &s[1], &work[ie], &work[
iu], &ldwrku, &work[ir], &ldwrkr, dum, &c__1,
&work[iwork], info);
/* Multiply right singular vectors of L in WORK(IU) by */
/* Q in VT, storing result in A */
/* (Workspace: need M*M) */
blasf77_sgemm("N", "N", m, n, m, &c_b443, &work[iu], &ldwrku,
&vt[vt_offset], ldvt, &c_b421, &a[a_offset],
lda);
/* Copy right singular vectors of A from A to VT */
lapackf77_slacpy("F", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Copy left singular vectors of A from WORK(IR) to A */
lapackf77_slacpy("F", m, m, &work[ir], &ldwrkr, &a[a_offset],
lda);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT */
/* (Workspace: need M+N, prefer M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(n, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Zero out above L in A */
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &a[(
a_dim1 << 1) + 1], lda);
/* Bidiagonalize L in A */
/* (Workspace: need 4*M, prefer 3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &a[a_offset], *lda, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right bidiagonalizing vectors in A by Q */
/* in VT */
/* (Workspace: need 3*M+N, prefer 3*M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("P", "L", "T", m, n, m, &a[a_offset], lda, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in A */
/* (Workspace: need 4*M, prefer 3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &a[a_offset], lda, &work[itauq],
&work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in A and computing right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", m, n, m, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, dum, &
c__1, &work[iwork], info);
}
} else if (wntuas) {
/* Path 9t(N much larger than M, JOBU='S' or 'A', */
/* JOBVT='A') */
/* N right singular vectors to be computed in VT and */
/* M left singular vectors to be computed in U */
/* Computing MAX */
i__2 = *n + *m, i__3 = *m << 2, i__2 = max(i__2,i__3);
if (*lwork >= *m * *m + max(i__2,bdspac)) {
/* Sufficient workspace for a fast algorithm */
iu = 1;
if (*lwork >= wrkbl + *lda * *m) {
/* WORK(IU) is LDA by M */
ldwrku = *lda;
} else {
/* WORK(IU) is M by M */
ldwrku = *m;
}
itau = iu + ldwrku * *m;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT */
/* (Workspace: need M*M+2*M, prefer M*M+M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT */
/* (Workspace: need M*M+M+N, prefer M*M+M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(n, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy L to WORK(IU), zeroing out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &work[iu], &
ldwrku);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &work[iu
+ ldwrku], &ldwrku);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in WORK(IU), copying result to U */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &work[iu], ldwrku, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
lapackf77_slacpy("L", m, m, &work[iu], &ldwrku, &u[u_offset],
ldu);
/* Generate right bidiagonalizing vectors in WORK(IU) */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+(M-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, m, m, &work[iu], &ldwrku, &work[itaup]
, &work[iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in U */
/* (Workspace: need M*M+4*M, prefer M*M+3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq],
&work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of L in U and computing right */
/* singular vectors of L in WORK(IU) */
/* (Workspace: need M*M+BDSPAC) */
lapackf77_sbdsqr("U", m, m, m, &c__0, &s[1], &work[ie], &work[
iu], &ldwrku, &u[u_offset], ldu, dum, &c__1, &
work[iwork], info);
/* Multiply right singular vectors of L in WORK(IU) by */
/* Q in VT, storing result in A */
/* (Workspace: need M*M) */
blasf77_sgemm("N", "N", m, n, m, &c_b443, &work[iu], &ldwrku,
&vt[vt_offset], ldvt, &c_b421, &a[a_offset],
lda);
/* Copy right singular vectors of A from A to VT */
lapackf77_slacpy("F", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
} else {
/* Insufficient workspace for a fast algorithm */
itau = 1;
iwork = itau + *m;
/* Compute A=L*Q, copying result to VT */
/* (Workspace: need 2*M, prefer M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sgelqf(m, n, &a[a_offset], lda, &work[itau], &work[
iwork], &i__2, &ierr);
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset],
ldvt);
/* Generate Q in VT */
/* (Workspace: need M+N, prefer M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorglq(n, n, m, &vt[vt_offset], ldvt, &work[itau], &
work[iwork], &i__2, &ierr);
/* Copy L to U, zeroing out above it */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &u[u_offset],
ldu);
i__2 = *m - 1;
i__3 = *m - 1;
lapackf77_slaset("U", &i__2, &i__3, &c_b421, &c_b421, &u[(
u_dim1 << 1) + 1], ldu);
ie = itau;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize L in U */
/* (Workspace: need 4*M, prefer 3*M+2*M*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *m, &u[u_offset], *ldu, &s[1], &work[ie], &
work[itauq], &work[itaup], &work[iwork],
i__2, &ierr);
/* Multiply right bidiagonalizing vectors in U by Q */
/* in VT */
/* (Workspace: need 3*M+N, prefer 3*M+N*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sormbr("P", "L", "T", m, n, m, &u[u_offset], ldu, &
work[itaup], &vt[vt_offset], ldvt, &work[
iwork], &i__2, &ierr);
/* Generate left bidiagonalizing vectors in U */
/* (Workspace: need 4*M, prefer 3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, m, &u[u_offset], ldu, &work[itauq],
&work[iwork], &i__2, &ierr);
iwork = ie + *m;
/* Perform bidiagonal QR iteration, computing left */
/* singular vectors of A in U and computing right */
/* singular vectors of A in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("U", m, n, m, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, dum, &
c__1, &work[iwork], info);
}
}
}
} else {
/* N .LT. MNTHR */
/* Path 10t(N greater than M, but not much larger) */
/* Reduce to bidiagonal form without LQ decomposition */
ie = 1;
itauq = ie + *m;
itaup = itauq + *m;
iwork = itaup + *m;
/* Bidiagonalize A */
/* (Workspace: need 3*M+N, prefer 3*M+(M+N)*NB) */
i__2 = *lwork - iwork + 1;
magma_sgebrd(*m, *n, &a[a_offset], *lda, &s[1], &work[ie], &work[itauq], &
work[itaup], &work[iwork], i__2, &ierr);
if (wntuas) {
/* If left singular vectors desired in U, copy result to U */
/* and generate left bidiagonalizing vectors in U */
/* (Workspace: need 4*M-1, prefer 3*M+(M-1)*NB) */
lapackf77_slacpy("L", m, m, &a[a_offset], lda, &u[u_offset], ldu);
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, n, &u[u_offset], ldu, &work[itauq], &work[
iwork], &i__2, &ierr);
}
if (wntvas) {
/* If right singular vectors desired in VT, copy result to */
/* VT and generate right bidiagonalizing vectors in VT */
/* (Workspace: need 3*M+NRVT, prefer 3*M+NRVT*NB) */
lapackf77_slacpy("U", m, n, &a[a_offset], lda, &vt[vt_offset], ldvt);
if (wntva) {
nrvt = *n;
}
if (wntvs) {
nrvt = *m;
}
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", &nrvt, n, m, &vt[vt_offset], ldvt, &work[itaup],
&work[iwork], &i__2, &ierr);
}
if (wntuo) {
/* If left singular vectors desired in A, generate left */
/* bidiagonalizing vectors in A */
/* (Workspace: need 4*M-1, prefer 3*M+(M-1)*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("Q", m, m, n, &a[a_offset], lda, &work[itauq], &work[
iwork], &i__2, &ierr);
}
if (wntvo) {
/* If right singular vectors desired in A, generate right */
/* bidiagonalizing vectors in A */
/* (Workspace: need 4*M, prefer 3*M+M*NB) */
i__2 = *lwork - iwork + 1;
lapackf77_sorgbr("P", m, n, m, &a[a_offset], lda, &work[itaup], &work[
iwork], &i__2, &ierr);
}
iwork = ie + *m;
if (wntuas || wntuo) {
nru = *m;
}
if (wntun) {
nru = 0;
}
if (wntvas || wntvo) {
ncvt = *n;
}
if (wntvn) {
ncvt = 0;
}
if (! wntuo && ! wntvo) {
/* Perform bidiagonal QR iteration, if desired, computing */
/* left singular vectors in U and computing right singular */
/* vectors in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("L", m, &ncvt, &nru, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &u[u_offset], ldu, dum, &c__1, &
work[iwork], info);
} else if (! wntuo && wntvo) {
/* Perform bidiagonal QR iteration, if desired, computing */
/* left singular vectors in U and computing right singular */
/* vectors in A */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("L", m, &ncvt, &nru, &c__0, &s[1], &work[ie], &a[
a_offset], lda, &u[u_offset], ldu, dum, &c__1, &work[
iwork], info);
} else {
/* Perform bidiagonal QR iteration, if desired, computing */
/* left singular vectors in A and computing right singular */
/* vectors in VT */
/* (Workspace: need BDSPAC) */
lapackf77_sbdsqr("L", m, &ncvt, &nru, &c__0, &s[1], &work[ie], &vt[
vt_offset], ldvt, &a[a_offset], lda, dum, &c__1, &
work[iwork], info);
}
}
}
/* If DBDSQR failed to converge, copy unconverged superdiagonals */
/* to WORK( 2:MINMN ) */
if (*info != 0) {
if (ie > 2) {
i__2 = minmn - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
work[i__ + 1] = work[i__ + ie - 1];
/* L50: */
}
}
if (ie < 2) {
for (i__ = minmn - 1; i__ >= 1; --i__) {
work[i__ + 1] = work[i__ + ie - 1];
/* L60: */
}
}
}
/* Undo scaling if necessary */
if (iscl == 1) {
if (anrm > bignum) {
lapackf77_slascl("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
minmn, &ierr);
}
if (*info != 0 && anrm > bignum) {
i__2 = minmn - 1;
lapackf77_slascl("G", &c__0, &c__0, &bignum, &anrm, &i__2, &c__1, &work[2],
&minmn, &ierr);
}
if (anrm < smlnum) {
lapackf77_slascl("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
minmn, &ierr);
}
if (*info != 0 && anrm < smlnum) {
i__2 = minmn - 1;
lapackf77_slascl("G", &c__0, &c__0, &smlnum, &anrm, &i__2, &c__1, &work[2],
&minmn, &ierr);
}
}
return *info;
} /* sgesvd_ */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgetrf ( magma_int_t  m,
magma_int_t  n,
float *  A,
magma_int_t  lda,
magma_int_t ipiv,
magma_int_t info 
)

Definition at line 37 of file sgetrf.cpp.

References __func__, dA, inAT, lapackf77_sgetrf(), magma_device_sync(), magma_free(), magma_get_sgetrf_nb(), MAGMA_S_NEG_ONE, MAGMA_S_ONE, magma_sgemm, magma_sgetmatrix(), magma_sgetrf2_piv(), magma_sgetrf3_ooc(), magma_smalloc(), magma_ssetmatrix(), magma_strsm, MAGMA_SUCCESS, magma_xerbla(), magmablas_sinplace_transpose(), magmablas_spermute_long2(), magmablas_stranspose(), magmablas_stranspose2(), MagmaNoTrans, MagmaRight, MagmaUnit, MagmaUpper, max, min, and codegen::work.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SGETRF computes an LU factorization of a general M-by-N matrix A
using partial pivoting with row interchanges. This version does not
require work space on the GPU passed as input. GPU memory is allocated
in the routine.
The factorization has the form
A = P * L * U
where P is a permutation matrix, L is lower triangular with unit
diagonal elements (lower trapezoidal if m > n), and U is upper
triangular (upper trapezoidal if m < n).
This is the right-looking Level 3 BLAS version of the algorithm.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
> 0: if INFO = i, U(i,i) is exactly zero. The factorization
has been completed, but the factor U is exactly
singular, and division by zero will occur if it is used
to solve a system of equations.
===================================================================== */
#define inAT(i,j) (dAT + (i)*nb*ldda + (j)*nb)
float *dAT, *dA, *da, *work;
float c_one = MAGMA_S_ONE;
float c_neg_one = MAGMA_S_NEG_ONE;
magma_int_t iinfo, nb;
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0)
*info = -2;
else if (lda < max(1,m))
*info = -4;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0)
return *info;
if ( (nb <= 1) || (nb >= min(m,n)) ) {
/* Use CPU code. */
lapackf77_sgetrf(&m, &n, a, &lda, ipiv, info);
} else {
/* Use hybrid blocked code. */
magma_int_t maxm, maxn, ldda, maxdim;
magma_int_t i, rows, cols, s = min(m, n)/nb;
char * num_gpus_char = getenv("MAGMA_NUM_GPUS");
magma_int_t num_gpus = 1;
if( num_gpus_char != NULL ) num_gpus = atoi(num_gpus_char);
if( num_gpus_char != NULL && num_gpus >= 1 ) {
/* call multi-GPU non-GPU-resident interface */
magma_int_t rval = magma_sgetrf3_ooc(num_gpus, m, n, a, lda, ipiv, info);
if( *info >= 0 ) magma_sgetrf2_piv( num_gpus, m, n, a, lda, ipiv, info);
return *info;
//return magma_sgetrf3(num_gpus, m, n, a, lda, ipiv, info);
}
maxm = ((m + 31)/32)*32;
maxn = ((n + 31)/32)*32;
maxdim = max(maxm, maxn);
ldda = maxn;
work = a;
if (maxdim*maxdim < 2*maxm*maxn)
{
if (MAGMA_SUCCESS != magma_smalloc( &dA, nb*maxm + maxdim*maxdim )) {
/* alloc failed so call non-GPU-resident version */
magma_int_t rval = magma_sgetrf3_ooc(num_gpus, m, n, a, lda, ipiv, info);
if( *info >= 0 ) magma_sgetrf2_piv( num_gpus, m, n, a, lda, ipiv, info);
return *info;
//magma_int_t rval = magma_sgetrf_ooc(m, n, a, lda, ipiv, info);
//if( *info == 0 ) magma_sgetrf_piv( m, n, a, lda, ipiv, info);
//return *info;
}
da = dA + nb*maxm;
ldda = maxdim;
magma_ssetmatrix( m, n, a, lda, da, ldda );
dAT = da;
magmablas_sinplace_transpose( dAT, ldda, ldda );
}
else
{
if (MAGMA_SUCCESS != magma_smalloc( &dA, (nb + maxn)*maxm )) {
/* alloc failed so call non-GPU-resident version */
magma_int_t rval = magma_sgetrf3_ooc(num_gpus, m, n, a, lda, ipiv, info);
if( *info >= 0 ) magma_sgetrf2_piv( num_gpus, m, n, a, lda, ipiv, info);
return *info;
//magma_int_t rval = magma_sgetrf_ooc(m, n, a, lda, ipiv, info);
//if( *info == 0 )magma_sgetrf_piv( m, n, a, lda, ipiv, info);
//return *info;
}
da = dA + nb*maxm;
magma_ssetmatrix( m, n, a, lda, da, maxm );