Functions
magma_int_t	magma_cgetrf (magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
	CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_cgetrf2_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaFloatComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv, magmaFloatComplex_ptr d_lAP[], magmaFloatComplex *W, magma_int_t ldw, magma_queue_t queues[][2], magma_int_t *info)
	CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_cgetrf_gpu_expert (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_int_t nb, magma_mode_t mode)
	CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_cgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	magma_cgetrf_gpu_expert with mode = MagmaHybrid. More...

magma_int_t	magma_cgetrf_native (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	magma_cgetrf_gpu_expert with mode = MagmaNative. More...

magma_int_t	magma_cgetrf_m (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
	CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_cgetrf_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_dgetrf (magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
	DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_dgetrf2_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaDouble_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv, magmaDouble_ptr d_lAP[], double *W, magma_int_t ldw, magma_queue_t queues[][2], magma_int_t *info)
	DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_dgetrf_gpu_expert (magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_int_t nb, magma_mode_t mode)
	DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_dgetrf_gpu (magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	magma_dgetrf_gpu_expert with mode = MagmaHybrid. More...

magma_int_t	magma_dgetrf_native (magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	magma_dgetrf_gpu_expert with mode = MagmaNative. More...

magma_int_t	magma_dgetrf_m (magma_int_t ngpu, magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
	DGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_dgetrf_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDouble_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_sgetrf (magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
	SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_sgetrf2_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaFloat_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv, magmaFloat_ptr d_lAP[], float *W, magma_int_t ldw, magma_queue_t queues[][2], magma_int_t *info)
	SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_sgetrf_gpu_expert (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_int_t nb, magma_mode_t mode)
	SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_sgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	magma_sgetrf_gpu_expert with mode = MagmaHybrid. More...

magma_int_t	magma_sgetrf_native (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	magma_sgetrf_gpu_expert with mode = MagmaNative. More...

magma_int_t	magma_sgetrf_m (magma_int_t ngpu, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
	SGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_sgetrf_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloat_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_xhsgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_mp_type_t enable_tc, magma_mp_type_t mp_algo_type)
	XHSGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_hgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	HGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_xshgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_mp_type_t enable_tc, magma_mp_type_t mp_algo_type)
	XSHGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_htgetrf_gpu (magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	HTGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_zgetrf (magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
	ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_zgetrf2_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaDoubleComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv, magmaDoubleComplex_ptr d_lAP[], magmaDoubleComplex *W, magma_int_t ldw, magma_queue_t queues[][2], magma_int_t *info)
	ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_zgetrf_gpu_expert (magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_int_t nb, magma_mode_t mode)
	ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_zgetrf_gpu (magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	magma_zgetrf_gpu_expert with mode = MagmaHybrid. More...

magma_int_t	magma_zgetrf_native (magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	magma_zgetrf_gpu_expert with mode = MagmaNative. More...

magma_int_t	magma_zgetrf_m (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
	ZGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

magma_int_t	magma_zgetrf_mgpu (magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
	ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. More...

Detailed Description

Function Documentation

magma_int_t magma_cgetrf	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloatComplex *	A,
		magma_int_t	lda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

It uses 2 queues to overlap communication and computation.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	A	COMPLEX array, dimension (LDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]	lda	INTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_cgetrf2_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magma_int_t	offset,
		magmaFloatComplex_ptr	d_lAT[],
		magma_int_t	lddat,
		magma_int_t *	ipiv,
		magmaFloatComplex_ptr	d_lAP[],
		magmaFloatComplex *	W,
		magma_int_t	ldw,
		magma_queue_t	queues[][2],
		magma_int_t *	info
	)

CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm. It uses two buffers to send panels.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in]	nb	INTEGER The block size used for the matrix distribution.
[in]	offset	INTEGER The first row and column indices of the submatrix that this routine will factorize.
[in,out]	d_lAT	COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lAT[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format (with the block size nb), and each local matrix is stored by row. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	lddat	INTEGER The leading dimension of the array d_lAT[d]. LDDAT >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[in]	d_lAP	COMPLEX array of pointers on the GPU, dimension (ngpu). d_lAP[d] is the workspace on d-th GPU. Each local workspace must be of size (3+ngpu)*nb*maxm, where maxm is m rounded up to a multiple of 32 and nb is the block size.
[in]	W	COMPLEX array, dimension (ngpu*nb*maxm). It is used to store the panel on the CPU.
[in]	ldw	INTEGER The leading dimension of the workspace W.
[in]	queues	magma_queue_t queues[d] points to the queues for the d-th GPU to execute in. Each GPU requires two queues.
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_cgetrf_gpu_expert	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloatComplex_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info,
		magma_int_t	nb,
		magma_mode_t	mode
	)

CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	dA	COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]	mode	magma_mode_t = MagmaNative: Factorize dA using GPU only mode. = MagmaHybrid: Factorize dA using Hybrid (CPU/GPU) mode.

magma_int_t magma_cgetrf_gpu	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloatComplex_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

magma_cgetrf_gpu_expert with mode = MagmaHybrid.

Computation is hybrid, part on CPU (panels), part on GPU (matrix updates).

See Also: magma_cgetrf_gpu_expert

magma_int_t magma_cgetrf_native	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloatComplex_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

magma_cgetrf_gpu_expert with mode = MagmaNative.

Computation is done only on the GPU, not on the CPU.

See Also: magma_cgetrf_gpu_expert

magma_int_t magma_cgetrf_m	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magmaFloatComplex *	A,
		magma_int_t	lda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	A	COMPLEX array, dimension (LDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]	lda	INTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_cgetrf_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magmaFloatComplex_ptr	d_lA[],
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	d_lA	COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format with block size nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_dgetrf	(	magma_int_t	m,
		magma_int_t	n,
		double *	A,
		magma_int_t	lda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

It uses 2 queues to overlap communication and computation.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	A	DOUBLE PRECISION array, dimension (LDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]	lda	INTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_dgetrf2_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magma_int_t	offset,
		magmaDouble_ptr	d_lAT[],
		magma_int_t	lddat,
		magma_int_t *	ipiv,
		magmaDouble_ptr	d_lAP[],
		double *	W,
		magma_int_t	ldw,
		magma_queue_t	queues[][2],
		magma_int_t *	info
	)

DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm. It uses two buffers to send panels.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in]	nb	INTEGER The block size used for the matrix distribution.
[in]	offset	INTEGER The first row and column indices of the submatrix that this routine will factorize.
[in,out]	d_lAT	DOUBLE PRECISION array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lAT[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format (with the block size nb), and each local matrix is stored by row. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	lddat	INTEGER The leading dimension of the array d_lAT[d]. LDDAT >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[in]	d_lAP	DOUBLE PRECISION array of pointers on the GPU, dimension (ngpu). d_lAP[d] is the workspace on d-th GPU. Each local workspace must be of size (3+ngpu)*nb*maxm, where maxm is m rounded up to a multiple of 32 and nb is the block size.
[in]	W	DOUBLE PRECISION array, dimension (ngpu*nb*maxm). It is used to store the panel on the CPU.
[in]	ldw	INTEGER The leading dimension of the workspace W.
[in]	queues	magma_queue_t queues[d] points to the queues for the d-th GPU to execute in. Each GPU requires two queues.
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_dgetrf_gpu_expert	(	magma_int_t	m,
		magma_int_t	n,
		magmaDouble_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info,
		magma_int_t	nb,
		magma_mode_t	mode
	)

DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	dA	DOUBLE PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]	mode	magma_mode_t = MagmaNative: Factorize dA using GPU only mode. = MagmaHybrid: Factorize dA using Hybrid (CPU/GPU) mode.

magma_int_t magma_dgetrf_gpu	(	magma_int_t	m,
		magma_int_t	n,
		magmaDouble_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

magma_dgetrf_gpu_expert with mode = MagmaHybrid.

Computation is hybrid, part on CPU (panels), part on GPU (matrix updates).

See Also: magma_dgetrf_gpu_expert

magma_int_t magma_dgetrf_native	(	magma_int_t	m,
		magma_int_t	n,
		magmaDouble_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

magma_dgetrf_gpu_expert with mode = MagmaNative.

Computation is done only on the GPU, not on the CPU.

See Also: magma_dgetrf_gpu_expert

magma_int_t magma_dgetrf_m	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		double *	A,
		magma_int_t	lda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

DGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	A	DOUBLE PRECISION array, dimension (LDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]	lda	INTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_dgetrf_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magmaDouble_ptr	d_lA[],
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	d_lA	DOUBLE PRECISION array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format with block size nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_sgetrf	(	magma_int_t	m,
		magma_int_t	n,
		float *	A,
		magma_int_t	lda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

It uses 2 queues to overlap communication and computation.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	A	REAL array, dimension (LDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]	lda	INTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a failed memory allocation; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_sgetrf2_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magma_int_t	offset,
		magmaFloat_ptr	d_lAT[],
		magma_int_t	lddat,
		magma_int_t *	ipiv,
		magmaFloat_ptr	d_lAP[],
		float *	W,
		magma_int_t	ldw,
		magma_queue_t	queues[][2],
		magma_int_t *	info
	)

SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm. It uses two buffers to send panels.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in]	nb	INTEGER The block size used for the matrix distribution.
[in]	offset	INTEGER The first row and column indices of the submatrix that this routine will factorize.
[in,out]	d_lAT	REAL array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lAT[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format (with the block size nb), and each local matrix is stored by row. On exit, the factors L and U from the factorization A = PLU; the unit diagonal elements of L are not stored.
[in]	lddat	INTEGER The leading dimension of the array d_lAT[d]. LDDAT >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[in]	d_lAP	REAL array of pointers on the GPU, dimension (ngpu). d_lAP[d] is the workspace on d-th GPU. Each local workspace must be of size (3+ngpu)*nb*maxm, where maxm is m rounded up to a multiple of 32 and nb is the block size.
[in]	W	REAL array, dimension (ngpu*nb*maxm). It is used to store the panel on the CPU.
[in]	ldw	INTEGER The leading dimension of the workspace W.
[in]	queues	magma_queue_t queues[d] points to the queues for the d-th GPU to execute in. Each GPU requires two queues.
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_sgetrf_gpu_expert	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloat_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info,
		magma_int_t	nb,
		magma_mode_t	mode
	)

SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	dA	REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]	mode	magma_mode_t = MagmaNative: Factorize dA using GPU only mode. = MagmaHybrid: Factorize dA using Hybrid (CPU/GPU) mode.

magma_int_t magma_sgetrf_gpu	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloat_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

magma_sgetrf_gpu_expert with mode = MagmaHybrid.

Computation is hybrid, part on CPU (panels), part on GPU (matrix updates).

See Also: magma_sgetrf_gpu_expert

magma_int_t magma_sgetrf_native	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloat_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

magma_sgetrf_gpu_expert with mode = MagmaNative.

Computation is done only on the GPU, not on the CPU.

See Also: magma_sgetrf_gpu_expert

magma_int_t magma_sgetrf_m	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		float *	A,
		magma_int_t	lda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

SGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	A	REAL array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]	lda	INTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_sgetrf_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magmaFloat_ptr	d_lA[],
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	d_lA	REAL array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format with block size nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_xhsgetrf_gpu	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloat_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info,
		magma_mp_type_t	enable_tc,
		magma_mp_type_t	mp_algo_type
	)

XHSGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

It uses mixed-precision FP32/FP16 factorization techniques, with or without TensorCores (see enable_tc).

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	dA	REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]	enable_tc	MAGMA_MP_TYPE_T For internal and expert API use. Enables or disables tensor cores.
[in]	mp_algo_type	MAGMA_MP_TYPE_T For internal and expert API use.

magma_int_t magma_hgetrf_gpu	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloat_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

HGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

It uses mixed precision FP32/FP16 techniques.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	dA	REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

More details can be found in Azzam Haidar, Stanimire Tomov, Jack Dongarra, and Nicholas J. Higham. 2018. Harnessing GPU tensor cores for fast FP16 arithmetic to speed up mixed-precision iterative refinement solvers. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis (SC '18). IEEE Press, Piscataway, NJ, USA, Article 47, 11 pages.

magma_int_t magma_xshgetrf_gpu	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloat_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info,
		magma_mp_type_t	enable_tc,
		magma_mp_type_t	mp_algo_type
	)

XSHGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

It uses mixed-precision FP32/FP16 factorization techniques, with or without TensorCores (see enable_tc).

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	dA	REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]	enable_tc	MAGMA_MP_TYPE_T For internal and expert API use. Enables or disables tensor cores.
[in]	mp_algo_type	MAGMA_MP_TYPE_T For internal and expert API use.

More details can be found in Azzam Haidar, Stanimire Tomov, Jack Dongarra, and Nicholas J. Higham. 2018. Harnessing GPU tensor cores for fast FP16 arithmetic to speed up mixed-precision iterative refinement solvers. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis (SC '18). IEEE Press, Piscataway, NJ, USA, Article 47, 11 pages.

magma_int_t magma_htgetrf_gpu	(	magma_int_t	m,
		magma_int_t	n,
		magmaFloat_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

HTGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

It uses mixed precision FP32/FP16-TensorCores factorization techniques.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	dA	REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_zgetrf	(	magma_int_t	m,
		magma_int_t	n,
		magmaDoubleComplex *	A,
		magma_int_t	lda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

It uses 2 queues to overlap communication and computation.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	A	COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]	lda	INTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_zgetrf2_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magma_int_t	nb,
		magma_int_t	offset,
		magmaDoubleComplex_ptr	d_lAT[],
		magma_int_t	lddat,
		magma_int_t *	ipiv,
		magmaDoubleComplex_ptr	d_lAP[],
		magmaDoubleComplex *	W,
		magma_int_t	ldw,
		magma_queue_t	queues[][2],
		magma_int_t *	info
	)

ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm. It uses two buffers to send panels.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in]	nb	INTEGER The block size used for the matrix distribution.
[in]	offset	INTEGER The first row and column indices of the submatrix that this routine will factorize.
[in,out]	d_lAT	COMPLEX_16 array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lAT[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format (with the block size nb), and each local matrix is stored by row. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	lddat	INTEGER The leading dimension of the array d_lAT[d]. LDDAT >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[in]	d_lAP	COMPLEX_16 array of pointers on the GPU, dimension (ngpu). d_lAP[d] is the workspace on d-th GPU. Each local workspace must be of size (3+ngpu)*nb*maxm, where maxm is m rounded up to a multiple of 32 and nb is the block size.
[in]	W	COMPLEX_16 array, dimension (ngpu*nb*maxm). It is used to store the panel on the CPU.
[in]	ldw	INTEGER The leading dimension of the workspace W.
[in]	queues	magma_queue_t queues[d] points to the queues for the d-th GPU to execute in. Each GPU requires two queues.
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_zgetrf_gpu_expert	(	magma_int_t	m,
		magma_int_t	n,
		magmaDoubleComplex_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info,
		magma_int_t	nb,
		magma_mode_t	mode
	)

ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	dA	COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array dA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
[in]	mode	magma_mode_t = MagmaNative: Factorize dA using GPU only mode. = MagmaHybrid: Factorize dA using Hybrid (CPU/GPU) mode.

magma_int_t magma_zgetrf_gpu	(	magma_int_t	m,
		magma_int_t	n,
		magmaDoubleComplex_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

magma_zgetrf_gpu_expert with mode = MagmaHybrid.

Computation is hybrid, part on CPU (panels), part on GPU (matrix updates).

See Also: magma_zgetrf_gpu_expert

magma_int_t magma_zgetrf_native	(	magma_int_t	m,
		magma_int_t	n,
		magmaDoubleComplex_ptr	dA,
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

magma_zgetrf_gpu_expert with mode = MagmaNative.

Computation is done only on the GPU, not on the CPU.

See Also: magma_zgetrf_gpu_expert

magma_int_t magma_zgetrf_m	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magmaDoubleComplex *	A,
		magma_int_t	lda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

ZGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	A	COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
[in]	lda	INTEGER The leading dimension of the array A. LDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

magma_int_t magma_zgetrf_mgpu	(	magma_int_t	ngpu,
		magma_int_t	m,
		magma_int_t	n,
		magmaDoubleComplex_ptr	d_lA[],
		magma_int_t	ldda,
		magma_int_t *	ipiv,
		magma_int_t *	info
	)

ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.

The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n).

This is the right-looking Level 3 BLAS version of the algorithm.

Parameters

[in]	ngpu	INTEGER Number of GPUs to use. ngpu > 0.
[in]	m	INTEGER The number of rows of the matrix A. M >= 0.
[in]	n	INTEGER The number of columns of the matrix A. N >= 0.
[in,out]	d_lA	COMPLEX_16 array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses a 1D block column cyclic format with block size nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored.
[in]	ldda	INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M).
[out]	ipiv	INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
[out]	info	INTEGER = 0: successful exit; < 0: if INFO = -i, the i-th argument had an illegal value, or another error occurred, such as a memory allocation failure; > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.

Functions

Detailed Description

Function Documentation