Just to follow up on my original question...
My usage of magma_dgeqrs is returning incorrect results. Below is a program, and its output, to illustrate the problem. In it, a symmetric positive-definite matrix A is defined and inverted using QR factorization. Specifically, magma_dgeqrf_gpu2 is used to compute the factorization, followed by calls to QR solvers in each of MAGMA and LAPACK. LAPACK returns the correct inverse; MAGMA does not (A %*% A^{-1} should be an identity matrix).
If there is a problem with my usage, I would be happy to hear about it. In terms of my code, it is not clear to me what values of lhwork1, lhwork2, and ldwork should be used. If my program is okay, then there may be a bug in the MAGMA routines.
Code: Select all
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cublas.h>
#include <magma.h>
/***************************************************************************
* Program to compute the inverse of a symmetric positive-definite matrix A
* using QR factorization.
* - magma_dgeqrs_gpu fails to compute the inverse correctly
* - LAPACK routines do it correctly
***************************************************************************/
void mprint(int m, int n, double *x, int ldx) {
int i, j;
for(i = 0; i < m; i++) {
for(j = 0; j < n; j++) {
printf("%f ", x[i + j * ldx]);
}
printf("\n");
}
}
int main(int argc, char** argv)
{
int m = 3, n = 3, nrhs = 3, lda = m, ldb = m,
nb = magma_get_dgeqrf_nb(m),
lhwork1 = (m - n + nb + 2 * nrhs) * nb,
lhwork2 = lhwork1,
ldwork = lhwork1,
info, i, j;
double A[9] = {1, 0.4, 0.2, 0.4, 1.0, 0.3, 0.2, 0.3, 1},
B[9] = {1, 0, 0, 0, 1, 0, 0, 0, 1},
Ai[9], AAi[9], QR[9], tau[3], *d_A, *d_B, *h_work1, *h_work2, *d_work,
alpha, beta;
printf("Matrix A to be inverted:\n");
mprint(m, n, A, lda);
cublasInit();
cublasAlloc(m * n, sizeof(double), (void**)&d_A);
cublasAlloc(m * nrhs, sizeof(double), (void**)&d_B);
cublasAlloc(ldwork, sizeof(double), (void**)&d_work);
cudaMallocHost((void**)&h_work1, lhwork1 * sizeof(double));
cudaMallocHost((void**)&h_work2, lhwork2 * sizeof(double));
if(d_A == NULL || d_B == NULL || d_work == NULL|| h_work2 == NULL
|| h_work1 == NULL ) {
fprintf(stderr, "Error: CUDA memory allocation failed\n");
exit(1);
}
/***************************************************************************
* MAGMA QR factorization
* - Correctly returns factorization matrix (d_A/QR) and scalars (tau)
***************************************************************************/
printf("\nMAGMA Results\n=============\n");
cublasSetVector(m * n, sizeof(double), A, 1, d_A, 1);
magma_dgeqrf_gpu2(&m, &n, d_A, &lda, tau, h_work1, &lhwork1, d_work, &info);
printf("\nmagma_dgeqrf_gpu2 info = %d\n", info);
// Save d_A factorization to QR for later use in LAPACK routines
cublasGetVector(m * n, sizeof(double), d_A, 1, QR, 1);
/***************************************************************************
* MAGMA QR solver
* - Returns incorrect solution
***************************************************************************/
cublasSetVector(m * nrhs, sizeof(double), B, 1, d_B, 1);
magma_dgeqrs_gpu(&m, &n, &nrhs, d_A, &lda, tau, d_B, &ldb, h_work2, &lhwork2,
d_work, &info);
cublasGetMatrix(n, nrhs, sizeof(double), d_B, ldb, Ai, lda);
printf("magma_dgeqrs_gpu info = %d\n", info);
printf("\nA^{-1} (magma_dgeqrs_gpu):\n");
mprint(n, nrhs, Ai, lda);
alpha = 1.0; beta = 0.0;
dgemm_("N", "N", &m, &n, &nrhs, &alpha, A, &lda, Ai, &lda, &beta, AAi, &lda);
printf("\nA %%*%% A^{-1}:\n");
mprint(m, nrhs, AAi, lda);
/***************************************************************************
* LAPACK QR solver
* - Returns correct solution
***************************************************************************/
printf("\nLAPACK Results\n==============\n");
alpha = 1.0;
dormqr_("L", "T", &m, &nrhs, &n, QR, &lda, tau, B, &ldb, h_work2, &lhwork2, &info);
dtrsm_("L", "U", "N", "N", &m, &nrhs, &alpha, QR, &lda, B, &ldb);
printf("\ndrmqr info = %d\n", info);
printf("\nA^{-1} (dormqr/dtrsm):\n");
mprint(n, nrhs, B, ldb);
alpha = 1.0; beta = 0.0;
dgemm_("N", "N", &m, &n, &nrhs, &alpha, A, &lda, B, &ldb, &beta, AAi, &lda);
printf("\nA %%*%% A^{-1}:\n");
mprint(m, nrhs, AAi, lda);
cublasFree(d_A);
cublasFree(d_B);
cublasFree(d_work);
cudaFreeHost(h_work1);
cudaFreeHost(h_work2);
cublasShutdown();
return(EXIT_SUCCESS);
}
Program output:
Matrix A to be inverted:
1.000000 0.400000 0.200000
0.400000 1.000000 0.300000
0.200000 0.300000 1.000000
MAGMA Results
=============
magma_dgeqrf_gpu2 info = 0
magma_dgeqrs_gpu info = 0
A^{-1} (magma_dgeqrs_gpu):
1.200528 0.365148 0.182574
-0.448549 0.896112 0.196810
-0.105541 -0.252291 0.963293
A %*% A^{-1}:
1.000000 0.673135 0.453957
0.000000 0.966484 0.558827
0.000000 0.089572 1.058850
LAPACK Results
==============
drmqr info = 0
A^{-1} (dormqr/dtrsm):
1.200528 -0.448549 -0.105541
-0.448549 1.266491 -0.290237
-0.105541 -0.290237 1.108179
A %*% A^{-1}:
1.000000 0.000000 -0.000000
0.000000 1.000000 0.000000
0.000000 -0.000000 1.000000