I used the code of magma_dpotrf available in one of the other posts on this forum and made a file out of it:
- Code: Select all
/*
-- MAGMA (version 0.1) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
June 2009
*/
#include "cuda_runtime_api.h"
#include "cublas.h"
#include "magma.h"
#include <stdio.h>
//extern "C" int magma_get_sgeqrf_nb(int m);
int magma_get_dpotrf_nb(int n){
if (n <= 3328)
return 128;
else if (n<=4256)
return 128;
else
return 256;
}
int magma_dpotrf(char *uplo, int *n, double *a, int *lda, double *work, int *info)
{
/* -- MAGMA (version 0.1) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
June 2009
Purpose
=======
DPOTRF computes the Cholesky factorization of a real symmetric
positive definite matrix A.
The factorization has the form
A = U**T * U, if UPLO = 'U', or
A = L * L**T, if UPLO = 'L',
where U is an upper triangular matrix and L is lower triangular.
This is the block version of the algorithm, calling Level 3 BLAS.
Arguments
=========
UPLO (input) CHARACTER*1
= 'U': Upper triangle of A is stored;
= 'L': Lower triangle of A is stored.
N (input) INTEGER
The order of the matrix A. N >= 0.
A (input/output) DOUBLE array, dimension (LDA,N)
On entry, the symmetric matrix A. If UPLO = 'U', the leading
N-by-N upper triangular part of A contains the upper
triangular part of the matrix A, and the strictly lower
triangular part of A is not referenced. If UPLO = 'L', the
leading N-by-N lower triangular part of A contains the lower
triangular part of the matrix A, and the strictly upper
triangular part of A is not referenced.
On exit, if INFO = 0, the factor U or L from the Cholesky
factorization A = U**T*U or A = L*L**T.
Higher performance is achieved if A is in pinned memory, e.g.
allocated using cudaMallocHost.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
WORK (workspace) DOUBLE array on the GPU, dimension (N, N)
(size to be reduced in upcoming versions).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
> 0: if INFO = i, the leading minor of order i is not
positive definite, and the factorization could not be
completed.
=====================================================================
Test the input parameters.
Parameter adjustments */
#define a_ref(a_1,a_2) (a+(a_2)*a_dim1 + a_1)
#define da_ref(a_1,a_2) (work+(a_2)*ldda + a_1)
#define min(a,b) (((a)<(b))?(a):(b))
#define max(a,b) (((a)>(b))?(a):(b))
/* Table of constant values */
static int c__1 = 1;
static int c_n1 = -1;
static double c_b13 = -1.f;
static double c_b14 = 1.f;
/* System generated locals */
int a_dim1, a_offset, i__1, i__2, i__3, i__4, ldda;
/* Local variables */
static int j;
long int upper = lsame_(uplo, "U");
*info = 0;
if (! upper && ! lsame_(uplo, "L")) {
*info = -1;
} else if (*n < 0) {
*info = -2;
} else if (*lda < max(1,*n)) {
*info = -4;
}
if (*info != 0)
return 0;
static int jb;
static cudaStream_t stream[2];
cudaStreamCreate(&stream[0]);
cudaStreamCreate(&stream[1]);
cublasStatus status;
a_dim1 = *lda;
ldda = *n;
a_offset = 1 + a_dim1 * 1;
a -= a_offset;
work -= (1 + (*n));
int nb = magma_get_dpotrf_nb(*n);
if (nb <= 1 || nb >= *n) {
dpotrf_(uplo, n, a_ref(1, 1), lda, info);
} else {
/* Use blocked code. */
if (upper) {
/* Compute the Cholesky factorization A = U'*U. */
i__1 = *n;
i__2 = nb;
for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
/* Update and factorize the current diagonal block and test
for non-positive-definiteness. Computing MIN */
i__3 = nb, i__4 = *n - j + 1;
jb = min(i__3,i__4);
i__3 = j - 1;
cublasSetMatrix(jb, i__4, sizeof(double),
a_ref(j,j), *lda, da_ref(j,j), ldda);
cublasDsyrk('u', 't', jb, i__3, c_b13, da_ref(1,j),
ldda, c_b14, da_ref(j, j), ldda);
cudaMemcpy2DAsync( a_ref(1,j), (*lda)*sizeof(double),
da_ref(1,j), ldda *sizeof(double),
sizeof(double)*(j+jb-1), jb,
cudaMemcpyDeviceToHost,stream[1]);
if (j + jb <= *n) {
i__3 = *n - j - jb + 1;
i__4 = j - 1;
cublasDgemm('T', 'N', jb, i__3, i__4,
c_b13, da_ref(1, j), ldda, da_ref(1, j + jb), ldda,
c_b14, da_ref(j, j + jb), ldda);
}
cudaStreamSynchronize(stream[1]);
dpotrf_("Upper", &jb, a_ref(j,j), lda, info);
if (*info != 0) {
*info = *info + j - 1;
break;
}
cudaMemcpy2DAsync(da_ref(j,j), ldda * sizeof(double),
a_ref( j,j), (*lda)* sizeof(double),
sizeof(double)*jb, jb,
cudaMemcpyHostToDevice,stream[0]);
if (j + jb <= *n)
cublasDtrsm('L', 'U', 'T', 'N', jb, i__3,
c_b14, da_ref(j,j), ldda, da_ref(j, j+jb), ldda);
}
} else {
//=========================================================
// Compute the Cholesky factorization A = L*L'.
i__2 = *n;
i__1 = nb;
for (j = 1; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
// Update and factorize the current diagonal block and test
// for non-positive-definiteness. Computing MIN
i__3 = nb, i__4 = *n - j + 1;
jb = min(i__3,i__4);
i__3 = j - 1;
cublasSetMatrix(i__4, jb, sizeof(double),
a_ref(j,j), *lda, da_ref(j,j), ldda);
cublasDsyrk('l', 'n', jb, i__3, c_b13, da_ref(j,1), ldda,
c_b14, da_ref(j, j), ldda);
cudaMemcpy2DAsync( a_ref(j,1), (*lda)*sizeof(double),
da_ref(j,1), ldda *sizeof(double),
sizeof(double)*jb, j+jb-1,
cudaMemcpyDeviceToHost,stream[1]);
if (j + jb <= *n) {
i__3 = *n - j - jb + 1;
i__4 = j - 1;
cublasDgemm('N', 'T', i__3, jb, i__4,
c_b13, da_ref(j + jb, 1), ldda, da_ref(j, 1), ldda,
c_b14, da_ref(j + jb, j), ldda);
}
cudaStreamSynchronize(stream[1]);
dpotrf_("Lower", &jb, a_ref(j, j), lda, info);
if (*info != 0){
*info = *info + j - 1;
break;
}
cudaMemcpy2DAsync(da_ref(j,j), ldda * sizeof(double),
a_ref( j,j), (*lda)* sizeof(double),
sizeof(double)*jb, jb,
cudaMemcpyHostToDevice,stream[0]);
if (j + jb <= *n)
cublasDtrsm('R', 'L', 'T', 'N', i__3, jb, c_b14,
da_ref(j, j), ldda, da_ref(j + jb, j), ldda);
}
}
}
return 0;
/* End of MAGMA_DPOTRF */
} /* magma_dpotrf */
#undef a_ref
#undef da_ref
#undef min
#undef max
int dpotrf_cuda(char *uplo, int *n, double *a, int *lda, int *info) /* block version of dpotf2 cuda */
{
cuInit( 0 );
cublasInit( );
double *d_A;
cublasStatus status;
status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
}
int m = (int)*n;
status = cublasAlloc(m*m, sizeof(double), (void**)&d_A);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (d_A)\n");
}
// magma_dpotrf(uplo, n, a, lda, d_A, info);
cublasFree(d_A);
/* Shutdown */
status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! shutdown error (A)\n");
}
return 0;
}
dpotrf_cuda is the interface to magma_dpotrf, which is like dpotrf_ lapack call.
The object file is made as follows:
- Code: Select all
gcc -c -I/usr/local/cuda/include magma_dpotrf.c
Is everything right because I am not getting right values when I used dpotrf_cuda instead of dpotrf_ (lapack one) with the same arguments.