## Using MAGMA without Intel MKL

Open discussion for MAGMA library (Matrix Algebra on GPU and Multicore Architectures)
itzsid
Posts: 9
Joined: Sun Nov 15, 2009 2:34 am

### Re: Using MAGMA without Intel MKL

Hi,

I used the code of magma_dpotrf available in one of the other posts on this forum and made a file out of it:

Code: Select all

``````/*
-- MAGMA (version 0.1) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
June 2009
*/

#include "cuda_runtime_api.h"
#include "cublas.h"
#include "magma.h"

#include <stdio.h>

//extern "C" int magma_get_sgeqrf_nb(int m);
int magma_get_dpotrf_nb(int n){
if (n <= 3328)
return 128;
else if (n<=4256)
return 128;
else
return 256;
}

int magma_dpotrf(char *uplo, int *n, double *a, int *lda, double *work, int *info)
{
/*  -- MAGMA (version 0.1) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
June 2009

Purpose
=======

DPOTRF computes the Cholesky factorization of a real symmetric
positive definite matrix A.

The factorization has the form
A = U**T * U,  if UPLO = 'U', or
A = L  * L**T,  if UPLO = 'L',
where U is an upper triangular matrix and L is lower triangular.

This is the block version of the algorithm, calling Level 3 BLAS.

Arguments
=========

UPLO    (input) CHARACTER*1
= 'U':  Upper triangle of A is stored;
= 'L':  Lower triangle of A is stored.

N       (input) INTEGER
The order of the matrix A.  N >= 0.

A       (input/output) DOUBLE array, dimension (LDA,N)
On entry, the symmetric matrix A.  If UPLO = 'U', the leading
N-by-N upper triangular part of A contains the upper
triangular part of the matrix A, and the strictly lower
triangular part of A is not referenced.  If UPLO = 'L', the
leading N-by-N lower triangular part of A contains the lower
triangular part of the matrix A, and the strictly upper
triangular part of A is not referenced.

On exit, if INFO = 0, the factor U or L from the Cholesky
factorization A = U**T*U or A = L*L**T.

Higher performance is achieved if A is in pinned memory, e.g.
allocated using cudaMallocHost.

LDA     (input) INTEGER
The leading dimension of the array A.  LDA >= max(1,N).

WORK    (workspace) DOUBLE array on the GPU, dimension (N, N)
(size to be reduced in upcoming versions).

INFO    (output) INTEGER
= 0:  successful exit
< 0:  if INFO = -i, the i-th argument had an illegal value
> 0:  if INFO = i, the leading minor of order i is not
positive definite, and the factorization could not be
completed.

=====================================================================

Test the input parameters.

#define  a_ref(a_1,a_2) (a+(a_2)*a_dim1 + a_1)
#define da_ref(a_1,a_2) (work+(a_2)*ldda + a_1)
#define min(a,b)  (((a)<(b))?(a):(b))
#define max(a,b)  (((a)>(b))?(a):(b))

/* Table of constant values */
static int c__1 = 1;
static int c_n1 = -1;
static double c_b13 = -1.f;
static double c_b14 = 1.f;

/* System generated locals */
int a_dim1, a_offset, i__1, i__2, i__3, i__4, ldda;
/* Local variables */
static int j;

long int upper = lsame_(uplo, "U");
*info = 0;
if (! upper && ! lsame_(uplo, "L")) {
*info = -1;
} else if (*n < 0) {
*info = -2;
} else if (*lda < max(1,*n)) {
*info = -4;
}
if (*info != 0)
return 0;

static int jb;

static cudaStream_t stream[2];
cudaStreamCreate(&stream[0]);
cudaStreamCreate(&stream[1]);
cublasStatus status;

a_dim1 = *lda;
ldda   = *n;

a_offset = 1 + a_dim1 * 1;
a    -= a_offset;
work -= (1 + (*n));

int nb = magma_get_dpotrf_nb(*n);

if (nb <= 1 || nb >= *n) {
dpotrf_(uplo, n, a_ref(1, 1), lda, info);
} else {

/* Use blocked code. */
if (upper) {
/* Compute the Cholesky factorization A = U'*U. */
i__1 = *n;
i__2 = nb;
for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
/* Update and factorize the current diagonal block and test
for non-positive-definiteness. Computing MIN */
i__3 = nb, i__4 = *n - j + 1;
jb = min(i__3,i__4);
i__3 = j - 1;
cublasSetMatrix(jb, i__4, sizeof(double),
a_ref(j,j), *lda, da_ref(j,j), ldda);
cublasDsyrk('u', 't', jb, i__3, c_b13, da_ref(1,j),
ldda, c_b14, da_ref(j, j), ldda);
cudaMemcpy2DAsync(  a_ref(1,j), (*lda)*sizeof(double),
da_ref(1,j), ldda *sizeof(double),
sizeof(double)*(j+jb-1), jb,
cudaMemcpyDeviceToHost,stream[1]);

if (j + jb <= *n) {
i__3 = *n - j - jb + 1;
i__4 = j - 1;
cublasDgemm('T', 'N', jb, i__3, i__4,
c_b13, da_ref(1, j), ldda, da_ref(1, j + jb), ldda,
c_b14, da_ref(j, j + jb), ldda);
}

cudaStreamSynchronize(stream[1]);
dpotrf_("Upper", &jb, a_ref(j,j), lda, info);
if (*info != 0) {
*info = *info + j - 1;
break;
}
cudaMemcpy2DAsync(da_ref(j,j),  ldda * sizeof(double),
a_ref( j,j), (*lda)* sizeof(double),
sizeof(double)*jb, jb,
cudaMemcpyHostToDevice,stream[0]);

if (j + jb <= *n)
cublasDtrsm('L', 'U', 'T', 'N', jb, i__3,
c_b14, da_ref(j,j), ldda, da_ref(j, j+jb), ldda);
}
} else {
//=========================================================
// Compute the Cholesky factorization A = L*L'.
i__2 = *n;
i__1 = nb;
for (j = 1; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
//  Update and factorize the current diagonal block and test
//  for non-positive-definiteness. Computing MIN
i__3 = nb, i__4 = *n - j + 1;
jb = min(i__3,i__4);
i__3 = j - 1;
cublasSetMatrix(i__4, jb, sizeof(double),
a_ref(j,j), *lda, da_ref(j,j), ldda);
cublasDsyrk('l', 'n', jb, i__3, c_b13, da_ref(j,1), ldda,
c_b14, da_ref(j, j), ldda);
cudaMemcpy2DAsync( a_ref(j,1), (*lda)*sizeof(double),
da_ref(j,1),  ldda *sizeof(double),
sizeof(double)*jb, j+jb-1,
cudaMemcpyDeviceToHost,stream[1]);

if (j + jb <= *n) {
i__3 = *n - j - jb + 1;
i__4 = j - 1;
cublasDgemm('N', 'T', i__3, jb, i__4,
c_b13, da_ref(j + jb, 1), ldda, da_ref(j, 1), ldda,
c_b14, da_ref(j + jb, j), ldda);
}

cudaStreamSynchronize(stream[1]);
dpotrf_("Lower", &jb, a_ref(j, j), lda, info);
if (*info != 0){
*info = *info + j - 1;
break;
}
cudaMemcpy2DAsync(da_ref(j,j),  ldda * sizeof(double),
a_ref( j,j), (*lda)* sizeof(double),
sizeof(double)*jb, jb,
cudaMemcpyHostToDevice,stream[0]);

if (j + jb <= *n)
cublasDtrsm('R', 'L', 'T', 'N', i__3, jb, c_b14,
da_ref(j, j), ldda, da_ref(j + jb, j), ldda);
}
}
}

return 0;

/*     End of MAGMA_DPOTRF */

} /* magma_dpotrf */

#undef a_ref
#undef da_ref
#undef min
#undef max

int dpotrf_cuda(char *uplo, int *n, double *a, int *lda, int *info) /* block version of dpotf2 cuda */
{

cuInit( 0 );
cublasInit( );
double *d_A;
cublasStatus status;
status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
}
int m = (int)*n;
status = cublasAlloc(m*m, sizeof(double), (void**)&d_A);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (d_A)\n");
}
//      magma_dpotrf(uplo, n, a, lda, d_A, info);
cublasFree(d_A);

/* Shutdown */
status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! shutdown error (A)\n");
}

return 0;

}

``````
dpotrf_cuda is the interface to magma_dpotrf, which is like dpotrf_ lapack call.

The object file is made as follows:

Code: Select all

``````gcc -c -I/usr/local/cuda/include magma_dpotrf.c
``````
Is everything right because I am not getting right values when I used dpotrf_cuda instead of dpotrf_ (lapack one) with the same arguments.

itzsid
Posts: 9
Joined: Sun Nov 15, 2009 2:34 am

### Re: Using MAGMA without Intel MKL

It worked, I forgot to remove the commented call.

Thanks a lot

Stan Tomov
Posts: 283
Joined: Fri Aug 21, 2009 10:39 pm

### Re: Using MAGMA without Intel MKL

We released MAGMA 0.2 that fixes the problem of calling it from "C".