Using MAGMA without Intel MKL

Open discussion for MAGMA

Re: Using MAGMA without Intel MKL

Postby itzsid » Tue Nov 17, 2009 9:58 am

Hi,

I used the code of magma_dpotrf available in one of the other posts on this forum and made a file out of it:

Code: Select all
/*
    -- MAGMA (version 0.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       June 2009
*/

#include "cuda_runtime_api.h"
#include "cublas.h"
#include "magma.h"

#include <stdio.h>

//extern "C" int magma_get_sgeqrf_nb(int m);
        int magma_get_dpotrf_nb(int n){
                if (n <= 3328)
                        return 128;
                else if (n<=4256)
                        return 128;
                else
                        return 256;
        }

int magma_dpotrf(char *uplo, int *n, double *a, int *lda, double *work, int *info)
{
/*  -- MAGMA (version 0.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       June 2009

    Purpose   
    =======   

    DPOTRF computes the Cholesky factorization of a real symmetric   
    positive definite matrix A.   

    The factorization has the form   
       A = U**T * U,  if UPLO = 'U', or   
       A = L  * L**T,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   

    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) DOUBLE array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization A = U**T*U or A = L*L**T.   

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    WORK    (workspace) DOUBLE array on the GPU, dimension (N, N)
            (size to be reduced in upcoming versions).

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   

    =====================================================================   

       Test the input parameters.   

       Parameter adjustments */

    #define  a_ref(a_1,a_2) (a+(a_2)*a_dim1 + a_1)
    #define da_ref(a_1,a_2) (work+(a_2)*ldda + a_1)
    #define min(a,b)  (((a)<(b))?(a):(b))
    #define max(a,b)  (((a)>(b))?(a):(b))

    /* Table of constant values */
    static int c__1 = 1;
    static int c_n1 = -1;
    static double c_b13 = -1.f;
    static double c_b14 = 1.f;

    /* System generated locals */
    int a_dim1, a_offset, i__1, i__2, i__3, i__4, ldda;
    /* Local variables */
    static int j;

    long int upper = lsame_(uplo, "U");
    *info = 0;
    if (! upper && ! lsame_(uplo, "L")) {
      *info = -1;
    } else if (*n < 0) {
      *info = -2;
    } else if (*lda < max(1,*n)) {
      *info = -4;
    }
    if (*info != 0)
      return 0;

    static int jb;

    static cudaStream_t stream[2];
    cudaStreamCreate(&stream[0]);
    cudaStreamCreate(&stream[1]);
    cublasStatus status;

    a_dim1 = *lda;
    ldda   = *n;

    a_offset = 1 + a_dim1 * 1;
    a    -= a_offset;
    work -= (1 + (*n));

    int nb = magma_get_dpotrf_nb(*n);

    if (nb <= 1 || nb >= *n) {
   dpotrf_(uplo, n, a_ref(1, 1), lda, info);
    } else {

        /* Use blocked code. */
   if (upper) {
            /* Compute the Cholesky factorization A = U'*U. */
       i__1 = *n;
       i__2 = nb;
       for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
               /* Update and factorize the current diagonal block and test   
                  for non-positive-definiteness. Computing MIN */
      i__3 = nb, i__4 = *n - j + 1;
      jb = min(i__3,i__4);
      i__3 = j - 1;
      cublasSetMatrix(jb, i__4, sizeof(double),
                                a_ref(j,j), *lda, da_ref(j,j), ldda);
                cublasDsyrk('u', 't', jb, i__3, c_b13, da_ref(1,j),
                             ldda, c_b14, da_ref(j, j), ldda);
                cudaMemcpy2DAsync(  a_ref(1,j), (*lda)*sizeof(double),
               da_ref(1,j), ldda *sizeof(double),
                sizeof(double)*(j+jb-1), jb,
                cudaMemcpyDeviceToHost,stream[1]);

      if (j + jb <= *n) {
        i__3 = *n - j - jb + 1;
        i__4 = j - 1;
        cublasDgemm('T', 'N', jb, i__3, i__4,
           c_b13, da_ref(1, j), ldda, da_ref(1, j + jb), ldda,
           c_b14, da_ref(j, j + jb), ldda);
      }

      cudaStreamSynchronize(stream[1]);
      dpotrf_("Upper", &jb, a_ref(j,j), lda, info);
      if (*info != 0) {
        *info = *info + j - 1;
        break;
      }
      cudaMemcpy2DAsync(da_ref(j,j),  ldda * sizeof(double),
              a_ref( j,j), (*lda)* sizeof(double),
              sizeof(double)*jb, jb,
              cudaMemcpyHostToDevice,stream[0]);

      if (j + jb <= *n)
        cublasDtrsm('L', 'U', 'T', 'N', jb, i__3,
               c_b14, da_ref(j,j), ldda, da_ref(j, j+jb), ldda);
       }
   } else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
       i__2 = *n;
       i__1 = nb;
       for (j = 1; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {
                //  Update and factorize the current diagonal block and test   
                //  for non-positive-definiteness. Computing MIN
        i__3 = nb, i__4 = *n - j + 1;
      jb = min(i__3,i__4);
      i__3 = j - 1;
               cublasSetMatrix(i__4, jb, sizeof(double),
            a_ref(j,j), *lda, da_ref(j,j), ldda);
                cublasDsyrk('l', 'n', jb, i__3, c_b13, da_ref(j,1), ldda,
                            c_b14, da_ref(j, j), ldda);
      cudaMemcpy2DAsync( a_ref(j,1), (*lda)*sizeof(double),
               da_ref(j,1),  ldda *sizeof(double),
              sizeof(double)*jb, j+jb-1,
              cudaMemcpyDeviceToHost,stream[1]);

                if (j + jb <= *n) {
                    i__3 = *n - j - jb + 1;
                    i__4 = j - 1;
                    cublasDgemm('N', 'T', i__3, jb, i__4,
                            c_b13, da_ref(j + jb, 1), ldda, da_ref(j, 1), ldda,
                            c_b14, da_ref(j + jb, j), ldda);
                }

                cudaStreamSynchronize(stream[1]);
           dpotrf_("Lower", &jb, a_ref(j, j), lda, info);
      if (*info != 0){
                  *info = *info + j - 1;
        break;
      }
           cudaMemcpy2DAsync(da_ref(j,j),  ldda * sizeof(double),
              a_ref( j,j), (*lda)* sizeof(double),
              sizeof(double)*jb, jb,
              cudaMemcpyHostToDevice,stream[0]);

      if (j + jb <= *n)
        cublasDtrsm('R', 'L', 'T', 'N', i__3, jb, c_b14,
               da_ref(j, j), ldda, da_ref(j + jb, j), ldda);
       }
   }
    }

    return 0;

/*     End of MAGMA_DPOTRF */

} /* magma_dpotrf */

#undef a_ref
#undef da_ref
#undef min
#undef max

int dpotrf_cuda(char *uplo, int *n, double *a, int *lda, int *info) /* block version of dpotf2 cuda */
{

        cuInit( 0 );
        cublasInit( );
        double *d_A;
        cublasStatus status;
        status = cublasInit();
        if (status != CUBLAS_STATUS_SUCCESS) {
                fprintf (stderr, "!!!! CUBLAS initialization error\n");
        }
        int m = (int)*n;
        status = cublasAlloc(m*m, sizeof(double), (void**)&d_A);
        if (status != CUBLAS_STATUS_SUCCESS) {
                fprintf (stderr, "!!!! device memory allocation error (d_A)\n");
        }
//      magma_dpotrf(uplo, n, a, lda, d_A, info);
        cublasFree(d_A);

        /* Shutdown */
        status = cublasShutdown();
        if (status != CUBLAS_STATUS_SUCCESS) {
                fprintf (stderr, "!!!! shutdown error (A)\n");
        }

        return 0;

}



dpotrf_cuda is the interface to magma_dpotrf, which is like dpotrf_ lapack call.

The object file is made as follows:

Code: Select all
gcc -c -I/usr/local/cuda/include magma_dpotrf.c


Is everything right because I am not getting right values when I used dpotrf_cuda instead of dpotrf_ (lapack one) with the same arguments.
itzsid
 
Posts: 9
Joined: Sun Nov 15, 2009 2:34 am

Re: Using MAGMA without Intel MKL

Postby itzsid » Tue Nov 17, 2009 11:52 am

It worked, I forgot to remove the commented call.

Thanks a lot
itzsid
 
Posts: 9
Joined: Sun Nov 15, 2009 2:34 am

Re: Using MAGMA without Intel MKL

Postby Stan Tomov » Sat Nov 21, 2009 8:30 pm

We released MAGMA 0.2 that fixes the problem of calling it from "C".
Stan Tomov
 
Posts: 248
Joined: Fri Aug 21, 2009 10:39 pm

Previous

Return to User discussion

Who is online

Users browsing this forum: No registered users and 1 guest

cron