Below is my minimal code to test magma_dgeqp3:

- Code: Select all
`# include <stdio.h>`

# include <cuda.h>

# include "magma.h"

# include "magma_lapack.h"

# define min(a, b ) ((( a ) <( b ))?( a ):( b ))

# define max(a, b ) ((( a ) <( b ))?( b ):( a ))

int main( int argc , char ** argv )

{

magma_init(); // initialize Magma

double gpu_time = 0.0 , cpu_time = 0.0;

magma_int_t m = 100 , n = m, n2=m*n;

double *a, *r; // a, r - mxn matrices on the host

double * h_work ; // workspace

double *tau ; // scalars defining the elementary reflectors

magma_int_t * jpvt ; // pivoting information

magma_int_t i, j, info, nb;

magma_int_t min_mn = min(m, n);

magma_int_t ione = 1 , lwork ; // lwork - workspace size

magma_int_t ISEED [4] = {0 ,0 ,0 ,1}; // seed

double c_neg_one = MAGMA_D_NEG_ONE ;

nb = magma_get_dgeqp3_nb( m, n ); // optimal blocksize

jpvt =( magma_int_t *) malloc(n* sizeof( magma_int_t )); // host mem .

// for jpvt

magma_dmalloc_cpu(& tau , min_mn ); // host memory for tau

magma_dmalloc_pinned(&a,n2 ); // host memory for a

magma_dmalloc_pinned(&r,n2 ); // host memory for r

lwork = 2*n + ( n+1 )* nb;

lwork = max(lwork , m * n + n);

magma_dmalloc_cpu(& h_work , lwork ); // host memory for h_work

// Random matrix a, copy a -> r

lapackf77_dlarnv(& ione ,ISEED ,&n2 ,a);

lapackf77_dlacpy( MagmaUpperLowerStr ,&m ,&n,a ,&m,r ,&m); // a- >r

// MAGMA

lapackf77_dlacpy( MagmaUpperLowerStr ,&m ,&n,a ,&m,r ,&m);

for (j = 0; j < n; j++)

jpvt[j] = 0 ;

// QR decomposition with column pivoting , Magma version

magma_dgeqp3(m,n,r,m,jpvt,tau,h_work,lwork,&info);

printf("info = %d \n", info);

printf(" MAGMA time : %7.3f sec .\n",gpu_time ); // Magma time

// Free memory

free( jpvt ); // free host memory

free( tau ); // free host memory

magma_free_pinned(a); // free host memory

magma_free_pinned(r); // free host memory

free( h_work ); // free host memory

magma_finalize( ); // finalize Magma

return EXIT_SUCCESS ;

}

And here is the Makefile I am using:

- Code: Select all
`# Definitions of variables`

# Toolchain: nvcc drives both compilation and linking.
# NOTE(review): the flags were a bare "-O" and were never passed to any rule;
# they are now applied, with an explicit level because nvcc's -O is expected
# to take one - confirm against your nvcc version.
CC = nvcc
CCFLAGS = -O2
LD = nvcc
LDFLAGS = -O2
GENCODE_FLAGS = -arch=sm_35 -gencode arch=compute_35,code=compute_35

# Install locations; override on the command line or from the environment,
# e.g.  make MAGMA_HOME=/opt/magma
CUDA_HOME  ?= /usr/local/cuda
MAGMA_HOME ?= /usr/local/magma
MKLROOT    ?= /opt/intel/mkl

# Definitions of rules

# Link step. -lgomp is listed once, after the MKL static archives.
testing-dgeqp3.x : testing-dgeqp3.o
	@$(LD) $(LDFLAGS) $(GENCODE_FLAGS) \
	-o testing-dgeqp3.x \
	testing-dgeqp3.o \
	-L$(CUDA_HOME)/lib64 \
	-L$(MAGMA_HOME)/lib \
	-lmagma -lcudart -lcusolver -lcublas \
	$(MKLROOT)/lib/intel64/libmkl_intel_lp64.a \
	$(MKLROOT)/lib/intel64/libmkl_gnu_thread.a \
	$(MKLROOT)/lib/intel64/libmkl_core.a \
	-ldl -lpthread -lgomp

# Compile step. -DADD_ is passed through unchanged from the original rule.
testing-dgeqp3.o : testing-dgeqp3.cpp
	@$(CC) $(CCFLAGS) $(GENCODE_FLAGS) -c testing-dgeqp3.cpp \
	-I$(MKLROOT)/include \
	-I$(MAGMA_HOME)/include \
	-DADD_

# Remove build products, including the linked executable.
clean : FORCE
	rm -f testing-dgeqp3.x a.out *.o *~ core

# Empty target used as a prerequisite so that "clean" always runs.
FORCE :

Any thoughts would be greatly appreciated, thanks!