- Code: Select all
#//////////////////////////////////////////////////////////////////////////////
# -- MAGMA (version 1.0) --
# Univ. of Tennessee, Knoxville
# Univ. of California, Berkeley
# Univ. of Colorado, Denver
# November 2010
#//////////////////////////////////////////////////////////////////////////////
#
# GPU_TARGET specifies for which GPU you want to compile MAGMA
# 0: Tesla family
# 1: Fermi Family
#
GPU_TARGET = 1

# ---- Toolchain --------------------------------------------------------------
# C compiler, CUDA compiler and Fortran compiler.
CC   = gcc
NVCC = nvcc
FORT = gfortran

# ---- Static-library tools ---------------------------------------------------
# Archiver, its flags, and the archive indexer.
ARCH      = ar
ARCHFLAGS = cr
RANLIB    = ranlib

# ---- CUDA installation ------------------------------------------------------
# Root of the CUDA toolkit and the include path derived from it.
CUDADIR = /usr/local/cuda
INC     = -I$(CUDADIR)/include

# ---- Compile / link flags ---------------------------------------------------
# -fPIC appears in all three sets of options; NVOPTS forwards the host-compiler
# flags through --compiler-options / -Xcompiler.
OPTS   = -O3 -DADD_ -fPIC
NVOPTS = --compiler-options -fno-strict-aliasing -DUNIX -O3 -DADD_ -Xcompiler "-fPIC -D_GNU_SOURCE -pthread -fexceptions -m64"
LDOPTS = -fPIC -z muldefs

# ---- Libraries --------------------------------------------------------------
# Library search paths (MKL, then CUDA) and the libraries to link against.
LIBDIR = -L/opt/intel/Compiler/11.0/074/mkl/lib/em64t/ -L$(CUDADIR)/lib64
LIB    = -lmkl_gf_lp64 -lmkl_intel_thread -lmkl_core -lguide -lpthread -lcublas -lm

# ---- MAGMA archives ---------------------------------------------------------
LIBMAGMA     = ../lib/libmagma.a
LIBMAGMABLAS = ../lib/libmagmablas.a
As you can see, this uses the Intel MKL.
./testing_sgemm gives:
- Code: Select all
Usage:
testing_sgemm [-NN|NT|TN|TT] [-N 1024]
device 0: GeForce GTX 480, 1401.0 MHz clock, 1535.7 MB memory
device 1: GeForce 8400 GS, 1400.0 MHz clock, 511.7 MB memory
Testing TRANSA = N TRANSB = N
N MAGMA GFLop/s CUBLAS GFlop/s error
========================================================
1024 675.10 635.91 0.000000e+00
2048 774.11 765.39 0.000000e+00
3072 837.44 831.38 0.000000e+00
4096 831.06 802.14 0.000000e+00
5120 827.31 822.87 0.000000e+00
6144 847.68 840.57 0.000000e+00
7168 843.59 820.54 0.000000e+00
8192 837.13 833.47 0.000000e+00
The various -fPIC flags and the -Xcompiler "-fPIC -D_GNU_SOURCE -pthread -fexceptions -m64" option are there so that I could compile a small sgesv test code as a MATLAB MEX file. The code is:
- Code: Select all
#include "mex.h"
#include "cuda.h"
#include "cublas.h"
#include "magma.h"
#include "sys/time.h"
/* Free the device buffers (a null pointer is skipped) and shut CUBLAS down.
 * Used on every exit path so that an error does not leak GPU memory. */
static void gpu_sgesv_cleanup(float *ga, float *gb)
{
    if (ga) cublasFree(ga);
    if (gb) cublasFree(gb);
    cublasShutdown();
}

/* Release the GPU state, then raise a MATLAB error with the given message.
 * mexErrMsgTxt does not return to the caller. */
static void gpu_sgesv_fail(float *ga, float *gb, const char *msg)
{
    gpu_sgesv_cleanup(ga, gb);
    mexErrMsgTxt(msg);
}

/*
 * MEX gateway:  X = gpu_sgesv_magma(A, B)
 *
 * Solves A*X = B in single precision by calling magma_sgesv_gpu.
 *
 *   prhs[0]  A, real single, L x L
 *   prhs[1]  B, real single, L x I
 *   plhs[0]  X, real single, L x I
 *
 * A MATLAB error is raised when the argument count, class or shapes are
 * wrong, when a CUBLAS/CUDA call reports failure, or when magma_sgesv_gpu
 * returns a non-zero info.
 */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[])
{
    /* floor(sqrt(INT_MAX)): keeps the int element counts given to
       cublasAlloc (Lc*Lc and Lc*I) from overflowing. */
    const int max_dim = 46340;

    size_t rowsA, colsA, rowsB, colsB;
    int L, I;            /* A is L x L, B is L x I                         */
    int Lc;              /* leading dimension of the device copies         */
    mwSize dims0[2];     /* must be mwSize: mxCreateNumericArray takes a   */
                         /* const mwSize*, which is not int on every build */
    float *A, *B;        /* host inputs (owned by MATLAB)                  */
    float *X;            /* host output, X = A\B                           */
    float *ga = 0;       /* device copy of A                               */
    float *gb = 0;       /* device copy of B, read back as the solution    */
    int *ipiv;           /* host work array handed to magma_sgesv_gpu      */
    int info = 0;

    if (nrhs != 2) {
        mexErrMsgTxt("gpu_sgesv_magma requires 2 input arguments");
    } else if (nlhs > 1) {
        /* nlhs == 0 is a legal call (result goes to "ans"). */
        mexErrMsgTxt("gpu_sgesv_magma returns only 1 output argument");
    }
    if ( !mxIsSingle(prhs[0]) || !mxIsSingle(prhs[1]) ) {
        mexErrMsgTxt("Input arrays must be single precision.");
    }
    if ( mxIsComplex(prhs[0]) || mxIsComplex(prhs[1]) ) {
        mexErrMsgTxt("Input arrays must be real.");
    }

    /* Dimensions, validated before anything is sent to the GPU. */
    rowsA = mxGetM(prhs[0]);
    colsA = mxGetN(prhs[0]);
    rowsB = mxGetM(prhs[1]);
    colsB = mxGetN(prhs[1]);
    if (rowsA != colsA) {
        mexErrMsgTxt("A must be a square matrix.");
    }
    if (rowsB != rowsA) {
        mexErrMsgTxt("B must have the same number of rows as A.");
    }
    if (colsA > (size_t)max_dim || colsB > (size_t)max_dim) {
        mexErrMsgTxt("Input arrays are too large for this routine.");
    }
    L = (int)colsA;
    I = (int)colsB;
    printf("L = %i\n",L);
    printf("I = %i\n",I);

    A = (float*) mxGetData(prhs[0]);
    B = (float*) mxGetData(prhs[1]);

    /* Output array (the solution). */
    dims0[0] = (mwSize)L;
    dims0[1] = (mwSize)I;
    plhs[0] = mxCreateNumericArray(2, dims0, mxSINGLE_CLASS, mxREAL);
    X = (float*) mxGetData(plhs[0]);

    /* Nothing to solve for empty inputs; return the empty result. */
    if (L == 0 || I == 0) {
        return;
    }

    /* Padding the leading dimension to a multiple of 96 speeds up sgemm
       significantly; it is left disabled here, as in the original:
           Lc = L + (96 - L%96);
       The allocations and copies below use L for the data extent and Lc
       for the device leading dimension, so they stay correct if the
       padding is switched on. */
    Lc = L;

    /* Allocated before any GPU resource so that an allocation failure
       (which aborts the MEX call) cannot leak device memory. */
    ipiv = (int *) mxCalloc (L, sizeof (int));

    if (cublasInit() != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("cublasInit failed.");
    }

    /* Device copy of A: Lc x Lc storage, zeroed, with the L x L data copied in. */
    if (cublasAlloc (Lc*Lc, (int)sizeof(float), (void**)&ga) != CUBLAS_STATUS_SUCCESS) {
        ga = 0;
        gpu_sgesv_fail(ga, gb, "cublasAlloc failed for A.");
    }
    if (cudaMemset(ga, 0, (size_t)Lc * (size_t)Lc * sizeof(float)) != cudaSuccess) {
        gpu_sgesv_fail(ga, gb, "cudaMemset failed for A.");
    }
    if (cublasSetMatrix (L, L, (int)sizeof(float), A, L, ga, Lc) != CUBLAS_STATUS_SUCCESS) {
        gpu_sgesv_fail(ga, gb, "cublasSetMatrix failed for A.");
    }

    /* Device copy of B: Lc x I storage, zeroed, with the L x I data copied in. */
    if (cublasAlloc (Lc*I, (int)sizeof(float), (void**)&gb) != CUBLAS_STATUS_SUCCESS) {
        gb = 0;
        gpu_sgesv_fail(ga, gb, "cublasAlloc failed for B.");
    }
    if (cudaMemset(gb, 0, (size_t)Lc * (size_t)I * sizeof(float)) != cudaSuccess) {
        gpu_sgesv_fail(ga, gb, "cudaMemset failed for B.");
    }
    if (cublasSetMatrix (L, I, (int)sizeof(float), B, L, (void*)gb, Lc) != CUBLAS_STATUS_SUCCESS) {
        gpu_sgesv_fail(ga, gb, "cublasSetMatrix failed for B.");
    }
    printf("Set A,B\n");
    printf("%i, %i\n",L,Lc);

    /* Ready to go...
       The first two arguments (L, I) describe only the non-padded part of
       the arrays; Lc is the leading dimension of both device arrays. */
    printf("Ready for sgesv...\n");
    magma_sgesv_gpu( L, I, ga, Lc, ipiv, gb, Lc, &info);
    printf("Done with sgesvs.\n");
    if (info != 0) {
        printf("magma_sgesv_gpu returned info = %i\n", info);
        mxFree(ipiv);
        gpu_sgesv_fail(ga, gb, "magma_sgesv_gpu failed (non-zero info).");
    }

    /* Get the solution off the GPU. */
    if (cublasGetMatrix (L, I, (int)sizeof(float), gb, Lc, X, L) != CUBLAS_STATUS_SUCCESS) {
        mxFree(ipiv);
        gpu_sgesv_fail(ga, gb, "cublasGetMatrix failed for X.");
    }

    /* Debugging output. The indices below touch three different columns
       of X, so they are only valid when X has at least three columns. */
    if (I >= 3) {
        /* First three elements of the first row. */
        printf("X-top = %e %e %e\n",X[0],X[L],X[L+L]);
        /* Last three elements of the last row. */
        printf("X-bottom = %e %e %e\n",X[L*(I-2)-1],X[L*(I-1)-1],X[L*I-1]);
    }

    /* Release everything to avoid a GPU memory leak (and GPU crash!). */
    mxFree(ipiv);
    gpu_sgesv_cleanup(ga, gb);
}
The idea is that, with this MEX file compiled, a simple call "[X] = gpu_sgesv_magma(A,B);" in MATLAB will give the solution X = A\B calculated on the GPU.
While this compiles fine, it unfortunately causes MATLAB to crash during the call to the magma_sgesv_gpu routine. I have not been able to get it to work, even after trying various MKL/BLAS combinations. I suspect that the MKL or BLAS calls are the problem - stepping on MATLAB's own routines or memory space - but I really don't know. It would be nice to get MAGMA going in MATLAB, so please take this as both feedback and a feature request. (The same set of procedures worked fine with the more primitive sgemm routine.)
The relevant lines in my Makefile for the matlab routine are:
- Code: Select all
# Directory holding the Intel MKL libraries.
BLASHOME = /opt/intel/Compiler/11.0/074/mkl/lib/em64t/
# Link line for the MEX file. Libraries are listed dependents-first: the MAGMA
# archives come before the MKL and CUDA libraries that resolve their symbols,
# because the linker scans static archives in command-line order.
INCLUDELIB = -L$(CUDAHOME) -L$(MAGMAHOME) -L$(BLASHOME) -lmagma -lmagmablas -lmkl_gf_lp64 -lmkl_intel_thread -lmkl_core -lguide -lcublas -lcudart -lcuda -Wl,-rpath,$(CUDAHOME)
then with
- Code: Select all
# Prepend the MKL directory. The ${VAR:+:$VAR} form appends the old value only
# when it is non-empty, so an unset LD_LIBRARY_PATH does not leave a trailing
# ':' (an empty entry, which the loader treats as the current directory).
export LD_LIBRARY_PATH="/opt/intel/Compiler/11.0/074/mkl/lib/em64t/${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
set before starting matlab.
Thanks, and thanks for the new release!