I had the standard issue with zdotc,cdotc as described by this thread:
viewtopic.php?f=2&t=278
In this latest magma version, I also had to patch up zlatrd2.cpp and clatrd2.cpp as well for magma to compile with GotoBLAS2. The testing routines for z and c seem to fail - perhaps my patch was wrong; I don't need the complex routines so I am ignoring the issue, being happy that magma compiled.
To use matlab, I had to modify the lines in make.inc to read (the fPIC's are important):
- Code: Select all
OPTS = -O3 -DADD_ -fPIC
...
NVOPTS = --compiler-options -fno-strict-aliasing -DUNIX -O3 -DADD_ -Xcompiler "-fPIC -D_GNU_SOURCE -pthread -fexceptions -m64"
Lastly, I ran into an issue with blocking sizes and the magma_sgesv_gpu call within a mex file. It seems matlab continues to be tempermental with memory issues. I test the mex file by running through a loop of increasing matrix sizes, and the routine crashes for ca. 1200X1000 sizes for A. The nature of the matrix sizes matter - 5000x1000 crashes, 4800x1000 is o.k.... However, by fiddling with the blocking sizes for the matrices within the mex file, the script will then run through o.k. Below is the mex file; the odd trick to get it to work is the line " Lc = ((L+31)/32)*32 +100; " (a dimension padded with zeros to be modulo 32, but then 100 added arbitrarily...trial and error...) This code worked o.k. in magma 1.1; I am not certain how to fix the issue. There is a hint that the issue has to do with sgetrs...
- Code: Select all
#include "mex.h"
#include "cuda.h"
#include "cublas.h"
#include "magma.h"
//#include <omp.h>
#include "sys/time.h"
void cleanup(void) {
mexPrintf("MEX-file is terminating, exiting CUDA Thread\n");
cudaThreadExit();
mexPrintf("Exited CUDA Thread.\n");
}
void magma_sgesv_gpu_( int n, int nrhs, float *dA, int ldda, int *ipiv, float *dB, int lddb, int info);
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
int I,L;
int Ic,Lc;
int dims0[2];
// INPUT VARIABLES %%%%%%%%%%%%%%%%%%%%%%%%%
// A is dimensioned LXL
// B is dimensioned LXI
float *A,*B;
// OUTPUT VARIABLE, X=A\B %%%%%%%%%%%%%%%%%%
float *X;
// CUDA/GPU VARIABLES %%%%%%%%%%%%%%%%%%%%%%%%
float *ga, *gb;
int *ipiv;
int info;
if (nrhs != 2) {
mexErrMsgTxt("gpu_sgesv_magma requires 2 input arguments");
} else if (nlhs != 1) {
mexErrMsgTxt("gpu_sgesv_magma requires 1 output argument");
}
if ( !mxIsSingle(prhs[0]) || !mxIsSingle(prhs[1]) ) {
mexErrMsgTxt("Input arrays must be single precision.");
}
// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// Single-precision input arrays */
// Dimensions, and then array data
L = mxGetN(prhs[0]);
I = mxGetN(prhs[1]);
A = (float*) mxGetData(prhs[0]);
B = (float*) mxGetData(prhs[1]);
// Left hand side matrix set up (the solution)
dims0[0]=L;
dims0[1]=I;
plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL);
X = (float*) mxGetData(plhs[0]);
// from magma testing sgesv_gpu...
// Lc = ((L+15)/16)*16;
Lc = ((L+31)/32)*32 +100;
// Lc = ((L+255)/256)*256;
// Lc = L;
// Lc = L + (32-L%32);
// Ic = ((I+31)/32)*32;
Ic = I;
// Ic = I + (32-I%32);
// Lc=L;
// Ic=I;
// omp_set_num_threads(4);
cublasInit();
cudaMalloc ((void**)&ga,Lc*Lc*sizeof(float));
cudaMemset(ga,0,Lc*Lc*4); /* zero these since we've padded them */
cublasSetMatrix (L, L, sizeof(float), A, L, (void*)ga, Lc);
cudaMalloc ((void**)&gb,Lc*Ic*sizeof(float));
cudaMemset(gb,0,Lc*Ic*4);
cublasSetMatrix (L, I, sizeof(float), B, L, (void*)gb, Lc);
// Allocate for ipiv - a working matrix used by sgesv, and ignored here.
ipiv = (int *) mxCalloc (L, sizeof(int));
// Ready to go...
// First numbers L, I pertain only to the non-padded sections of the arrays.
// mexPrintf("Pre-Call.\n");
magma_sgesv_gpu( L, I, ga, Lc, ipiv, gb, Lc, &info);
// magma_sgetrf_gpu(L,L,ga,Lc,ipiv, &info);
// mexPrintf("Mid-Call.\n");
// magma_sgetrs_gpu('N',L,I,ga,Lc,ipiv,gb,Lc, &info);
if (info != 0)
printf("magma_sgesv returned error %d.\n", info);
// mexPrintf("Post-Call.\n");
// if (info < 0)
// printf("Argument %d of magma_sgesv had an illegal value.\n", -info);
// Get the solution off the GPU
cublasGetMatrix (L, I, sizeof(float), gb, Lc, X, L);
// X has the solution we need; now back to matlab after a bit of clean up.
// Print the first three elements of the first row (debugging)
// printf("X-top = %e %e %e\n",X[0],X[L],X[L+L]);
// Print the last three elements of the last row (debugging)
// printf("X-bottom = %e %e %e\n",X[L*(I-2)-1],X[L*(I-1)-1],X[L*I-1]);
// Clear the variables to avoid GPU memory leak (and GPU crash!)
mxFree(ipiv);
cudaFree (ga);
cudaFree (gb);
cublasShutdown();
mexAtExit(cleanup);
}
I'm beginning to think it is somewhat masochistic to attempt to use matlab-magma-cuda-mex files, etc... Nice when it works, but what a lot of work to get it to work properly! (Please don't remind me of the commercial packages that I know about...)
