I am a graduate student doing my independent study which happened to be in a subject i am not that good at ( linear algebra). So, what we do in our group is test several of the magma library kernels for research projects in formal verification. In order to do so, we need to find the correct arguments values for each kernel we formally verify. It takes too long to figure that out (especially for me). Can any one help me figure out correct arguments values that I can plug in "magmablas_chemv_200_L_special" and possible other kernels in the same file "magma-1.2.0/magmablas/chemv_fermi.cu" ?
I spent more than 3 days and haven't yet figured out the correct parameters.
I am pasting the code I am trying to make it work to start with. When I paste the code, you will know that I am trying to "completely" isolate each kernel to instrument the code later on after making it work first on nvcc ...etc. This is why you will see me commenting all includes and stripping all needed code from them to a single file.
- Code: Select all
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
/* #include <cuda_runtime_api.h> */
/* #include <cublas.h> */
/* #include <cblas.h> */
// includes, project
/* #include "flops.h" */
/* #include "magma.h" */
/* #include "magma_lapack.h" */
// #include "testings.h"
// #include "klee.h"
// #include "cutil.h"
//===========================================================================
// Defines
//===========================================================================
typedef float2 cuFloatComplex;
typedef int magma_int_t;
//---------------------------
// make_cuFloatComplex
//---------------------------
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
(float r, float i)
{
cuFloatComplex res;
res.x = r;
res.y = i;
return res;
}
/*************************************************************
* cuFloatComplex
*/
// __host__ __device__ static __inline__ cuFloatComplex
// operator-(const cuFloatComplex &a)
// {
// return make_cuFloatComplex(-a.x, -a.y);
// }
__host__ __device__ static __inline__ cuFloatComplex
operator+(const cuFloatComplex a, const cuFloatComplex b)
{
return make_cuFloatComplex(a.x + b.x, a.y + b.y);
}
__host__ __device__ static __inline__ void
operator+=(cuFloatComplex &a, const cuFloatComplex b)
{
a.x += b.x; a.y += b.y;
}
// __host__ __device__ static __inline__ cuFloatComplex
// operator-(const cuFloatComplex a, const cuFloatComplex b)
// {
// return make_cuFloatComplex(a.x - b.x, a.y - b.y);
// }
// __host__ __device__ static __inline__ void
// operator-=(cuFloatComplex &a, const cuFloatComplex b)
// {
// a.x -= b.x; a.y -= b.y;
// }
__host__ __device__ static __inline__ cuFloatComplex
operator*(const cuFloatComplex a, const cuFloatComplex b)
{
return make_cuFloatComplex(a.x * b.x - a.y * b.y, a.y * b.x + a.x * b.y);
}
// __host__ __device__ static __inline__ cuFloatComplex
// operator*(const cuFloatComplex a, const float s)
// {
// return make_cuFloatComplex(a.x * s, a.y * s);
// }
// __host__ __device__ static __inline__ cuFloatComplex
// operator*(const float s, const cuFloatComplex a)
// {
// return make_cuFloatComplex(a.x * s, a.y * s);
// }
// __host__ __device__ static __inline__ void
// operator*=(cuFloatComplex &a, const cuFloatComplex b)
// {
// float tmp = a.y * b.x + a.x * b.y;
// a.x = a.x * b.x - a.y * b.y;
// a.y = tmp;
// }
// __host__ __device__ static __inline__ void
// operator*=(cuFloatComplex &a, const float s)
// {
// a.x *= s; a.y *= s;
// }
//---------------------------
// Lots of cuda stuff
//---------------------------
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}
//----------------------------------------------------
// Utility custom functions to help generate the data
//----------------------------------------------------
void fillVector(cuFloatComplex * x, magma_int_t n, cuFloatComplex aValue)
{
/**
Author: ...
Fills a complex numbers vector with 'aValue'.
This function is modifiable so that it fills random
complex numbers instead of just a single value. However,
since this function is just for generating regular tests,
that feature was left out.
*/
int i;
for (i = 0; i < n; i++)
{
// TODO fill in this part
x[i]= aValue;
}
}
void fillHermitianMatrix(cuFloatComplex * A, magma_int_t n, cuFloatComplex diagScalar)
{
/**
Author: ...
Fills a square matrix by regular-psuedo-random complex numbers making sure the hermetian matrix is symmetrix
( i.e. A* = conj(transpose(A)) and filling the diagonal with 'diagScalar'.
*/
int i,j; // i is the row, j is the column, but MAGMA is COLUMN major
cuFloatComplex temp;
for (j = 0; j < n; j++)
{
for (i = 0; i < n; i++)
{
// TODO fill in this part
/*Steps:
1 - Diagonals: j == i
2 - Remaining Elements ( symmetric): '
2.1- first - fill in A[j][i] with cuFloatComplex number
2.2- second - fill in A[j][i] with cuFloatComplex number
3 - do this till i<n and j<n
*/
// 1
if(i == j)
{
A[i*n+j] = diagScalar;
}
// 2
else
{
temp = make_cuFloatComplex((float)j,(float)i); // one number for every symmetric elems wrt diagonal
//2.1
A[i*n+j] = temp; // vertical
//2.2
A[j*n+i] = temp; // horizontal
}
}//3
}//3
}
void printMatrix(cuFloatComplex * matrix,int degree)
{ /**
Only assuming square matrices
*/
int i = 0;
int j = 0;
for(i = 0 ; i < degree; i++)
{
for(j = 0; j < degree; j++)
{
printf("(%3.3f, %3.3f i) \t", matrix[i * degree + j].x, matrix[i * degree + j].y);
}
printf("\n.");
}
}
//----------------------------
// Lots of Defines
//----------------------------
//#define make_FloatingPoint(x, y) make_cuFloatComplex(x, y);
#define MAGMA_C_ZERO make_cuFloatComplex(0.0, 0.0)
#define MAGMA_C_CNJG(a) cuConjf(a)
#define MAGMA_C_SET2REAL(v, t) {(v).x = (t); (v).y = 0.0;}
#define PRECISION_c
/*The version for tesla can be found in chemv_tesla.cu */
//#if (GPUSHMEM >= 200)
#define magmablas_chemv_200 magmablas_chemv
#define magmablas_chemv2_200 magmablas_chemv2
#define NB_64
/*
turning on NB_64, it will call routine blocksize = 64
otherwise it will can blocksize = 32 which is 10% faster in z,c precision
*/
//#ifdef NB_64// using block size 64
#define chemv_bs 64
#define thread_x 64
#define thread_y 4
#define bank_shift 33
#define quarter_thread_x 16
#define half_thread_x 32
// #else // using block size 32
// #define chemv_bs 32
// #define thread_x 32
// #define thread_y 8
// #define bank_shift 33
// #define SWITCH 1400
// #ifdef NB_64
// #endif
// #endif
// #endif
//===========================================================================
// the kernel
//===========================================================================
__global__ void
magmablas_chemv_200_L_special( magma_int_t n, cuFloatComplex alpha,
cuFloatComplex *A, magma_int_t lda,
cuFloatComplex *x, magma_int_t incx,
cuFloatComplex beta,
cuFloatComplex *y, magma_int_t incy,
cuFloatComplex *WC)
{
//............ the code of the kernel is included already in the MAGMA library, this is why I omitted it.
}
//===========================================================================
// Main Template
//===========================================================================
int main( int argc, char** argv)
{
/*************************************************************************
Purpose
=======
magmablas_chemv performs the matrix-vector operation on fermi:
y := alpha*A*x + beta*y,
where alpha and beta are scalars, x and y are n element vectors and
A is an n by n hermitian matrix.
Arguments
==========
UPLO - CHARACTER*1.
On entry, UPLO specifies whether the upper or lower
triangular part of the array A is to be referenced as
follows:
UPLO = 'U' or 'u' Only the upper triangular part of A
is to be referenced.
UPLO = 'L' or 'l' Only the lower triangular part of A
is to be referenced.
Unchanged on exit.
N - INTEGER.
On entry, N specifies the order of the matrix A.
N must be at least zero.
Unchanged on exit.
ALPHA - COMPLEX*16 .
On entry, ALPHA specifies the scalar alpha.
Unchanged on exit.
A - COMPLEX*16 array of DIMENSION ( LDA, n ).
Before entry with UPLO = 'U' or 'u', the leading n by n
upper triangular part of the array A must contain the upper
triangular part of the hermitian matrix and the strictly
lower triangular part of A is not referenced.
Before entry with UPLO = 'L' or 'l', the leading n by n
lower triangular part of the array A must contain the lower
triangular part of the hermitian matrix and the strictly
upper triangular part of A is not referenced.
Note that the imaginary parts of the diagonal elements need
not be set and are assumed to be zero.
Unchanged on exit.
LDA - INTEGER.
On entry, LDA specifies the first dimension of A as declared
in the calling (sub) program. LDA must be at least
max( 1, n ).
Unchanged on exit.
It is recommended that lda is multiple of 16. Otherwise
performance would be deteriorated as the memory accesses
would not be fully coalescent.
X - COMPLEX*16 array of dimension at least
( 1 + ( n - 1 )*abs( INCX ) ).
Before entry, the incremented array X must contain the n
element vector x.
Unchanged on exit.
INCX - INTEGER.
On entry, INCX specifies the increment for the elements of
X. INCX must not be zero.
Unchanged on exit.
BETA - COMPLEX*16 .
On entry, BETA specifies the scalar beta. When BETA is
supplied as zero then Y need not be set on input.
Unchanged on exit.
Y - COMPLEX*16 array of dimension at least
( 1 + ( n - 1 )*abs( INCY ) ).
Before entry, the incremented array Y must contain the n
element vector y. On exit, Y is overwritten by the updated
vector y.
INCY - INTEGER.
On entry, INCY specifies the increment for the elements of
Y. INCY must not be zero.
Unchanged on exit.
// */
//------------------- Constants ---------------------
int DEGREE_A = 16;
int SIZE_A = DEGREE_A * DEGREE_A;
int SIZE_XY = DEGREE_A;
long MEM_SIZE_CUFLOATCOMPLEX = sizeof(cuFloatComplex);
long MEM_SIZE_A = sizeof(cuFloatComplex) * SIZE_A;
long MEM_SIZE_XY = sizeof(cuFloatComplex)* SIZE_XY;
int numthreads = SIZE_A;
int numblocks = 1;//SIZE_A/numthreads;
if(!numblocks)
numblocks++;
//---------------------------------------------------
magma_int_t n = DEGREE_A; /* so that the matrix is 16 by 16 since 'n' is the degree
Since the recommendation in the documentation says that LDA should be
a multiple of 16 for memory coallescing.*/
// unchanged on exit
cuFloatComplex *alpha = (cuFloatComplex *) malloc(MEM_SIZE_CUFLOATCOMPLEX);
*alpha = make_cuFloatComplex(2,0); // just a scalar 2
//device
cuFloatComplex *d_alpha;
cudaMalloc(&d_alpha,MEM_SIZE_CUFLOATCOMPLEX);
cudaMemcpy(d_alpha,alpha,MEM_SIZE_CUFLOATCOMPLEX,cudaMemcpyHostToDevice);
// changed on exit, need copy back to host
cuFloatComplex *A; // A is an 'n' by 'n' hermitian matrix as a 1D array of (LDA,n) dimensions
A = (cuFloatComplex *) malloc(MEM_SIZE_A);
fillHermitianMatrix(A, DEGREE_A ,make_cuFloatComplex(1,0)); // n is degree for an n by n matrix, diagonal is 1's
//device
cuFloatComplex * d_A;
cudaMalloc(&d_A, MEM_SIZE_A); // causes memory curruption, why ?
cudaMemcpy(d_A, A, MEM_SIZE_A,cudaMemcpyHostToDevice);
//---------------------> debugging only
printMatrix(A, DEGREE_A);
magma_int_t lda = DEGREE_A; // number of rows/columns
// unchanged on exit
cuFloatComplex *x; // x is vector of 'n' elems; has to be of size at least ( 1 + ( n - 1 )*abs( INCX ) ).
x = (cuFloatComplex *) malloc(MEM_SIZE_XY);
fillVector(x,n,make_cuFloatComplex(2,0));
//device
cuFloatComplex *d_x;
cudaMalloc(&d_x, MEM_SIZE_XY);
cudaMemcpy(d_x, x, MEM_SIZE_XY, cudaMemcpyHostToDevice);
magma_int_t incx = 1; // increment for x; >0
// unchanged on exit
cuFloatComplex * beta; // scalar
beta = (cuFloatComplex *) malloc(MEM_SIZE_CUFLOATCOMPLEX);
*beta = make_cuFloatComplex(1,0);
//device
cuFloatComplex * d_beta;
cudaMalloc(&d_beta, MEM_SIZE_CUFLOATCOMPLEX);
cudaMemcpy(d_beta, beta, MEM_SIZE_CUFLOATCOMPLEX, cudaMemcpyHostToDevice);
// unchanged on exit
cuFloatComplex *y; // y is vector of 'n' elems; has to be of size at least ( 1 + ( n - 1 )*abs( INCY ) ).
y = (cuFloatComplex *) malloc(MEM_SIZE_XY);
fillVector(y,n,make_cuFloatComplex(2,0));
//device
cuFloatComplex * d_y;
cudaMalloc(&d_y, MEM_SIZE_XY);
cudaMemcpy(d_y, y, MEM_SIZE_XY, cudaMemcpyHostToDevice);
magma_int_t incy = 1; // increment for y; >0
// unchanged on exit ( maybe)
cuFloatComplex * temp;
temp = (cuFloatComplex *) malloc(MEM_SIZE_CUFLOATCOMPLEX);
*temp = make_cuFloatComplex(1,0);
cuFloatComplex *WC = temp;
cuFloatComplex * d_WC;
cudaMalloc(&d_WC, MEM_SIZE_CUFLOATCOMPLEX);
cudaMemcpy(d_WC, WC, MEM_SIZE_CUFLOATCOMPLEX, cudaMemcpyHostToDevice);
//-----------------------------
// Compute
//-----------------------------
dim3 bs(numthreads);
dim3 gs(numblocks);
=>magmablas_chemv_200_L_special<<<gs,bs>>>(
n,
*d_alpha,
d_A,
lda,
d_x,
incx,
*d_beta,
d_y,
incy,
d_WC
);
// magmablas_chemv_200_L_special<<<gs,bs>>>( magma_int_t n,
// cuFloatComplex alpha,
// cuFloatComplex *A,
// magma_int_t lda,
// cuFloatComplex *x,
// magma_int_t incx,
// cuFloatComplex beta,
// cuFloatComplex *y,
// magma_int_t incy,
// cuFloatComplex *WC);
//-----------------------------
// Copy back results
//-----------------------------
cudaMemcpy(A, d_A, MEM_SIZE_A, cudaMemcpyDeviceToHost);
//---------------------> debugging only
printMatrix(A, DEGREE_A);
// free cuda mem
cudaFree(d_alpha);
cudaFree(d_A);
cudaFree(d_x);
cudaFree(d_beta);
cudaFree(d_y);
cudaFree(d_WC);
//-----------------------------
// Display data
//-----------------------------
// free host mem
free(alpha);
free(A);
free(x);
free(beta);
free(y);
free(WC);
return EXIT_SUCCESS;
}