I wanted to use MAGMA to speed up a sparse preconditioned conjugate gradient solver. However, my first tries result in a slower solve than the CPU-Version using Eigen. Probably, I am doing something wrong and have forgotten to set a flag or sth like that.
Here is my test function:
Code: Select all
#include <cuda_runtime.h>
#include <chrono>
#include <vector>
#include <iostream>
#include </magma/include/magma_v2.h>
#include </magma/include/magmasparse.h>
#include <Eigen/Dense>
#include <Eigen/Sparse>
#define TIMING
#ifdef TIMING
#define INIT_TIMER auto start = std::chrono::high_resolution_clock::now();
#define START_TIMER start = std::chrono::high_resolution_clock::now();
#define STOP_TIMER(name) std::cout << "RUNTIME of " << name << ": " << \
std::chrono::duration<double, std::milli>( \
std::chrono::high_resolution_clock::now()-start \
).count() << " ms " << std::endl;
#else
#define INIT_TIMER
#define START_TIMER
#define STOP_TIMER(name)
#endif
void PCGMagma()
{
// Initialize MAGMA and create some LA structures.
magma_init();
magma_sopts opts;
magma_queue_t queue;
magma_queue_create( 0, &queue );
magma_s_matrix A={Magma_CSR}, dA={Magma_CSR};
magma_s_csr_mtx(&A, "Amatrix.mtx", queue);
int i, m=A.num_rows, n=1, nnz = A.nnz;
//init Eigen Matrix
Eigen::SparseMatrix<float> S;
S.resize(m,m);
std::vector<Eigen::Triplet<float>> triplets;
for(int j = 0; j < m; j++)
{
int start = A.row[j];
int end = A.row[j + 1];
for(int k = start; k < end; k++)
{
triplets.push_back(Eigen::Triplet<float>(j,A.col[k],A.val[k]));
}
}
S.setFromTriplets(triplets.begin(), triplets.end());
// init Eigen solution
Eigen::VectorXf _sol(m);
_sol.setConstant(0.1);
// init Eigen rhs from solution
Eigen::VectorXf _rhs(m);
_rhs = S*_sol;
Eigen::VectorXf _x(m);
_x.setZero();
Eigen::ConjugateGradient<Eigen::SparseMatrix<float>, Eigen::Lower|Eigen::Upper, Eigen::DiagonalPreconditioner<float>> cg;
cg.setMaxIterations(30);
cg.setTolerance(1e-12);
INIT_TIMER
START_TIMER
cg.compute(S);
_x = cg.solve(_rhs);
//t.stop();
STOP_TIMER("Eigen PCG")
std::cout << "iterations: " << cg.iterations() << " res: " << cg.error() << std::endl;
std::cout << "difference to solution: " << (_x - _sol).norm() << std::endl;
float *sol = (float*) calloc(m, sizeof(float));
float* rhs = (float*) calloc(m, sizeof(float));
for(int i = 0; i < m; i++)
{
rhs[i] = _rhs(i);
sol[i] = 0.0;
}
magma_s_matrix b={Magma_DENSE}, db={Magma_DENSE};
magma_s_matrix x={Magma_CSR}, dx={Magma_CSR};
// Pass the rhs to MAGMA.
magma_svset( m, 1, rhs, &b, queue );
// Choose a solver, preconditioner, etc. - see documentation for options.
opts.solver_par.solver = Magma_PCGMERGE;
opts.solver_par.maxiter = 30;
opts.solver_par.rtol = 1e-12;
opts.precond_par.solver = Magma_JACOBI;
opts.precond_par.levels = 0;
opts.precond_par.trisolver = Magma_CUSOLVE;
// Initialize the solver.
magma_ssolverinfo_init( &opts.solver_par, &opts.precond_par, queue );
// Copy the system to the device
magma_smtransfer( A, &dA, Magma_CPU, Magma_DEV, queue );
magma_smtransfer( b, &db, Magma_CPU, Magma_DEV, queue );
// initialize an initial guess for the iteration vector
magma_svinit( &dx, Magma_DEV, b.num_rows, b.num_cols, 0.0, queue );
// Generate the preconditioner.
magma_s_precondsetup( dA, db, &opts.solver_par, &opts.precond_par, queue );
cudaDeviceSynchronize();
START_TIMER
// If we want to solve the problem, run:
magma_s_solver( dA, db, &dx, &opts, queue );
cudaDeviceSynchronize();
STOP_TIMER("MAGMA PCG")
// Then copy the solution back to the host...
magma_smtransfer( dx, &x, Magma_DEV, Magma_CPU, queue );
// and back to the application code
magma_svcopy( x, &m, &n, sol, queue );
std::cout << "iterations: " << opts.solver_par.numiter << " res: " << opts.solver_par.iter_res << std::endl;
// Free the allocated memory...
magma_smfree( &dx, queue );
magma_smfree( &db, queue );
magma_smfree( &dA, queue );
magma_smfree( &b, queue ); // won't do anything as MAGMA does not own the data.
magma_smfree( &A, queue ); // won't do anything as MAGMA does not own the data.
// and finalize MAGMA.
magma_queue_destroy( queue );
magma_finalize();
MartinVec::VecX x0(m);
for (i = 0; i < m; ++i) {
x0(i) = sol[i];
}
std::cout << "difference to solution: " << (x0 - _sol).norm() << std::endl;
free(sol);
free(rhs);
}
I tried to use no other 3rd party libraries than Eigen and Magma to make it easier to run this code. I used Magma 2.4.0 and the most recent Eigen version on Github (https://github.com/eigenteam/eigen-git-mirror). My GPU is a GTX 980 and my CPU an Intel Xeon CPU E5-1620.
I attached the Matrix I use. It is a symmetric 4108x4108 sparse matrix with 33070 non zeros (4-18 per row).
The console output is:
Code: Select all
% Reading sparse matrix from file (Amatrix.mtx): done. Converting to CSR: done.
RUNTIME of Eigen PCG: 1.25956 ms
iterations: 30 res: 0.0015823
difference to solution: 0.0125479
RUNTIME of MAGMA PCG: 2.9871 ms
iterations: 30 res: 2.38373
difference to solution: 0.0113944
Is there anything I am doing wrong?
I also tested the testing_ssolver example with the following options:
Code: Select all
./testing_ssolver --solver PCG --rtol 4e-5 --maxiter 30 --precond JACOBI --format CSR Amatrix.mtx
I tried different matrix-formats but only with minor performance improvements.
I hope that there is a possibility to get a faster version than the eigen one. I hope, one of you can help me.
Thanks in advance
Martin