Mathieu,
Correct. I'm not using BLAS, LAPACK or ATLAS anywhere else in this application.
Stuart
Using 3686 of 4096 control points with quality 2.000000 to 0.145472
Setup and solve TPS equations
PLASMA_Init: 0
Just before plasma omp_get_num_threads() yields: 1
PLASMA_dgels: 0
Just after plasma omp_get_num_threads() yields: 1
PLASMA_Finalize: 0
Normalized bending energy: 1.213144e-01
Fit error energy: 2.759992e+00
Interpolate z surface using TPS splines
PLASMA nx: 43 OSthread: 19814 OMPthread: 1 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 86 OSthread: 19815 OMPthread: 2 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 129 OSthread: 19816 OMPthread: 3 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 172 OSthread: 19817 OMPthread: 4 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 215 OSthread: 19818 OMPthread: 5 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 301 OSthread: 19820 OMPthread: 7 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 344 OSthread: 19821 OMPthread: 8 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 387 OSthread: 19822 OMPthread: 9 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 258 OSthread: 19819 OMPthread: 6 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 430 OSthread: 19823 OMPthread: 10 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 473 OSthread: 19824 OMPthread: 11 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 559 OSthread: 19826 OMPthread: 13 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 516 OSthread: 19825 OMPthread: 12 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 645 OSthread: 19828 OMPthread: 15 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 602 OSthread: 19827 OMPthread: 14 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 731 OSthread: 19830 OMPthread: 17 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 688 OSthread: 19829 OMPthread: 16 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 774 OSthread: 19831 OMPthread: 18 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 817 OSthread: 19832 OMPthread: 19 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 860 OSthread: 19833 OMPthread: 20 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 903 OSthread: 19834 OMPthread: 21 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 989 OSthread: 19836 OMPthread: 23 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 946 OSthread: 19835 OMPthread: 22 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 0 OSthread: 19798 OMPthread: 0 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 44 OSthread: 19814 OMPthread: 1 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 689 OSthread: 19829 OMPthread: 16 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 603 OSthread: 19827 OMPthread: 14 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 388 OSthread: 19822 OMPthread: 9 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 474 OSthread: 19824 OMPthread: 11 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 904 OSthread: 19834 OMPthread: 21 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 87 OSthread: 19815 OMPthread: 2 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 302 OSthread: 19820 OMPthread: 7 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 173 OSthread: 19817 OMPthread: 4 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 130 OSthread: 19816 OMPthread: 3 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 775 OSthread: 19831 OMPthread: 18 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 517 OSthread: 19825 OMPthread: 12 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 560 OSthread: 19826 OMPthread: 13 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 646 OSthread: 19828 OMPthread: 15 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 947 OSthread: 19835 OMPthread: 22 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 732 OSthread: 19830 OMPthread: 17 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 259 OSthread: 19819 OMPthread: 6 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 861 OSthread: 19833 OMPthread: 20 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 1 OSthread: 19798 OMPthread: 0 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 431 OSthread: 19823 OMPthread: 10 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 345 OSthread: 19821 OMPthread: 8 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 990 OSthread: 19836 OMPthread: 23 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 216 OSthread: 19818 OMPthread: 5 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 818 OSthread: 19832 OMPthread: 19 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 561 OSthread: 19826 OMPthread: 13 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 647 OSthread: 19828 OMPthread: 15 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 518 OSthread: 19825 OMPthread: 12 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 733 OSthread: 19830 OMPthread: 17 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 260 OSthread: 19819 OMPthread: 6 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 475 OSthread: 19824 OMPthread: 11 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 389 OSthread: 19822 OMPthread: 9 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 346 OSthread: 19821 OMPthread: 8 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 862 OSthread: 19833 OMPthread: 20 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 432 OSthread: 19823 OMPthread: 10 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 690 OSthread: 19829 OMPthread: 16 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 217 OSthread: 19818 OMPthread: 5 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 819 OSthread: 19832 OMPthread: 19 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 604 OSthread: 19827 OMPthread: 14 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 905 OSthread: 19834 OMPthread: 21 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 303 OSthread: 19820 OMPthread: 7 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 131 OSthread: 19816 OMPthread: 3 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 776 OSthread: 19831 OMPthread: 18 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 174 OSthread: 19817 OMPthread: 4 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 948 OSthread: 19835 OMPthread: 22 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 88 OSthread: 19815 OMPthread: 2 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 45 OSthread: 19814 OMPthread: 1 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 2 OSthread: 19798 OMPthread: 0 OMPnumthreads: 24 OMPmaxthreads 24
PLASMA nx: 991 OSthread: 19836 OMPthread: 23 OMPnumthreads: 24 OMPmaxthreads 24
... on and on ...
Using 3686 of 4096 control points with quality 2.000000 to 0.145472
Setup and solve TPS equations
Normalized bending energy: 1.213144e-01
Fit error energy: 2.759992e+00
Interpolate z surface using TPS splines
LAPACK nx: 989 OSthread: 19777 OMPthread: 23 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 817 OSthread: 19773 OMPthread: 19 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 602 OSthread: 19768 OMPthread: 14 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 43 OSthread: 19755 OMPthread: 1 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 688 OSthread: 19770 OMPthread: 16 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 645 OSthread: 19769 OMPthread: 15 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 258 OSthread: 19760 OMPthread: 6 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 430 OSthread: 19764 OMPthread: 10 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 860 OSthread: 19774 OMPthread: 20 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 344 OSthread: 19762 OMPthread: 8 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 774 OSthread: 19772 OMPthread: 18 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 86 OSthread: 19756 OMPthread: 2 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 215 OSthread: 19759 OMPthread: 5 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 559 OSthread: 19767 OMPthread: 13 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 516 OSthread: 19766 OMPthread: 12 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 731 OSthread: 19771 OMPthread: 17 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 129 OSthread: 19757 OMPthread: 3 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 387 OSthread: 19763 OMPthread: 9 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 172 OSthread: 19758 OMPthread: 4 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 301 OSthread: 19761 OMPthread: 7 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 903 OSthread: 19775 OMPthread: 21 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 946 OSthread: 19776 OMPthread: 22 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 473 OSthread: 19765 OMPthread: 11 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 0 OSthread: 19745 OMPthread: 0 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 388 OSthread: 19763 OMPthread: 9 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 732 OSthread: 19771 OMPthread: 17 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 259 OSthread: 19760 OMPthread: 6 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 87 OSthread: 19756 OMPthread: 2 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 775 OSthread: 19772 OMPthread: 18 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 345 OSthread: 19762 OMPthread: 8 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 560 OSthread: 19767 OMPthread: 13 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 517 OSthread: 19766 OMPthread: 12 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 818 OSthread: 19773 OMPthread: 19 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 302 OSthread: 19761 OMPthread: 7 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 431 OSthread: 19764 OMPthread: 10 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 603 OSthread: 19768 OMPthread: 14 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 861 OSthread: 19774 OMPthread: 20 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 216 OSthread: 19759 OMPthread: 5 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 689 OSthread: 19770 OMPthread: 16 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 44 OSthread: 19755 OMPthread: 1 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 947 OSthread: 19776 OMPthread: 22 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 173 OSthread: 19758 OMPthread: 4 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 130 OSthread: 19757 OMPthread: 3 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 646 OSthread: 19769 OMPthread: 15 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 990 OSthread: 19777 OMPthread: 23 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 474 OSthread: 19765 OMPthread: 11 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 904 OSthread: 19775 OMPthread: 21 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 389 OSthread: 19763 OMPthread: 9 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 733 OSthread: 19771 OMPthread: 17 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 260 OSthread: 19760 OMPthread: 6 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 303 OSthread: 19761 OMPthread: 7 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 346 OSthread: 19762 OMPthread: 8 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 88 OSthread: 19756 OMPthread: 2 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 432 OSthread: 19764 OMPthread: 10 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 819 OSthread: 19773 OMPthread: 19 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 518 OSthread: 19766 OMPthread: 12 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 561 OSthread: 19767 OMPthread: 13 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 604 OSthread: 19768 OMPthread: 14 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 862 OSthread: 19774 OMPthread: 20 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 217 OSthread: 19759 OMPthread: 5 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 948 OSthread: 19776 OMPthread: 22 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 174 OSthread: 19758 OMPthread: 4 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 390 OSthread: 19763 OMPthread: 9 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 131 OSthread: 19757 OMPthread: 3 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 1 OSthread: 19745 OMPthread: 0 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 45 OSthread: 19755 OMPthread: 1 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 690 OSthread: 19770 OMPthread: 16 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 991 OSthread: 19777 OMPthread: 23 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 647 OSthread: 19769 OMPthread: 15 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 475 OSthread: 19765 OMPthread: 11 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 905 OSthread: 19775 OMPthread: 21 OMPnumthreads: 24 OMPmaxthreads 24
LAPACK nx: 776 OSthread: 19772 OMPthread: 18 OMPnumthreads: 24 OMPmaxthreads 24
... on and on ...
/*
Evaluate the thin-plate spline (TPS) z surface function for all image sample
locations and store it in rescelldted.

  numpoints    number of control points; tpsvec is read at indices
               0..numpoints-1 (warping weights) and numpoints..numpoints+2
               (affine coefficients)
  nsampx       number of samples along x (first index of rescelldted)
  nsampy       number of samples along y (second index of rescelldted)
  nsampz       scale factor applied to the result so it is not left in
               normalized units
  ctlpoints    control points; only the .x and .y members are read
  tpsvec       TPS solution vector (see numpoints)
  rescelldted  output surface, written as rescelldted[nx][ny]

The outer loop over nx is split across OpenMP threads. Each iteration writes
one diagnostic line to stderr identifying the OS thread and OpenMP thread
that executed it.

Could potentially speed this up by using spline to perform SOME
interpolation and zero-padded FFTs to perform the rest.
*/
void tpssurf(numpoints,nsampx,nsampy,nsampz,ctlpoints,tpsvec,rescelldted)
int numpoints,nsampx,nsampy,nsampz;
CTLPOINT *ctlpoints;
MUDS_DOUBLE *tpsvec;
MUDS_DOUBLE **rescelldted;
{
    MUDS_DOUBLE x,y,dxi,dyi,r2i,basis;
    int nx,ny,i;
    int ompnumthreads,ompmaxthreads,ompthread;
    pid_t osthread;

    ompmaxthreads=omp_get_max_threads();
    /*omp_set_num_threads(ompmaxthreads);*/
#pragma omp parallel for \
    private(nx,ny,i,x,y,dxi,dyi,r2i,basis,ompnumthreads,ompthread,osthread) \
    shared(nsampx,nsampy,nsampz,numpoints,ctlpoints,tpsvec,rescelldted,ompmaxthreads)
    for(nx=0;nx<nsampx;nx++){
        /*Diagnostics: which thread is running this row*/
        ompnumthreads=omp_get_num_threads();
        ompthread=omp_get_thread_num();
        /*gettid() did not compile here, so the thread id comes from the raw syscall*/
        osthread=(pid_t) syscall(SYS_gettid);
        /*pid_t is cast to int so that it matches the %d conversion*/
        fprintf(stderr,"PLASMA nx: %d OSthread: %d OMPthread: %d OMPnumthreads: %d OMPmaxthreads %d\n",
                nx,(int)osthread,ompthread,ompnumthreads,ompmaxthreads);

        /*Sample coordinates are normalized by the sample counts*/
        x=(MUDS_DOUBLE)nx/(MUDS_DOUBLE)nsampx;
        for(ny=0;ny<nsampy;ny++){
            y=(MUDS_DOUBLE)ny/(MUDS_DOUBLE)nsampy;
            rescelldted[nx][ny]=(MUDS_DOUBLE)0.;

            /*Bending/perturbation/warping part*/
            for(i=0;i<numpoints;i++){
                dxi=x-ctlpoints[i].x;
                dyi=y-ctlpoints[i].y;
                r2i=dxi*dxi+dyi*dyi;
                /*log(0) is avoided: the basis is taken as 0 at zero distance*/
                if(r2i==(MUDS_DOUBLE)0.) basis=(MUDS_DOUBLE)0.;
                else basis=r2i*log(r2i);
                rescelldted[nx][ny]+=tpsvec[i]*basis;
            }
            rescelldted[nx][ny]*=nsampz; /*don't want normalized units*/

            /*Bi-linear (affine) part; don't want normalized units*/
            rescelldted[nx][ny]+=(tpsvec[numpoints]+tpsvec[numpoints+1]*x+tpsvec[numpoints+2]*y)*nsampz;
        }
    }
}
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <plasma.h>
#include <cblas.h>
#include <lapacke.h>
#include <core_blas.h>
#include <omp.h>
#include "muds_types.h"
#include "muds_complex.h"
#include "muds_allocation.h"
/*
Solve a*x = rhs in the least-squares sense with PLASMA's double-precision
real PLASMA_dgels. Assumes full-rank matrix with rows>=columns.

  nthreads   value passed to PLASMA_Init()
  rows,cols  dimensions of a
  a          input matrix, read as a[i][j] for i<rows, j<cols; not modified
  rhs        right-hand side, rows entries are read; not modified
  soln       output; rows entries are written, copied from the buffer that
             PLASMA_dgels overwrote

PLASMA is initialized on entry and finalized on exit. The status codes of
PLASMA_Init, PLASMA_dgels and PLASMA_Finalize are reported on stderr;
nothing is returned. If the PLASMA workspace cannot be allocated the solve
is skipped and soln is left untouched.

NOTE(review): all rows entries of the overwritten right-hand side are copied
into soln, although a least-squares solution has cols entries. Confirm that
the caller sizes soln for rows entries.
*/
void dsolveviaqr_plasma(nthreads,rows,cols,a,rhs,soln)
int nthreads,rows,cols;
MUDS_DOUBLE **a,*rhs,*soln;
{
    MUDS_DOUBLE **atrans,**rhstrans,*work;
    int i,j,info,one=1;
    extern int PLASMA_dgels();

    fprintf(stderr,"Just before PLASMA_Init(%d) omp_get_num_procs() yields: %d\n",nthreads,omp_get_num_procs());
    info=PLASMA_Init(nthreads);
    fprintf(stderr,"PLASMA_Init: %d\n",info);

    /*Allocate work space*/
    matrix(atrans,MUDS_DOUBLE,cols,rows);
    matrix(rhstrans,MUDS_DOUBLE,1,rows);
    work=NULL; /*keeps free(work) below safe if the allocation fails*/
    info=PLASMA_Alloc_Workspace_dgels(rows,cols,&work);
    if(info!=0){
        /*0 is the success status, as the PLASMA_Init/PLASMA_dgels logs show*/
        fprintf(stderr,"PLASMA_Alloc_Workspace_dgels: %d\n",info);
    }
    else{
        /*Copy A into transposed (column-major) array; QR destroys this array*/
        for(i=0;i<rows;i++){
            for(j=0;j<cols;j++){
                atrans[j][i]=a[i][j];
            }
        }
        /*Copy rhs into its own array; overwritten by solution*/
        for(i=0;i<rows;i++){
            rhstrans[0][i]=rhs[i];
        }
        /*Do it*/
        info=PLASMA_dgels(PlasmaNoTrans,rows,cols,one,&atrans[0][0],rows,&work[0],&rhstrans[0][0],rows);
        fprintf(stderr,"PLASMA_dgels: %d\n",info);
        if(info<0) fprintf (stderr,"QR argument %d bad\n",-info);
        /*Recover solution part from the overwritten right-hand side*/
        for(i=0;i<rows;i++){
            soln[i]=rhstrans[0][i];
        }
    }

    /*Deallocate work space*/
    freematrix(atrans);
    freematrix(rhstrans);
    free(work);
    info=PLASMA_Finalize();
    fprintf(stderr,"PLASMA_Finalize: %d\n",info);
    fprintf(stderr,"Just after PLASMA_Finalize() omp_get_num_procs() yields: %d\n",omp_get_num_procs());
    return;
}
Index: control.c
===================================================================
--- control.c (révision 2590)
+++ control.c (copie de travail)
@@ -342,5 +342,11 @@
plasma_fatal_error("PLASMA_Finalize", "plasma_context_remove() failed");
return status;
}
+
+ /* Restore the concurrency */
+ /* Actually this is really bad: we should set the concurrency only
+ * if it is not already set, and restore it only if we had changed it */
+ pthread_setconcurrency( 0 );
+
return PLASMA_SUCCESS;
}
Index: control.h
===================================================================
--- control.h (révision 2590)
+++ control.h (copie de travail)
@@ -30,6 +30,7 @@
void plasma_barrier(plasma_context_t *plasma);
void *plasma_parallel_section(void *plasma);
int plasma_setaffinity(int rank);
+int plasma_unsetaffinity();
int plasma_yield();
void plasma_topology_init();
void plasma_topology_finalize();
Index: plasmaos-hwloc.c
===================================================================
--- plasmaos-hwloc.c (révision 2590)
+++ plasmaos-hwloc.c (copie de travail)
@@ -48,6 +48,8 @@
void plasma_topology_finalize(){
+ plasma_unsetaffinity();
+
pthread_mutex_lock(&mutextopo);
plasma_nbr--;
if ((topo_initialized ==1) && (plasma_nbr == 0)) {
@@ -66,7 +68,7 @@
If there are multiple instances of PLASMA then affinity will be wrong: all ranks 0
will be pinned to core 0.
- Also, affinity is not resotred when PLASMA_Finalize() is called.
+ Also, affinity is not restored when PLASMA_Finalize() is called, but is removed.
*/
int plasma_setaffinity(int rank) {
hwloc_obj_t obj; /* Hwloc object */
@@ -117,6 +119,57 @@
return PLASMA_SUCCESS;
}
+/**
+ This routine will unset the affinity set by a previous call to
+ plasma_setaffinity.
+ */
+int plasma_unsetaffinity() {
+ hwloc_obj_t obj; /* Hwloc object */
+ hwloc_cpuset_t cpuset; /* HwLoc cpuset */
+
+ if (!topo_initialized) {
+ plasma_error("plasma_unsetaffinity", "Topology not initialized");
+ return PLASMA_ERR_UNEXPECTED;
+ }
+
+ /* Get the machine-level object, which covers every processor. */
+ obj = hwloc_get_obj_by_type(plasma_topology, HWLOC_OBJ_MACHINE, 0);
+ if (!obj) {
+ plasma_warning("plasma_unsetaffinity", "Could not get object");
+ return PLASMA_ERR_UNEXPECTED;
+ }
+
+ /* Get a copy of its cpuset that we may modify. */
+ /* Unlike plasma_setaffinity, the copy keeps every logical processor (it is not reduced to one). */
+#if !defined(HAVE_HWLOC_BITMAP)
+ cpuset = hwloc_cpuset_dup(obj->cpuset);
+#else
+ cpuset = hwloc_bitmap_dup(obj->cpuset);
+#endif
+
+ /* And try to bind ourselves there. NOTE(review): the failure path below returns without freeing cpuset. */
+ if (hwloc_set_cpubind(plasma_topology, cpuset, HWLOC_CPUBIND_THREAD)) {
+ char *str = NULL;
+#if !defined(HAVE_HWLOC_BITMAP)
+ hwloc_cpuset_asprintf(&str, obj->cpuset);
+#else
+ hwloc_bitmap_asprintf(&str, obj->cpuset);
+#endif
+ plasma_warning("plasma_unsetaffinity", "Could not bind to the whole machine");
+ printf("Couldn't bind to cpuset %s\n", str);
+ free(str);
+ return PLASMA_ERR_UNEXPECTED;
+ }
+
+ /* Free our cpuset copy */
+#if !defined(HAVE_HWLOC_BITMAP)
+ hwloc_cpuset_free(cpuset);
+#else
+ hwloc_bitmap_free(cpuset);
+#endif
+ return PLASMA_SUCCESS;
+}
+
int plasma_getnuma_size() {
hwloc_cpuset_t cpuset; /* HwLoc cpuset */
hwloc_obj_t obj;
const int numIterPerT=MIN_NUM_ITER; //min. number of iterations per thread (>=200 is reasonable)
const int cpus=omp_get_num_procs();
int numThreads=numIter/numIterPerT>=cpus ? cpus : numIter/numIterPerT;
if(numThreads==0) numThreads=1;
#pragma omp parallel for reduction(+:pot,Ex,Ey,Ez) schedule(static,numIterPerT) num_threads(numThreads)
#pragma omp parallel for reduction(+:pot,Ex,Ey,Ez) schedule(static,numIterPerT) num_threads(4)
Users browsing this forum: No registered users and 2 guests