{
if (nrgpu==1){
dlamda, q2, indx, ctot, w, s, indxq,
*
dwork, range, vl, vu, il, iu, info );
}
float d_one = 1.;
float d_zero = 0.;
char range_[] = {range, 0};
float temp;
*info = 0;
if(k < 0)
*info=-1;
else if(n < k)
*info=-2;
*info=-6;
else if (! (alleig || valeig || indeig))
*info = -15;
else {
if (valeig) {
if (n > 0 && vu <= vl)
*info = -17;
}
else if (indeig) {
if (il < 1 || il >
max(1,n))
*info = -18;
else if (iu <
min(n,il) || iu > n)
*info = -19;
}
}
if(*info != 0){
}
if(k == 0)
#ifdef CHECK_CPU
#endif
n2 = n - n1;
n12 = ctot[0] + ctot[1];
n23 = ctot[1] + ctot[2];
iq2 = n1 * n12;
lq2 = iq2 + n2 * n23;
n1_loc = (n1-1) / (nrgpu/2) + 1;
n2_loc = (n2-1) / (nrgpu/2) + 1;
for (igpu = 0; igpu < nrgpu; ++igpu){
#ifdef CHECK_CPU
#endif
}
for (igpu = 0; igpu < nrgpu-1; igpu += 2){
ni_loc[igpu] =
min(n1_loc, n1 - igpu/2 * n1_loc);
#ifdef CHECK_CPU
lapackf77_slacpy(
"A", &ni_loc[igpu], &n12, q2+n1_loc*(igpu/2), &n1, hQ2(igpu), &n1_loc);
#endif
q2+n1_loc*(igpu/2), n1,
dQ2(igpu), n1_loc, stream[igpu][0] );
ni_loc[igpu+1] =
min(n2_loc, n2 - igpu/2 * n2_loc);
#ifdef CHECK_CPU
lapackf77_slacpy(
"A", &ni_loc[igpu+1], &n23, q2+iq2+n2_loc*(igpu/2), &n2, hQ2(igpu+1), &n2_loc);
#endif
q2+iq2+n2_loc*(igpu/2), n2,
dQ2(igpu+1), n2_loc, stream[igpu+1][0] );
}
}
#ifdef ENABLE_TIMER
#endif
for(i = 0; i < k; ++i)
#ifdef ENABLE_TIMER
#endif
#pragma omp parallel for
for(j = 0; j < k; ++j){
if(iinfo != 0)
*info=iinfo;
}
if(*info != 0)
#ifdef ENABLE_TIMER
#endif
if (valeig)
else if (indeig)
dirange(k, indxq, &iil, &iiu, il, iu);
else {
iil = 1;
iiu = k;
}
rk = iiu - iil + 1;
if (k == 2){
for(j = 0; j < k; ++j){
i = indx[0] - 1;
i = indx[1] - 1;
}
}
else if(k != 1){
tmp = ldq + 1;
#ifdef ENABLE_TIMER
#endif
#pragma omp parallel for
if(ii != jj)
w[ii] = w[ii] * ( *
Q(ii, jj) / ( dlamda[ii] - dlamda[jj] ) );
}
#ifdef ENABLE_TIMER
printf(
"for j for i divided in two parts = %6.2f\n",
GetTimerValue(start,end)/1000.);
#endif
for(i = 0; i < k; ++i)
w[i] = copysign( sqrt( -w[i] ), s[i]);
#ifdef ENABLE_TIMER
#endif
if (k > 256)
for(j = iil-1; j < iiu; ++j){
#pragma omp parallel for
s[ii] = w[ii] / *
Q(ii,j);
#pragma omp parallel for
*
Q(ii,j) = s[iii] / temp;
}
}
else
for(j = iil-1; j < iiu; ++j){
for(i = 0; i < k; ++i)
for(i = 0; i < k; ++i){
}
}
#ifdef ENABLE_TIMER
printf(
"for j (2*for i) = %6.2f\n",
GetTimerValue(start,end)/1000.);
#endif
}
#ifdef ENABLE_TIMER
#endif
if(rk > 0){
if( n23 != 0 ){
s, &n23, &d_zero,
Q(n1,iil-1), &ldq );
}
else
if( n12 != 0 ) {
s, &n12, &d_zero,
Q(0,iil-1), &ldq);
}
else
}
else {
for (igpu = 0; igpu < nrgpu-1; igpu += 2){
if (n23 != 0) {
dS(igpu+1,0), n23, stream[igpu+1][0] );
}
if (n12 != 0) {
dS(igpu,0), n12, stream[igpu][0] );
}
}
for (i = 0; i<rk; i+=nb){
ind = (i/nb)%2;
if (i+nb<rk){
ib2 =
min(nb, rk - i - nb);
for (igpu = 0; igpu < nrgpu-1; igpu += 2){
if (n23 != 0) {
Q(ctot[0],iil-1+i+nb), ldq,
dS(igpu+1,(ind+1)%2), n23, stream[igpu+1][(ind+1)%2] );
}
if (n12 != 0) {
dS(igpu,(ind+1)%2), n12, stream[igpu][(ind+1)%2] );
}
}
}
for (igpu = 0; igpu < nrgpu-1; igpu += 2){
if (n23 != 0) {
#ifdef CHECK_CPU
#endif
}
if (n12 != 0) {
#ifdef CHECK_CPU
#endif
}
}
for (igpu = 0; igpu < nrgpu-1; igpu += 2){
if (n23 != 0) {
#ifdef CHECK_CPU
blasf77_sgemm(
"N",
"N", &ni_loc[igpu+1], &ib, &n23, &d_one, hQ2(igpu+1), &n2_loc,
hS(igpu+1,ind), &n23, &d_zero, hQ(igpu+1, ind), &n2_loc);
#endif
dS(igpu+1, ind), n23, d_zero,
dQ(igpu+1, ind), n2_loc);
#ifdef CHECK_CPU
printf(
"norm Q %d: %f\n", igpu+1,
cpu_gpu_sdiff(ni_loc[igpu+1], ib, hQ(igpu+1, ind), n2_loc,
dQ(igpu+1, ind), n2_loc));
#endif
}
if (n12 != 0) {
#ifdef CHECK_CPU
blasf77_sgemm(
"N",
"N", &ni_loc[igpu], &ib, &n12, &d_one, hQ2(igpu), &n1_loc,
hS(igpu,ind%2), &n12, &d_zero, hQ(igpu, ind%2), &n1_loc);
#endif
dS(igpu, ind), n12, d_zero,
dQ(igpu, ind), n1_loc);
#ifdef CHECK_CPU
printf(
"norm Q %d: %f\n", igpu,
cpu_gpu_sdiff(ni_loc[igpu], ib, hQ(igpu, ind), n1_loc,
dQ(igpu, ind), n1_loc));
#endif
}
}
for (igpu = 0; igpu < nrgpu-1; igpu += 2){
if (n23 != 0) {
Q(n1+n2_loc*(igpu/2),iil-1+i), ldq, stream[igpu+1][ind] );
}
if (n12 != 0) {
Q(n1_loc*(igpu/2),iil-1+i), ldq, stream[igpu][ind] );
}
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
#ifdef CHECK_CPU
#endif
}
if( n23 == 0 )
if( n12 == 0 )
}
}
#ifdef ENABLE_TIMER
#endif
}