I've finished writing a subroutine to solve the problem P^i.U^(i-!)+Q^i.U^(i)+S^i.U^(i+!)=0 (where P,Q and S are NxN Dcomplex arrays and U is an N Dcomplex vector and i refers to the current mesh position). My routine makes a lot of usage of ZGESV in order to solve this problem. In one limit P,Q and S are diagonal in which case one call to my subroutine takes approximately 1 minute (for N=101) however in most cases P,Q and S will be tridiagonal and in this case it seems that my subroutine now takes around 33 minutes instead. I was wondering if anyone knew if this is to be expected, and ways to get less of a performance hit in the tridiagonal case? I've included a sample portion of the subroutine below if it helps.
- Code: Select all
!Initialise beta
DO i=1,nrows
beta(i,i,istrt)=CMPLX(1.0d0,0.0d0)
END DO
!Iterate from istrt to max
DO i=istrt+1,nx
!Load matrices
CALL LOAD_MATS(i)
!Get alphas
!Form temporary using BLAS matrix multiplication
!CALL ZGEMM('N','N',nrows,nrows,nrows,CMPLX(1.d0,0.d0),p_mat,nrows,alpha(:,:,i-1),nrows,CMPLX(0.d0,0.d0),temp,nrows)
temp2=alpha(:,:,i-1)
temp=MATMUL(p_mat,temp2)
!Update temporary by adding q_mat
temp=q_mat+temp
!Update second temporary
temp2=-s_mat
!Solve for alpha(:,:,i) -> Stored in temp2
CALL ZGESV(nrows,nrows,temp,nrows,ipiv1,temp2,nrows,info)
!Fill alpha segment
alpha(:,:,i)=temp2
!Now get Betas
!Loop over initial conditions
DO j=1,nrows
!Form temporary using BLAS matrix multiplication
!CALL ZGEMV('N',nrows,nrows,1.0,-p_mat,nrows,beta(:,j,i-1),1,0,vec_temp,1)
!CALL ZGEMM('N','N',nrows,1,nrows,1.0,-p_mat,nrows,beta(:,j,i-1),nrows,0,vec_temp,nrows)
vec_temp(:,1)=MATMUL(-p_mat,beta(:,j,i-1))
!Solve for beta(:,j,i) -> Stored in vec_temp
CALL ZGESV(nrows,1,temp,nrows,ipiv1,vec_temp(:,1),nrows,info)
!Fill beta segment
beta(:,j,i)=vec_temp(:,1)
END DO
END DO