LAPACK Archives

[Lapack] First singular value is wrong in dgesvd, dgesdd for my data

Hi,

I am writing a f90 program to compute PCA for some genotype data. Matrix
S^2, VT and PC were written to txt file and were compared with R prcomp
result. Surprisingly, the 1st singular value, the 1st singular vector
and the 1st PC were wrong. I was not able to solve the problem. Can
someone please advise me on this? Many thanks.

! Perform PCA analysis using SVD from lapack
! Author: LI Bowen
! Date: 2013-11-16
! Email: bowenli37@Domain.Removed

program pca

use m_option_parser ! load public derived types and procedures 

implicit none

! create allocatable instance of option_t
type(option_t) , allocatable :: program_options(:) 

integer                      :: err 
logical                      :: version , help , use_dgesdd
character(len=len_value)        :: file_name, dim_file_name, S_file_name, 
VT_file_name, PC_file_name !256

integer :: npc, t0, t1, rate, i, j, alloc_error, IERR ! M, #rows of A, N, #cols 
of A
integer :: status, system, nrows, ncols
integer :: N, M, LDA, LDU, LDVT, LWORK, INFO
real(kind=8) :: colavg, sqrtn_1
real(kind=8), dimension(:,:), allocatable :: X, A, U, VT, PC
real(kind=8), dimension(:), allocatable :: S, WORK, rowv
integer, dimension(:), allocatable :: IWORK
character(len=40), dimension(:), allocatable :: SNAME, VNAME

! Externals from the LAPACK library
external dgesdd, dgemm

! set some options for the module: kind of options
call set_parser_options ( with_equal_sign = .true. )

! allocate as much space for this instance of option_t as there are options (6 
in this case)
allocate( program_options(5) ) 

! parse command line arguments
call set_option ( program_options , "--file-name" , "-f" , "data.tg" , "set 
genotype file name" )
call set_option ( program_options , "--version" , "-v" , .false. , "print 
version info to screen" )
call set_option ( program_options , "--help" , "-h" , .false. , "print help 
message and quit" ) 
call set_option ( program_options , "--npc" , "-n" , 20 , "set number of PCs to 
write to output file" ) 
call set_option ( program_options , "--use_dgesdd" , "-d" , .false. , "call 
lapack dgesdd instead of dgesvd" ) 

! check that (short) keys are not set double 
call check_options ( program_options , err )
if (err /= 0) stop 

! now parse the command line by which this program was invoked
call parse_options ( program_options ) 

! attach cmdline/default options to variables 
call get_option_value ( program_options , "-h" , help )
call get_option_value ( program_options , "-v" , version )
call get_option_value ( program_options , "-f" , file_name ) 
call get_option_value ( program_options , '-n' , npc )
call get_option_value ( program_options , '-d' , use_dgesdd )

if ( help ) then ! if help flag was supplied, display a help message and quit
   call print_help_message ( program_options , "PCA" , "0.1" , "LI BOWEN" , 
"2013" )
   stop
end if

! print version info if help was not asked for and quit
if (version .and. .not. help) then
   write(*,"(A3)") "PCA version 0.1"
   stop
end if

! read input file
call system_clock(t0, rate)
dim_file_name = trim(file_name)//'.dim'
! find out the number of rows and cols of input file
status = system( 'wc -l '//file_name//"| cut -f1 -d ' ' > "//dim_file_name  )
if ( status .ne. 0 ) stop 'system: error in finding number of rows of file'
status = system( 'head -1 '//file_name//"| wc | tr -s ' ' | cut -f3 -d ' ' >> 
"//dim_file_name)
if ( status .ne. 0 ) stop 'system: error in finding number of cols of file'
! read dimension of array from the .dim file
open(10, file = dim_file_name, status = 'old', iostat = IERR, action = 'READ')
if(IERR /= 0) stop 'Cannno open .dim file'
read (10,*) nrows
read (10,*) ncols
close(10)

! read input file into array
N = nrows - 1                   ! A is MxN matrix
M = ncols - 1
! SNAME
allocate(SNAME(M), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array SNAME"
! VNAME
allocate(VNAME(N), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array VNAME"
! X
allocate(X(N, M), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array X"
! A
allocate(A(M, N), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array A"
! U
allocate(U(M, min(N,M)), stat = alloc_error )
if(alloc_error /= 0) stop "Insufficient memory for array U"
! S
allocate(S(min(N, M)), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for singular value array S"
! VT
allocate(VT(min(N,M), N), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array VT"
! LWORK & WORK & IWORK
LWORK = 3*min(M,N) + max(max(M,N),5*min(M,N)*min(M,N)+4*min(M,N))
allocate(WORK(LWORK), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array WORK"
allocate(IWORK(8*min(M,N)), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array IWORK"
! PCA
allocate(PC(npc, M), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array PC"

open(20, file = file_name, status = 'old', iostat = IERR, action = 'READ')
if(IERR /= 0) stop 'Cannno open input file'
read(20, *) VNAME(1), SNAME
allocate(rowv(M), stat = alloc_error)
if(alloc_error /= 0) stop "Insufficient memory for array rowv"
sqrtn_1 = sqrt(N - 1.)
do i=1,N
   read(20, *) VNAME(i), rowv
   X(i , :) = rowv
   ! zero mean and divide by sqrt(n)
   ! colsum = sum(A(:,i))
   ! A(:,i) = (A(:,i) - colsum/M) / sqrt(real(M))
   colavg = sum(rowv)/N
   A(:, i) = (rowv - colavg) / sqrtn_1
enddo
close(20)
deallocate(rowv)
call system_clock(t1)
print *, "Finished reading input file. Elapsed time: ", real(t1 - t0) / 
real(rate)

call system_clock(t0, rate)
! CALL LAPACK dgesdd
if (M >= N) then
   LDVT = N
else                            ! M < N
   LDVT = M
endif
LDA = M
LDU = M
if(use_dgesdd) then
   call dgesdd('S', M, N, A, LDA, S, U, LDU, VT, LDVT, WORK, LWORK, IWORK, INFO)
else
   call dgesvd('S', 'S', M, N, A, LDA, S, U, LDU, VT, LDVT, WORK, LWORK, INFO)
endif
call system_clock(t1)
print *, "Finished SVD computation. Elapsed time: ", real(t1 - t0) / real(rate)

if(INFO == 0) then
   print *, "Success"
   call system_clock(t0, rate)
   S_file_name = trim(file_name) // '.eig'
   VT_file_name = trim(file_name) // '.vt'
   PC_file_name = trim(file_name) // '.pc'

   if( npc > min(M, N)) then
      print *, "npc is larger than min(M,N), reset"
      npc = min(M, N)
   endif
   ! Write .eig
   open(30, file = S_file_name, status = 'replace', iostat = IERR, action = 
'WRITE')
   if( IERR /= 0 ) stop "Unable to write to .eig file"
   write(30, 100) S(1:npc)*S(1:npc)
100 format( 20000000 (E15.6E3) )
   close(30)
   ! write .vt; eigen vector, loading
   open(40, file = VT_file_name, status = 'replace', iostat = IERR, action = 
'WRITE')
   if( IERR /= 0 ) stop 'Unable to write to .vt file'
    do i=1,npc
      write(40, 100) (VT(i, j), j = 1, N)
   enddo
   close(40)
   ! write .pc; PC
   open(50, file = PC_file_name, status = 'replace', iostat = IERR, action = 
'WRITE')
   if( IERR /= 0 ) stop 'Unable to write to .pc file'
   PC = matmul(VT(1:npc,:), X)
   do i=1,M
      write(50, 100) (PC(j, i), j = 1, npc)
   enddo
   close(50)
   call system_clock(t1)
   print *, "Finished writing to output file. Elapsed time: ", real(t1 - t0) / 
real(rate)
else if(INFO < 0) then
   print *, 'Failure in DGESDD. INFO = ', INFO, '. The ', -INFO, 'th argument 
had an illegal value.'
else
   print *, 'Failure in DGESDD. INFO > 0:  DBDSDC did not converge, updating 
process failed.'
endif

deallocate(A)
deallocate(X)
deallocate(SNAME)
deallocate(VNAME)
deallocate(VT)
deallocate(S)
deallocate(U)

end program pca


-- 
Sincerely,
LI Bowen (Mr)

<Prev in Thread] Current Thread [Next in Thread>
  • [Lapack] First singular value is wrong in dgesvd, dgesdd for my data, Li Bowen <=


For additional information you may use the LAPACK/ScaLAPACK Forum.
Or one of the mailing lists, or