Error in device interface + pinned memory + Fortran

Open discussion for MAGMA

Error in device interface + pinned memory + Fortran

Postby jah87 » Thu May 30, 2013 7:24 pm

I'm trying to utilize Hyper-Q on the GK110 by launching concurrent kernels from multiple CPU OpenMP threads. Each thread is performing a predefined ODE using Newton-Raphson, so I'm calling DGETRF and DGETRS for each timestep iteration for each thread.

I get errors at run time indicating a problem with the mapping of the CPU memory to the GPU:
Code: Select all
...
CUBLAS error: memory mapping error (11) in magma_dgetrf_gpu at dgetrf_gpu.cpp:163
CUDA runtime error: unspecified launch failure (4) in magma_device_sync at interface.cpp:79
CUDA runtime error: unspecified launch failure (4) in magma_free at alloc.cpp:37
CUBLAS error: memory mapping error (11) in magma_dgetrf_gpu at dgetrf_gpu.cpp:189
CUBLAS error: memory mapping error (11) in magma_dgetrf_gpu at dgetrf_gpu.cpp:222
CUDA runtime error: unspecified launch failure (4) in magma_free_pinned at alloc.cpp:108
MKL ERROR: Parameter 1 was incorrect on entry to magma_dgetrs_gpu
...


The application doesn't terminate, but does not converge in the Newton-Raphson iteration (i.e. the solve is garbage)

The problem first appears in the MAGMA_DGETRF_GPU routine (line 163 of src/dgetrf_gpu.cpp)
Code: Select all
...
line 160: // download i-th panel
line 161:                cols = maxm - i*nb;
line 162:                magmablas_dtranspose( dAP, cols, inAT(i,i), lddat, nb, cols );
line 163:                magma_dgetmatrix( m-i*nb, nb, dAP, cols, work, lddwork );
line 164:
line 165:                // make sure that gpu queue is empty
line 166:                magma_device_sync();
...


I'm allocating the matrix and right-hand side for the system using the ISO_C_BINDING approach:
Code: Select all
Module cudaf
  Interface
    Integer(C_INT) Function cudaMallocHost(cPtr, size) &
        bind(C, name="cudaMallocHost")
      Use iso_c_binding
      Type(C_PTR) :: cPtr
      Integer(C_SIZE_T), value :: size
    End Function cudaMallocHost

    Integer(C_INT) Function cudaMalloc(dPtr, size) &
        bind(C, name="cudaMalloc")
      Use iso_c_binding
      Type(C_PTR) :: dPtr
      Integer(C_SIZE_T), value :: size
    End Function cudaMalloc
...
  End Interface
End Module cudaf

Module magmaf
  Interface
...
    Integer(C_INT) Function magma_dgetrf_gpu(m, n, dA, ldda, ipiv, info) &
        bind(C, name="magma_dgetrf_gpu")
      Use iso_c_binding
      Integer(C_INT), value :: m
      Integer(C_INT), value :: n
      Type(C_PTR) :: dA
      Integer(C_INT), value :: ldda
      Integer(C_INT) :: ipiv(*)
      Integer(C_INT) :: info
    End Function magma_dgetrf_gpu

    Integer(C_INT) Function magma_dgetrs_gpu(trans, n, nrhs, dA, ldda, ipiv, dB, lddb, info) &
        bind(C, name="magma_dgetrs_gpu")
      Use iso_c_binding
      Character(C_CHAR) :: trans
      Integer(C_INT), value :: n
      Integer(C_INT), value :: nrhs
      Type(C_PTR) :: dA
      Integer(C_INT), value :: ldda
      Integer(C_INT) :: ipiv(*)
      Type(C_PTR) :: dB
      Integer(C_INT), value :: lddb
      Integer(C_INT) :: info
    End Function magma_dgetrs_gpu
...
  End Interface
End Module magmaf

Module cublasf
  Interface
...
    Integer(C_INT) Function cublasSetMatrix(rows, cols, elemSize, hA_src, lda, dB_dst, lddb) &
        bind(C, name="cublasSetMatrix")
      Use iso_c_binding
      Integer(C_INT), value :: rows
      Integer(C_INT), value :: cols
      Integer(C_SIZE_T), value :: elemSize
      Real(C_DOUBLE) :: hA_src(*)
      Integer(C_INT), value :: lda
      Type(C_PTR) :: dB_dst
      Integer(C_INT), value :: lddb
    End Function cublasSetMatrix
...
  End Interface
End Module cublasf


Variable declaration:
Code: Select all
Module ode_data
  Use cudaf
...
  Real(8), Dimension(:,:), Pointer :: jac
  Real(8), Dimension(:), Pointer :: rhs
  Integer, Dimension(:) :: ipiv
  Type(C_PTR) :: cPtr_jac, cPtr_rhs, dPtr_jac, dPtr_rhs
  !$OMP THREADPRIVATE(jac, rhs, ipiv, cPtr_jac, cPtr_rhs, dPtr_jac, dPtr_rhs)
...
End Module ode_data


Allocation:
Code: Select all
Subroutine ode_init
  Use ode_data
  ...
  ldd = ny

! Initialize device
  Call cublas_init()
  ...
  !$OMP PARALLEL DEFAULT(SHARED)
! Allocate pinned host memory
  stat = cudaMallocHost(cPtr_jac, ny*ny*sizeof(cPtr_jac))
  Call c_f_pointer(cPtr_jac, jac, (/ny,ny/))
  stat = cudaMallocHost(cPtr_rhs, ny*sizeof(cPtr_rhs))
  Call c_f_pointer(cPtr_rhs, rhs, (/ny/))

! Allocate device memory
  stat = cudaMalloc(dPtr_jac, ldd*ny*sizeof(dPtr_jac))
  stat = cudaMalloc(dPtr_rhs, ldd*sizeof(dPtr_rhs))
...
  !$OMP END PARALLEL
...
End Subroutine ode_init


Assignment:
Code: Select all
Subroutine ode_build
  Use ode_data
  ...
  (build jac here)
  ...
  stat = cublasSetMatrix(ny, ny, sizeof(jac(1,1)), jac, ny, dPtr_jac, ldd)
  ...
  (build rhs here)
  ...
  stat = cublasSetMatrix(ny, 1, sizeof(rhs(1)), rhs, ny, dPtr_rhs, ldd)
  ...
End Subroutine ode_build


Solve:
Code: Select all
Subroutine ode_solve
  Use magmaf
  ...
! Solve linear system via LU decomposition with pivoting
  stat = magma_dgetrf_gpu(ny, ny, dPtr_jac, ldd, ipiv, info)
  stat = magma_dgetrs_gpu('N', ny, 1, dPtr_jac, ldd, ipiv, dPtr_rhs, ldd, info)

! Copy result back to host
  stat = cublasGetMatrix(ny, 1, sizeof(dPtr_rhs), dPtr_rhs, ldd, rhs, ny) 
...
End Subroutine ode_solve


For what it's worth, I'm compiling with Intel Composer XE 2011 (sp1.9.293) and my compile and link options are:
Code: Select all
ifort -real-size 64 -integer-size 32 -m64 -mavx -ip -O3 -openmp -c $(SRC) -o $(OBJ)
...
ifort -real-size 64 -integer-size 32 -m64 -mavx -ip -O3  -DADD_ -DHAVE_CUBLAS -DCUBLAS_GFORTRAN  -openmp $(ALLOBJ) \
  -L$(MAGMA_DIR)/lib -L$(CUDA_DIR)/lib64 -L$(MKL_DIR)/lib/intel64 \
  -lmagma -lmagmablas -lmagma -lcublas -lcudart -lcuda -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
...


Could anyone be of assistance? I've exhausted my knowledge of MAGMA/CUBLAS/CUDA and I can't seem to make this work.

EDIT: I should also add my make.inc options for MAGMA:
Code: Select all
...
GPU_TARGET = Kepler

CC        = gcc
NVCC      = nvcc
FORT      = gfortran
...
OPTS      = -O3 -DADD_ -DCUBLAS_GFORTRAN
F77OPTS   = -O3 -DADD_
FOPTS     = -O3 -DADD_ -x f95-cpp-input
NVOPTS    = -O3 -DADD_ --compiler-options -fno-strict-aliasing -DUNIX
LDOPTS    = -fPIC -Xlinker -zmuldefs
...
LIB       = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lcublas -lcudart -lm
...
LIBDIR    = -L$(MKLROOT)/lib/intel64 \
            -L$(CUDADIR)/lib64
INC       = -I$(CUDADIR)/include
jah87
 
Posts: 21
Joined: Tue May 01, 2012 1:54 pm

Return to User discussion

Who is online

Users browsing this forum: No registered users and 2 guests

cron