Currently I have a problem when using MAGMA, PyCUDA, and my own CUDA kernels together, and I am not sure whether this is an issue with MAGMA, with PyCUDA, or with my understanding of CUDA.

In the code attached below, I am using CUDA via PyCUDA and also load MAGMA to perform a QR decomposition. At the end I also call a custom kernel, `tril`, to zero out part of the matrix and obtain R (the upper-triangular factor of the QR decomposition). However, when I call the kernel, the program crashes with

- Code: Select all
`pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle`

If I swap the code parts 1 and 2, the program gives the correct result. So my question is: Is there a specific sequence of calling MAGMA functions and compiling/calling my own kernels? Or is this maybe a pycuda issue?

Thank you in advance!

- Code: Select all
`import ctypes`

import numpy as np

import pycuda.autoinit

import pycuda.compiler

import pycuda.gpuarray as gpuarray

# Integer type used for MAGMA's magma_int_t arguments.
# NOTE(review): c_longlong assumes an ILP64 MAGMA build (64-bit magma_int_t);
# a default build uses 32-bit ints — TODO confirm against the installed libmagma.
c_int_type = ctypes.c_longlong

# Device/host pointers are passed through ctypes as raw void*.
c_ptr_type = ctypes.c_void_p

# Load the MAGMA shared library via its SONAME from the default search path.
_libmagma = ctypes.cdll.LoadLibrary("libmagma.so")

# magma_init(): library-wide initialization; must be called before any other
# MAGMA routine (returns a magma_int_t status code).
magma_init = _libmagma.magma_init

magma_init.restype = int

magma_init.argtypes = []

# magma_finalize(): library-wide teardown, the counterpart of magma_init().
magma_finalize = _libmagma.magma_finalize

magma_finalize.restype = int

magma_finalize.argtypes = []

# magma_get_zgeqrf_nb(m, n): returns the optimal blocking size nb used by
# zgeqrf for an m-by-n matrix (needed below to size the dT workspace).
_magma_get_zgeqrf_nb = _libmagma.magma_get_zgeqrf_nb

_magma_get_zgeqrf_nb.restype = int

_magma_get_zgeqrf_nb.argtypes = [

c_int_type, c_int_type # m, n

]

# magma_zgeqrf_gpu(m, n, dA, ldda, tau, dT, info): QR factorization of a
# complex double-precision matrix already resident on the GPU.  tau is a
# host-side array; dA and dT are device pointers; info is an output status.
_magma_zgeqrf_gpu = _libmagma.magma_zgeqrf_gpu

_magma_zgeqrf_gpu.restype = int

_magma_zgeqrf_gpu.argtypes = [

c_int_type, c_int_type, # m, n,

c_ptr_type, c_int_type, # dA, ldda,

c_ptr_type, c_ptr_type, # tau, dT

c_ptr_type # info

]

# Initialize MAGMA once at module import time (paired with magma_finalize()
# at the bottom of the script).
magma_init()

### PART 1 ###

# Compile an element-wise kernel with PyCUDA against the context created by
# pycuda.autoinit.  The kernel zeroes entries with column index < row index
# of a column-major (Fortran-ordered) m-by-? matrix, i.e. it clears the
# strictly lower triangle, leaving the upper-triangular part (R) intact.
# NOTE(review): if MAGMA routines run between this compilation and the kernel
# launch, the CUDA context current at launch time may differ from the one the
# function handle belongs to, which would explain the reported
# "cuFuncSetBlockShape failed: invalid resource handle" — confirm against
# MAGMA's context/device handling.
cuda_source_module = pycuda.compiler.SourceModule("""

#include <pycuda-complex.hpp>

__global__ void tril(pycuda::complex<double> * a, int m, int mn) {

int k = threadIdx.x + blockIdx.x * blockDim.x;

int kx = k / m;

int ky = k % m;

if (k < mn) {

if (kx < ky) {

a[k] = 0.0;

}

}

}

""")

# Handle to the compiled kernel; launched via _tril() below.
_tril_raw = cuda_source_module.get_function("tril")

def _tril(a, blocksize=1024):
    """Zero the strictly lower triangle of *a* in place on the GPU.

    Despite the name, the launched ``tril`` kernel clears elements whose
    column index is smaller than their row index (for a Fortran-ordered
    matrix), so what remains is the upper-triangular part — i.e. R of the
    QR factorization.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        2-D array in column-major (Fortran) order; modified in place.
        Presumably complex128 to match the kernel signature — TODO confirm.
    blocksize : int
        CUDA threads per block (default 1024, the common per-block maximum).
    """
    # Exact integer ceiling division; the original float-based
    # np.ceil(a.size / (1. * blocksize)) risks rounding error for huge sizes.
    gridsize = (a.size + blocksize - 1) // blocksize
    m, n = a.shape
    _tril_raw(a, np.int32(m), np.int32(m * n),
              block=(blocksize, 1, 1), grid=(int(gridsize), 1))

### PART 1 END ###

### PART 2 ###

# NOTE(review): the ordering of MAGMA calls vs. PyCUDA kernel compilation is
# what the crash hinges on — swapping PART 1 and PART 2 reportedly makes the
# program work.  MAGMA may make a different CUDA context current, invalidating
# the pycuda function handle compiled above — confirm against MAGMA docs.

m = 3
n = 2

# Random complex test matrix, converted to column-major (Fortran) order,
# which is the layout MAGMA's LAPACK-style routines expect.
a = np.random.rand(m, n) + 1j * np.random.rand(m, n)
a = np.asfortranarray(a)
a_gpu = gpuarray.to_gpu(a)

k = min(m, n)

# Optimal blocking size for zgeqrf on an m-by-n matrix.
nb = _magma_get_zgeqrf_nb(m, n)

# Householder scalars, returned on the host.
tau = np.empty(k, dtype=np.complex128)

# dT workspace size per the magma_zgeqrf_gpu interface:
# (2*min(m,n) + ceil(n/32)*32) * nb elements.
Tsize = (2 * k + int(np.ceil(n / 32.)) * 32) * nb
T = gpuarray.empty(Tsize, dtype=np.complex128)

info = c_int_type()

_magma_zgeqrf_gpu(m, n, int(a_gpu.gpudata), m,
                  int(tau.ctypes.data), int(T.gpudata),
                  ctypes.byref(info))

### PART 2 END ###

# Clear the strictly lower triangle so a_gpu holds R.
_tril(a_gpu)

# Fixed: Python 3 print function (the original `print a_gpu` is a Python 2
# print statement and a SyntaxError on Python 3).
print(a_gpu)

magma_finalize()