currently I have a problem when using MAGMA, pycuda and my own CUDA kernels together and I am not sure whether this is an issue with MAGMA, pycuda or my knowledge of CUDA.

In the code attached below, I am using CUDA via pycuda and also load MAGMA to perform a QR decomposition. In the end I also call a kernel tril to obtain R (of the QR decomposition). However, when I call the kernel, the program crashes with

Code: Select all

```
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle
```

Thank you in advance!

Code: Select all

```
import ctypes
import numpy as np
import pycuda.autoinit
import pycuda.compiler
import pycuda.gpuarray as gpuarray
c_int_type = ctypes.c_longlong
c_ptr_type = ctypes.c_void_p
_libmagma = ctypes.cdll.LoadLibrary("libmagma.so")
magma_init = _libmagma.magma_init
magma_init.restype = int
magma_init.argtypes = []
magma_finalize = _libmagma.magma_finalize
magma_finalize.restype = int
magma_finalize.argtypes = []
_magma_get_zgeqrf_nb = _libmagma.magma_get_zgeqrf_nb
_magma_get_zgeqrf_nb.restype = int
_magma_get_zgeqrf_nb.argtypes = [
c_int_type, c_int_type # m, n
]
_magma_zgeqrf_gpu = _libmagma.magma_zgeqrf_gpu
_magma_zgeqrf_gpu.restype = int
_magma_zgeqrf_gpu.argtypes = [
c_int_type, c_int_type, # m, n,
c_ptr_type, c_int_type, # dA, ldda,
c_ptr_type, c_ptr_type, # tau, dT
c_ptr_type # info
]
magma_init()
### PART 1 ###
cuda_source_module = pycuda.compiler.SourceModule("""
#include <pycuda-complex.hpp>
__global__ void tril(pycuda::complex<double> * a, int m, int mn) {
int k = threadIdx.x + blockIdx.x * blockDim.x;
int kx = k / m;
int ky = k % m;
if (k < mn) {
if (kx < ky) {
a[k] = 0.0;
}
}
}
""")
_tril_raw = cuda_source_module.get_function("tril")
def _tril(a, blocksize=1024):
gridsize = int(np.ceil(a.size/(1.*blocksize)))
m, n = a.shape
_tril_raw(a, np.int32(m), np.int32(m*n),
block=(blocksize,1,1), grid=(gridsize,1))
### PART 1 END ###
### PART 2 ###
m = 3
n = 2
a = np.random.rand(m,n) + 1j*np.random.rand(m,n)
a = np.asfortranarray(a)
a_gpu = gpuarray.to_gpu(a)
k = min(m, n)
nb = _magma_get_zgeqrf_nb(m, n)
tau = np.empty(k, dtype=np.complex128)
Tsize = (2*k + int(np.ceil(n / 32.))*32)*nb
T = gpuarray.empty(Tsize, dtype=np.complex128)
info = c_int_type()
_magma_zgeqrf_gpu(m, n, int(a_gpu.gpudata), m,
int(tau.ctypes.data), int(T.gpudata),
ctypes.byref(info))
### PART 2 END ###
_tril(a_gpu)
print a_gpu
magma_finalize()
```