In magmablas/zpermute-v2.cu, I see a couple of structs defined:

Code: Select all

```
/*
 * Parameter bundle for a blocked pivot-swap kernel.
 * Member order matters: the struct is filled with positional aggregate
 * initialisation, so fields must not be reordered.
 * NOTE(review): n, lda and j0 presumably carry the same meaning as the
 * like-named fields of zlaswp_params_t2 -- confirm against the kernel
 * that consumes this struct.
 */
typedef struct {
    cuDoubleComplex *A;      /* base pointer of the matrix data */
    int n;
    int lda;
    int j0;
    short ipiv[BLOCK_SIZE];  /* one pivot index per entry, stored inline in the struct */
} zlaswp_params_t;
/*
 * Parameter bundle for the myzlaswp2 pivot-swap kernel.
 * The whole struct -- including the inline ipiv array -- is passed BY VALUE
 * as the kernel argument, so the pivot indices travel with the launch and
 * no separate device-side pivot buffer is dereferenced by the kernel.
 * Member order matters: the struct is filled with positional aggregate
 * initialisation, so fields must not be reordered.
 */
typedef struct {
    cuDoubleComplex *A;      /* base pointer of the matrix data */
    int n;                   /* upper bound on the global thread index that does work */
    int lda;                 /* element stride between consecutive swapped lines */
    int j0;                  /* index of the first line of this block (scaled by lda) */
    int npivots;             /* number of ipiv entries the kernel applies */
    short ipiv[BLOCK_SIZE];  /* swap partner of line i, relative to j0 */
} zlaswp_params_t2;
```

Code: Select all

```
extern "C" void
magmablas_zpermute_long2( magma_int_t n, cuDoubleComplex *dAT, magma_int_t lda,
magma_int_t *ipiv, magma_int_t nb, magma_int_t ind )
{
int k;
for( k = 0; k < nb-BLOCK_SIZE; k += BLOCK_SIZE )
{
//zlaswp_params_t params = { dAT, lda, lda, ind + k };
zlaswp_params_t2 params = { dAT, n, lda, ind + k, BLOCK_SIZE };
for( int j = 0; j < BLOCK_SIZE; j++ )
{
params.ipiv[j] = ipiv[ind + k + j] - k - 1;
ipiv[ind + k + j] += ind;
}
//zlaswp2( params );
zlaswp3( params );
}
...etc...
```

Code: Select all

```
/*
 * Host-side launcher for the myzlaswp2 kernel.
 *
 * params  parameter bundle (matrix pointer, sizes and the inline pivot
 *         list). It is taken by reference here and handed to the kernel
 *         by value, so the pivot indices are copied as part of the kernel
 *         arguments.
 *
 * Launch layout: 1-D grid of ceil(n / 64) blocks with 64 threads each,
 * no dynamic shared memory, queued on magma_stream.
 */
extern "C" void zlaswp3( zlaswp_params_t2 &params )
{
    /* Nothing to do, and a grid with zero blocks is an invalid launch
       configuration, so bail out before touching the device. */
    if( params.n <= 0 )
        return;

    int blocksize = 64;
    /* ceil-div so that a partial trailing block is still launched */
    dim3 blocks = (params.n+blocksize-1) / blocksize;
    myzlaswp2<<< blocks, blocksize, 0, magma_stream >>>( params );
}
```

Code: Select all

```
/*
 * Applies params.npivots interchanges, in order, to the lines of A that
 * start at line index params.j0.
 *
 * Expected launch: 1-D grid, 1-D blocks; no shared memory is used.
 * Each thread with global index tid < params.n owns the element at offset
 * tid inside every line and swaps line i with line params.ipiv[i] for
 * i = 0 .. npivots-1. Lines are params.lda elements apart.
 *
 * params is received by value, so params.ipiv is read from the kernel's
 * own argument copy rather than through a pointer.
 */
__global__ void myzlaswp2( zlaswp_params_t2 params )
{
    /* Plain multiply instead of __mul24: __mul24 only uses the low 24 bits
       of each operand, which yields a wrong index for very large grids. */
    unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;

    /* Explicit sign check keeps a non-positive n from being reinterpreted
       as a huge unsigned bound. */
    if( params.n > 0 && tid < (unsigned int)params.n )
    {
        /* 64-bit stride so that lda * index cannot overflow 32-bit int */
        const long long lda = params.lda;
        cuDoubleComplex *A = params.A + tid + lda * params.j0;

        /* The swaps must be applied sequentially: later pivots refer to the
           state left behind by earlier ones. */
        for( int i = 0; i < params.npivots; i++ )
        {
            int j = params.ipiv[i];
            cuDoubleComplex *p1 = A + i*lda;
            cuDoubleComplex *p2 = A + j*lda;
            cuDoubleComplex temp = *p1;
            *p1 = *p2;
            *p2 = temp;
        }
    }
}
```

*The `ipiv` array inside `params` is used in the GPU kernel. Here's my question: How can this work? The ipiv array is located on the host, not the device.*