MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zgetrf_gpu.cpp File Reference
#include "common_magma.h"
Include dependency graph for zgetrf_gpu.cpp:

Go to the source code of this file.

Macros

#define dAT(i, j)   (dAT + (i)*nb*lddat + (j)*nb)
 

Functions

magma_int_t magma_zgetrf_gpu (magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
 

Macro Definition Documentation

#define dAT (   i,
 
)    (dAT + (i)*nb*lddat + (j)*nb)

Function Documentation

magma_int_t magma_zgetrf_gpu ( magma_int_t  m,
magma_int_t  n,
magmaDoubleComplex *  dA,
magma_int_t  ldda,
magma_int_t ipiv,
magma_int_t info 
)

Definition at line 14 of file zgetrf_gpu.cpp.

References __func__, dA, dAT, lapackf77_zgetrf, zgehrd_data::ldda, MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_free, magma_free_cpu(), magma_free_pinned, magma_get_zgetrf_nb(), magma_queue_create, magma_queue_destroy, magma_queue_sync, MAGMA_SUCCESS, magma_xerbla(), MAGMA_Z_NEG_ONE, MAGMA_Z_ONE, magma_zgemm(), magma_zgetmatrix, magma_zgetmatrix_async, magma_zmalloc(), magma_zmalloc_cpu(), magma_zmalloc_pinned(), magma_zsetmatrix, magma_zsetmatrix_async, magma_ztrsm(), magmablas_zpermute_long2(), magmablas_ztranspose2(), magmablas_ztranspose_inplace(), magmablasGetKernelStream(), magmablasSetKernelStream(), MagmaNoTrans, MagmaRight, MagmaUnit, MagmaUpper, max, and min.

17 {
18 /* -- MAGMA (version 1.4.0) --
19  Univ. of Tennessee, Knoxville
20  Univ. of California, Berkeley
21  Univ. of Colorado, Denver
22  August 2013
23 
24  Purpose
25  =======
26  ZGETRF computes an LU factorization of a general M-by-N matrix A
27  using partial pivoting with row interchanges.
28 
29  The factorization has the form
30  A = P * L * U
31  where P is a permutation matrix, L is lower triangular with unit
32  diagonal elements (lower trapezoidal if m > n), and U is upper
33  triangular (upper trapezoidal if m < n).
34 
35  This is the right-looking Level 3 BLAS version of the algorithm.
36  If the current stream is NULL, this version replaces it with user defined
37  stream to overlap computation with communication.
38 
39  Arguments
40  =========
41  M (input) INTEGER
42  The number of rows of the matrix A. M >= 0.
43 
44  N (input) INTEGER
45  The number of columns of the matrix A. N >= 0.
46 
47  A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
48  On entry, the M-by-N matrix to be factored.
49  On exit, the factors L and U from the factorization
50  A = P*L*U; the unit diagonal elements of L are not stored.
51 
52  LDDA (input) INTEGER
53  The leading dimension of the array A. LDDA >= max(1,M).
54 
55  IPIV (output) INTEGER array, dimension (min(M,N))
56  The pivot indices; for 1 <= i <= min(M,N), row i of the
57  matrix was interchanged with row IPIV(i).
58 
59  INFO (output) INTEGER
60  = 0: successful exit
61  < 0: if INFO = -i, the i-th argument had an illegal value
62  or another error occured, such as memory allocation failed.
63  > 0: if INFO = i, U(i,i) is exactly zero. The factorization
64  has been completed, but the factor U is exactly
65  singular, and division by zero will occur if it is used
66  to solve a system of equations.
67  ===================================================================== */
68 
69  #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb)
70 
71  magmaDoubleComplex c_one = MAGMA_Z_ONE;
72  magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
73 
74  magma_int_t iinfo, nb;
75  magma_int_t maxm, maxn, mindim;
76  magma_int_t i, rows, cols, s, lddat, lddwork;
77  magmaDoubleComplex *dAT, *dAP, *work;
78 
79  /* Check arguments */
80  *info = 0;
81  if (m < 0)
82  *info = -1;
83  else if (n < 0)
84  *info = -2;
85  else if (ldda < max(1,m))
86  *info = -4;
87 
88  if (*info != 0) {
89  magma_xerbla( __func__, -(*info) );
90  return *info;
91  }
92 
93  /* Quick return if possible */
94  if (m == 0 || n == 0)
95  return *info;
96 
97  /* Function Body */
98  mindim = min(m, n);
99  nb = magma_get_zgetrf_nb(m);
100  s = mindim / nb;
101 
102  if (nb <= 1 || nb >= min(m,n)) {
103  /* Use CPU code. */
104  magma_zmalloc_cpu( &work, m * n );
105  if ( work == NULL ) {
106  *info = MAGMA_ERR_HOST_ALLOC;
107  return *info;
108  }
109  magma_zgetmatrix( m, n, dA, ldda, work, m );
110  lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
111  magma_zsetmatrix( m, n, work, m, dA, ldda );
112  magma_free_cpu(work);
113  }
114  else {
115  /* Use hybrid blocked code. */
116  maxm = ((m + 31)/32)*32;
117  maxn = ((n + 31)/32)*32;
118 
119  lddat = maxn;
120  lddwork = maxm;
121 
122  dAT = dA;
123 
124  if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) {
125  *info = MAGMA_ERR_DEVICE_ALLOC;
126  return *info;
127  }
128 
129  if ( m == n ) {
130  lddat = ldda;
132  }
133  else {
134  if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) {
135  magma_free( dAP );
136  *info = MAGMA_ERR_DEVICE_ALLOC;
137  return *info;
138  }
139  magmablas_ztranspose2( dAT, lddat, dA, ldda, m, n );
140  }
141 
142  if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, maxm*nb )) {
143  magma_free( dAP );
144  if ( ! (m == n))
145  magma_free( dAT );
146  *info = MAGMA_ERR_HOST_ALLOC;
147  return *info;
148  }
149 
150  /* Define user stream if current stream is NULL */
151  cudaStream_t stream[2], current_stream;
152  magmablasGetKernelStream(&current_stream);
153 
154  magma_queue_create( &stream[0] );
155  if (current_stream == NULL) {
156  magma_queue_create( &stream[1] );
157  magmablasSetKernelStream(stream[1]);
158  }
159  else
160  stream[1] = current_stream;
161 
162  for( i=0; i<s; i++ )
163  {
164  // download i-th panel
165  cols = maxm - i*nb;
166  //magmablas_ztranspose( dAP, cols, dAT(i,i), lddat, nb, cols );
167  magmablas_ztranspose2( dAP, cols, dAT(i,i), lddat, nb, m-i*nb );
168 
169  // make sure that that the transpose has completed
170  magma_queue_sync( stream[1] );
171  magma_zgetmatrix_async( m-i*nb, nb, dAP, cols, work, lddwork,
172  stream[0]);
173 
174  if ( i>0 ){
176  n - (i+1)*nb, nb,
177  c_one, dAT(i-1,i-1), lddat,
178  dAT(i-1,i+1), lddat );
180  n-(i+1)*nb, m-i*nb, nb,
181  c_neg_one, dAT(i-1,i+1), lddat,
182  dAT(i, i-1), lddat,
183  c_one, dAT(i, i+1), lddat );
184  }
185 
186  // do the cpu part
187  rows = m - i*nb;
188  magma_queue_sync( stream[0] );
189  lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo);
190  if ( (*info == 0) && (iinfo > 0) )
191  *info = iinfo + i*nb;
192 
193  // upload i-th panel
194  magma_zsetmatrix_async( m-i*nb, nb, work, lddwork, dAP, maxm,
195  stream[0]);
196 
197  magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb, i*nb );
198 
199  magma_queue_sync( stream[0] );
200  //magmablas_ztranspose(dAT(i,i), lddat, dAP, maxm, cols, nb);
201  magmablas_ztranspose2(dAT(i,i), lddat, dAP, maxm, m-i*nb, nb);
202 
203  // do the small non-parallel computations (next panel update)
204  if ( s > (i+1) ) {
206  nb, nb,
207  c_one, dAT(i, i ), lddat,
208  dAT(i, i+1), lddat);
210  nb, m-(i+1)*nb, nb,
211  c_neg_one, dAT(i, i+1), lddat,
212  dAT(i+1, i ), lddat,
213  c_one, dAT(i+1, i+1), lddat );
214  }
215  else {
217  n-s*nb, nb,
218  c_one, dAT(i, i ), lddat,
219  dAT(i, i+1), lddat);
221  n-(i+1)*nb, m-(i+1)*nb, nb,
222  c_neg_one, dAT(i, i+1), lddat,
223  dAT(i+1, i ), lddat,
224  c_one, dAT(i+1, i+1), lddat );
225  }
226  }
227 
228  magma_int_t nb0 = min(m - s*nb, n - s*nb);
229  rows = m - s*nb;
230  cols = maxm - s*nb;
231 
232  magmablas_ztranspose2( dAP, maxm, dAT(s,s), lddat, nb0, rows);
233  magma_zgetmatrix( rows, nb0, dAP, maxm, work, lddwork );
234 
235  // do the cpu part
236  lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo);
237  if ( (*info == 0) && (iinfo > 0) )
238  *info = iinfo + s*nb;
239  magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb );
240 
241  // upload i-th panel
242  magma_zsetmatrix( rows, nb0, work, lddwork, dAP, maxm );
243  magmablas_ztranspose2( dAT(s,s), lddat, dAP, maxm, rows, nb0);
244 
246  n-s*nb-nb0, nb0,
247  c_one, dAT(s,s), lddat,
248  dAT(s,s)+nb0, lddat);
249 
250  if ( m == n ) {
251  magmablas_ztranspose_inplace( m, dAT, lddat );
252  }
253  else {
254  magmablas_ztranspose2( dA, ldda, dAT, lddat, n, m );
255  magma_free( dAT );
256  }
257 
258  magma_free( dAP );
259  magma_free_pinned( work );
260 
261  magma_queue_destroy( stream[0] );
262  if (current_stream == NULL) {
263  magma_queue_destroy( stream[1] );
265  }
266  }
267  return *info;
268 } /* End of MAGMA_ZGETRF_GPU */
#define min(a, b)
Definition: common_magma.h:86
#define lapackf77_zgetrf
Definition: magma_zlapack.h:64
#define magma_queue_create(queuePtr)
Definition: magma.h:113
#define __func__
Definition: common_magma.h:65
#define magma_zgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_z.h:714
void magmablas_zpermute_long2(cuDoubleComplex *, magma_int_t, magma_int_t *, magma_int_t, magma_int_t)
#define MagmaUpper
Definition: magma.h:61
#define MAGMA_Z_NEG_ONE
Definition: magma.h:134
static magma_err_t magma_zmalloc(magmaDoubleComplex_ptr *ptrPtr, size_t n)
Definition: magma.h:80
#define MAGMA_ERR_DEVICE_ALLOC
Definition: magma_types.h:276
#define magma_free(ptr)
Definition: magma.h:57
#define magma_free_pinned(ptr)
Definition: magma.h:60
#define magma_zgetmatrix(m, n, dA_src, ldda, hB_dst, ldb)
Definition: magmablas_z.h:705
int magma_int_t
Definition: magmablas.h:12
#define dAT(i, j)
#define magma_queue_destroy(queue)
Definition: magma.h:116
void magma_zgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dB, magma_int_t lddb, magmaDoubleComplex beta, magmaDoubleComplex_ptr dC, magma_int_t lddc)
void magma_ztrsm(magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
void magmablas_ztranspose_inplace(magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda)
magma_int_t ldda
static magma_err_t magma_zmalloc_pinned(magmaDoubleComplex **ptrPtr, size_t n)
Definition: magma.h:92
#define MAGMA_SUCCESS
Definition: magma.h:106
#define magma_zsetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_z.h:711
static magma_err_t magma_zmalloc_cpu(magmaDoubleComplex **ptrPtr, size_t n)
Definition: magma.h:86
#define MagmaRight
Definition: magma.h:69
#define MagmaUnit
Definition: magma.h:66
#define MAGMA_Z_ONE
Definition: magma.h:132
#define MagmaNoTrans
Definition: magma.h:57
#define magma_zsetmatrix(m, n, hA_src, lda, dB_dst, lddb)
Definition: magmablas_z.h:702
#define max(a, b)
Definition: common_magma.h:82
void magmablas_ztranspose2(cuDoubleComplex *, magma_int_t, cuDoubleComplex *, magma_int_t, magma_int_t, magma_int_t)
cublasStatus_t magmablasGetKernelStream(magma_queue_t *stream)
magma_int_t magma_get_zgetrf_nb(magma_int_t m)
Definition: get_nb.cpp:322
magma_err_t magma_free_cpu(void *ptr)
#define MAGMA_ERR_HOST_ALLOC
Definition: magma_types.h:275
#define dA(dev, i, j)
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function: