MAGMA  magma-1.4.0 Matrix Algebra on GPU and Multicore Architectures
dgetrf_gpu.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.4.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
6  August 2013
7
8  @author Stan Tomov
9  @generated d Tue Aug 13 16:44:10 2013
10 */
11 #include "common_magma.h"
12
13 extern "C" magma_int_t
15  double *dA, magma_int_t ldda,
16  magma_int_t *ipiv, magma_int_t *info)
17 {
18 /* -- MAGMA (version 1.4.0) --
19  Univ. of Tennessee, Knoxville
20  Univ. of California, Berkeley
22  August 2013
23
24  Purpose
25  =======
26  DGETRF computes an LU factorization of a general M-by-N matrix A
27  using partial pivoting with row interchanges.
28
29  The factorization has the form
30  A = P * L * U
31  where P is a permutation matrix, L is lower triangular with unit
32  diagonal elements (lower trapezoidal if m > n), and U is upper
33  triangular (upper trapezoidal if m < n).
34
35  This is the right-looking Level 3 BLAS version of the algorithm.
36  If the current stream is NULL, this version replaces it with user defined
37  stream to overlap computation with communication.
38
39  Arguments
40  =========
41  M (input) INTEGER
42  The number of rows of the matrix A. M >= 0.
43
44  N (input) INTEGER
45  The number of columns of the matrix A. N >= 0.
46
47  A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
48  On entry, the M-by-N matrix to be factored.
49  On exit, the factors L and U from the factorization
50  A = P*L*U; the unit diagonal elements of L are not stored.
51
52  LDDA (input) INTEGER
53  The leading dimension of the array A. LDDA >= max(1,M).
54
55  IPIV (output) INTEGER array, dimension (min(M,N))
56  The pivot indices; for 1 <= i <= min(M,N), row i of the
57  matrix was interchanged with row IPIV(i).
58
59  INFO (output) INTEGER
60  = 0: successful exit
61  < 0: if INFO = -i, the i-th argument had an illegal value
62  or another error occured, such as memory allocation failed.
63  > 0: if INFO = i, U(i,i) is exactly zero. The factorization
64  has been completed, but the factor U is exactly
65  singular, and division by zero will occur if it is used
66  to solve a system of equations.
67  ===================================================================== */
68
69  #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb)
70
71  double c_one = MAGMA_D_ONE;
72  double c_neg_one = MAGMA_D_NEG_ONE;
73
74  magma_int_t iinfo, nb;
75  magma_int_t maxm, maxn, mindim;
76  magma_int_t i, rows, cols, s, lddat, lddwork;
77  double *dAT, *dAP, *work;
78
79  /* Check arguments */
80  *info = 0;
81  if (m < 0)
82  *info = -1;
83  else if (n < 0)
84  *info = -2;
85  else if (ldda < max(1,m))
86  *info = -4;
87
88  if (*info != 0) {
89  magma_xerbla( __func__, -(*info) );
90  return *info;
91  }
92
93  /* Quick return if possible */
94  if (m == 0 || n == 0)
95  return *info;
96
97  /* Function Body */
98  mindim = min(m, n);
99  nb = magma_get_dgetrf_nb(m);
100  s = mindim / nb;
101
102  if (nb <= 1 || nb >= min(m,n)) {
103  /* Use CPU code. */
104  magma_dmalloc_cpu( &work, m * n );
105  if ( work == NULL ) {
106  *info = MAGMA_ERR_HOST_ALLOC;
107  return *info;
108  }
109  magma_dgetmatrix( m, n, dA, ldda, work, m );
110  lapackf77_dgetrf(&m, &n, work, &m, ipiv, info);
111  magma_dsetmatrix( m, n, work, m, dA, ldda );
112  magma_free_cpu(work);
113  }
114  else {
115  /* Use hybrid blocked code. */
116  maxm = ((m + 31)/32)*32;
117  maxn = ((n + 31)/32)*32;
118
119  lddat = maxn;
120  lddwork = maxm;
121
122  dAT = dA;
123
124  if (MAGMA_SUCCESS != magma_dmalloc( &dAP, nb*maxm )) {
125  *info = MAGMA_ERR_DEVICE_ALLOC;
126  return *info;
127  }
128
129  if ( m == n ) {
130  lddat = ldda;
131  magmablas_dtranspose_inplace( m, dAT, ldda );
132  }
133  else {
134  if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) {
135  magma_free( dAP );
136  *info = MAGMA_ERR_DEVICE_ALLOC;
137  return *info;
138  }
139  magmablas_dtranspose2( dAT, lddat, dA, ldda, m, n );
140  }
141
142  if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, maxm*nb )) {
143  magma_free( dAP );
144  if ( ! (m == n))
145  magma_free( dAT );
146  *info = MAGMA_ERR_HOST_ALLOC;
147  return *info;
148  }
149
150  /* Define user stream if current stream is NULL */
151  cudaStream_t stream[2], current_stream;
152  magmablasGetKernelStream(&current_stream);
153
154  magma_queue_create( &stream[0] );
155  if (current_stream == NULL) {
156  magma_queue_create( &stream[1] );
157  magmablasSetKernelStream(stream[1]);
158  }
159  else
160  stream[1] = current_stream;
161
162  for( i=0; i<s; i++ )
163  {
165  cols = maxm - i*nb;
166  //magmablas_dtranspose( dAP, cols, dAT(i,i), lddat, nb, cols );
167  magmablas_dtranspose2( dAP, cols, dAT(i,i), lddat, nb, m-i*nb );
168
169  // make sure that that the transpose has completed
170  magma_queue_sync( stream[1] );
171  magma_dgetmatrix_async( m-i*nb, nb, dAP, cols, work, lddwork,
172  stream[0]);
173
174  if ( i>0 ){
176  n - (i+1)*nb, nb,
177  c_one, dAT(i-1,i-1), lddat,
178  dAT(i-1,i+1), lddat );
180  n-(i+1)*nb, m-i*nb, nb,
181  c_neg_one, dAT(i-1,i+1), lddat,
182  dAT(i, i-1), lddat,
183  c_one, dAT(i, i+1), lddat );
184  }
185
186  // do the cpu part
187  rows = m - i*nb;
188  magma_queue_sync( stream[0] );
189  lapackf77_dgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo);
190  if ( (*info == 0) && (iinfo > 0) )
191  *info = iinfo + i*nb;
192
194  magma_dsetmatrix_async( m-i*nb, nb, work, lddwork, dAP, maxm,
195  stream[0]);
196
197  magmablas_dpermute_long2( n, dAT, lddat, ipiv, nb, i*nb );
198
199  magma_queue_sync( stream[0] );
200  //magmablas_dtranspose(dAT(i,i), lddat, dAP, maxm, cols, nb);
201  magmablas_dtranspose2(dAT(i,i), lddat, dAP, maxm, m-i*nb, nb);
202
203  // do the small non-parallel computations (next panel update)
204  if ( s > (i+1) ) {
206  nb, nb,
207  c_one, dAT(i, i ), lddat,
208  dAT(i, i+1), lddat);
210  nb, m-(i+1)*nb, nb,
211  c_neg_one, dAT(i, i+1), lddat,
212  dAT(i+1, i ), lddat,
213  c_one, dAT(i+1, i+1), lddat );
214  }
215  else {
217  n-s*nb, nb,
218  c_one, dAT(i, i ), lddat,
219  dAT(i, i+1), lddat);
221  n-(i+1)*nb, m-(i+1)*nb, nb,
222  c_neg_one, dAT(i, i+1), lddat,
223  dAT(i+1, i ), lddat,
224  c_one, dAT(i+1, i+1), lddat );
225  }
226  }
227
228  magma_int_t nb0 = min(m - s*nb, n - s*nb);
229  rows = m - s*nb;
230  cols = maxm - s*nb;
231
232  magmablas_dtranspose2( dAP, maxm, dAT(s,s), lddat, nb0, rows);
233  magma_dgetmatrix( rows, nb0, dAP, maxm, work, lddwork );
234
235  // do the cpu part
236  lapackf77_dgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo);
237  if ( (*info == 0) && (iinfo > 0) )
238  *info = iinfo + s*nb;
239  magmablas_dpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb );
240
242  magma_dsetmatrix( rows, nb0, work, lddwork, dAP, maxm );
243  magmablas_dtranspose2( dAT(s,s), lddat, dAP, maxm, rows, nb0);
244
246  n-s*nb-nb0, nb0,
247  c_one, dAT(s,s), lddat,
248  dAT(s,s)+nb0, lddat);
249
250  if ( m == n ) {
251  magmablas_dtranspose_inplace( m, dAT, lddat );
252  }
253  else {
254  magmablas_dtranspose2( dA, ldda, dAT, lddat, n, m );
255  magma_free( dAT );
256  }
257
258  magma_free( dAP );
259  magma_free_pinned( work );
260
261  magma_queue_destroy( stream[0] );
262  if (current_stream == NULL) {
263  magma_queue_destroy( stream[1] );
265  }
266  }
267  return *info;
268 } /* End of MAGMA_DGETRF_GPU */
static magma_err_t magma_dmalloc_pinned(double **ptrPtr, size_t n)
Definition: magma.h:90
#define min(a, b)
Definition: common_magma.h:86
#define MAGMA_D_ONE
Definition: magma.h:176
#define magma_dsetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_d.h:711
#define magma_queue_create(queuePtr)
Definition: magma.h:113
#define __func__
Definition: common_magma.h:65
#define MagmaUpper
Definition: magma.h:61
static magma_err_t magma_dmalloc(magmaDouble_ptr *ptrPtr, size_t n)
Definition: magma.h:78
magma_int_t magma_dgetrf_gpu(magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
Definition: dgetrf_gpu.cpp:14
#define MAGMA_ERR_DEVICE_ALLOC
Definition: magma_types.h:276
magma_int_t ldda
#define magma_free(ptr)
Definition: magma.h:57
void magma_dtrsm(magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb)
#define magma_free_pinned(ptr)
Definition: magma.h:60
int magma_int_t
Definition: magmablas.h:12
#define magma_dgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_d.h:714
#define magma_dgetmatrix(m, n, dA_src, ldda, hB_dst, ldb)
Definition: magmablas_d.h:705
#define magma_queue_destroy(queue)
Definition: magma.h:116
void magmablas_dtranspose_inplace(magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void lapackf77_dgetrf(const magma_int_t *m, const magma_int_t *n, double *A, const magma_int_t *lda, magma_int_t *ipiv, magma_int_t *info)
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
void magmablas_dpermute_long2(magma_int_t n, magmaDouble_ptr dAT, magma_int_t ldda, magma_int_t *ipiv, magma_int_t nb, magma_int_t ind)
static magma_err_t magma_dmalloc_cpu(double **ptrPtr, size_t n)
Definition: magma.h:84
#define MAGMA_SUCCESS
Definition: magma.h:106
void magma_dgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc)
#define MagmaRight
Definition: magma.h:69
void magmablas_dtranspose2(magmaDouble_ptr odata, magma_int_t ldo, magmaDouble_const_ptr idata, magma_int_t ldi, magma_int_t m, magma_int_t n)
#define MagmaUnit
Definition: magma.h:66
#define MAGMA_D_NEG_ONE
Definition: magma.h:178
#define MagmaNoTrans
Definition: magma.h:57
#define max(a, b)
Definition: common_magma.h:82
magma_int_t magma_get_dgetrf_nb(magma_int_t m)
Definition: get_nb.cpp:285
cublasStatus_t magmablasGetKernelStream(magma_queue_t *stream)
magma_err_t magma_free_cpu(void *ptr)
#define MAGMA_ERR_HOST_ALLOC
Definition: magma_types.h:275
#define dAT(i, j)
#define dA(dev, i, j)
#define magma_dsetmatrix(m, n, hA_src, lda, dB_dst, lddb)
Definition: magmablas_d.h:702
#define magma_queue_sync(queue)
Definition: magma.h:119