MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
dgetrf_gpu.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.2.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  May 2012
7 
8  @generated d Thu May 10 22:26:45 2012
9 
10 */
11 #include "common_magma.h"
12 
13 // === Define what BLAS to use ============================================
14 #define PRECISION_d
15 #if (defined(PRECISION_s) || defined(PRECISION_d))
16  #define magma_dgemm magmablas_dgemm
17  #define magma_dtrsm magmablas_dtrsm
18 #endif
19 // === End defining what BLAS to use =======================================
20 
21 extern "C" magma_int_t
23  double *dA, magma_int_t ldda,
24  magma_int_t *ipiv, magma_int_t *info)
25 {
26 /* -- MAGMA (version 1.2.0) --
27  Univ. of Tennessee, Knoxville
28  Univ. of California, Berkeley
29  Univ. of Colorado, Denver
30  May 2012
31 
32  Purpose
33  =======
34 
35  DGETRF computes an LU factorization of a general M-by-N matrix A
36  using partial pivoting with row interchanges.
37 
38  The factorization has the form
39  A = P * L * U
40  where P is a permutation matrix, L is lower triangular with unit
41  diagonal elements (lower trapezoidal if m > n), and U is upper
42  triangular (upper trapezoidal if m < n).
43 
44  This is the right-looking Level 3 BLAS version of the algorithm.
45 
46  Arguments
47  =========
48 
49  M (input) INTEGER
50  The number of rows of the matrix A. M >= 0.
51 
52  N (input) INTEGER
53  The number of columns of the matrix A. N >= 0.
54 
55  A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
56  On entry, the M-by-N matrix to be factored.
57  On exit, the factors L and U from the factorization
58  A = P*L*U; the unit diagonal elements of L are not stored.
59 
60  LDDA (input) INTEGER
61  The leading dimension of the array A. LDDA >= max(1,M).
62 
63  IPIV (output) INTEGER array, dimension (min(M,N))
64  The pivot indices; for 1 <= i <= min(M,N), row i of the
65  matrix was interchanged with row IPIV(i).
66 
67  INFO (output) INTEGER
68  = 0: successful exit
69  < 0: if INFO = -i, the i-th argument had an illegal value
70  or another error occured, such as memory allocation failed.
71  > 0: if INFO = i, U(i,i) is exactly zero. The factorization
72  has been completed, but the factor U is exactly
73  singular, and division by zero will occur if it is used
74  to solve a system of equations.
75  ===================================================================== */
76 
77 #define inAT(i,j) (dAT + (i)*nb*lddat + (j)*nb)
78 
79  double c_one = MAGMA_D_ONE;
80  double c_neg_one = MAGMA_D_NEG_ONE;
81 
82  magma_int_t iinfo, nb;
83  magma_int_t maxm, maxn, mindim;
84  magma_int_t i, rows, cols, s, lddat, lddwork;
85  double *dAT, *dAP, *work;
86 
87  /* Check arguments */
88  *info = 0;
89  if (m < 0)
90  *info = -1;
91  else if (n < 0)
92  *info = -2;
93  else if (ldda < max(1,m))
94  *info = -4;
95 
96  if (*info != 0) {
97  magma_xerbla( __func__, -(*info) );
98  return *info;
99  }
100 
101  /* Quick return if possible */
102  if (m == 0 || n == 0)
103  return *info;
104 
105  /* Function Body */
106  mindim = min(m, n);
107  nb = magma_get_dgetrf_nb(m);
108  s = mindim / nb;
109 
110  if (nb <= 1 || nb >= min(m,n)) {
111  /* Use CPU code. */
112  work = (double*)malloc(m * n * sizeof(double));
113  magma_dgetmatrix( m, n, dA, ldda, work, m );
114  lapackf77_dgetrf(&m, &n, work, &m, ipiv, info);
115  magma_dsetmatrix( m, n, work, m, dA, ldda );
116  free(work);
117  }
118  else {
119  /* Use hybrid blocked code. */
120  maxm = ((m + 31)/32)*32;
121  maxn = ((n + 31)/32)*32;
122 
123  lddat = maxn;
124  lddwork = maxm;
125 
126  dAT = dA;
127 
128  if (MAGMA_SUCCESS != magma_dmalloc( &dAP, nb*maxm )) {
129  *info = MAGMA_ERR_DEVICE_ALLOC;
130  return *info;
131  }
132 
133  if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)){
134  lddat = ldda;
135  magmablas_dinplace_transpose( dAT, ldda, m);
136  }
137  else {
138  if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) {
139  magma_free( dAP );
140  *info = MAGMA_ERR_DEVICE_ALLOC;
141  return *info;
142  }
143  magmablas_dtranspose2( dAT, lddat, dA, ldda, m, n );
144  }
145 
146  if (MAGMA_SUCCESS != magma_dmalloc_host( &work, maxm*nb )) {
147  magma_free( dAP );
148  if (! ((m == n) && (m % 32 == 0) && (ldda%32 == 0)) )
149  magma_free( dAT );
150  *info = MAGMA_ERR_HOST_ALLOC;
151  return *info;
152  }
153 
154  for( i=0; i<s; i++ )
155  {
156  // download i-th panel
157  cols = maxm - i*nb;
158  magmablas_dtranspose( dAP, cols, inAT(i,i), lddat, nb, cols );
159  magma_dgetmatrix( m-i*nb, nb, dAP, cols, work, lddwork );
160 
161  // make sure that gpu queue is empty
163 
164  if ( i>0 ){
166  n - (i+1)*nb, nb,
167  c_one, inAT(i-1,i-1), lddat,
168  inAT(i-1,i+1), lddat );
170  n-(i+1)*nb, m-i*nb, nb,
171  c_neg_one, inAT(i-1,i+1), lddat,
172  inAT(i, i-1), lddat,
173  c_one, inAT(i, i+1), lddat );
174  }
175 
176  // do the cpu part
177  rows = m - i*nb;
178  lapackf77_dgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo);
179  if ( (*info == 0) && (iinfo > 0) )
180  *info = iinfo + i*nb;
181 
182  magmablas_dpermute_long2( dAT, lddat, ipiv, nb, i*nb );
183 
184  // upload i-th panel
185  magma_dsetmatrix( m-i*nb, nb, work, lddwork, dAP, maxm );
186  magmablas_dtranspose(inAT(i,i), lddat, dAP, maxm, cols, nb);
187 
188  // do the small non-parallel computations
189  if ( s > (i+1) ) {
191  nb, nb,
192  c_one, inAT(i, i ), lddat,
193  inAT(i, i+1), lddat);
195  nb, m-(i+1)*nb, nb,
196  c_neg_one, inAT(i, i+1), lddat,
197  inAT(i+1, i ), lddat,
198  c_one, inAT(i+1, i+1), lddat );
199  }
200  else {
202  n-s*nb, nb,
203  c_one, inAT(i, i ), lddat,
204  inAT(i, i+1), lddat);
206  n-(i+1)*nb, m-(i+1)*nb, nb,
207  c_neg_one, inAT(i, i+1), lddat,
208  inAT(i+1, i ), lddat,
209  c_one, inAT(i+1, i+1), lddat );
210  }
211  }
212 
213  magma_int_t nb0 = min(m - s*nb, n - s*nb);
214  rows = m - s*nb;
215  cols = maxm - s*nb;
216 
217  magmablas_dtranspose2( dAP, maxm, inAT(s,s), lddat, nb0, rows);
218  magma_dgetmatrix( rows, nb0, dAP, maxm, work, lddwork );
219 
220  // make sure that gpu queue is empty
222 
223  // do the cpu part
224  lapackf77_dgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo);
225  if ( (*info == 0) && (iinfo > 0) )
226  *info = iinfo + s*nb;
227  magmablas_dpermute_long2( dAT, lddat, ipiv, nb0, s*nb );
228 
229  // upload i-th panel
230  magma_dsetmatrix( rows, nb0, work, lddwork, dAP, maxm );
231  magmablas_dtranspose2( inAT(s,s), lddat, dAP, maxm, rows, nb0);
232 
234  n-s*nb-nb0, nb0,
235  c_one, inAT(s,s), lddat,
236  inAT(s,s)+nb0, lddat);
237 
238  if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)){
239  magmablas_dinplace_transpose( dAT, lddat, m );
240  }
241  else {
242  magmablas_dtranspose2( dA, ldda, dAT, lddat, n, m );
243  magma_free( dAT );
244  }
245 
246  magma_free( dAP );
247  magma_free_host( work );
248  }
249  return *info;
250 
251  /* End of MAGMA_DGETRF_GPU */
252 }
253 
254 #undef inAT