MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
cpotrf2_ooc.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.2.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  May 2012
7 
8  @generated c Thu May 10 22:26:43 2012
9 
10 */
11 #include "common_magma.h"
12 
13 /* === Define what BLAS to use ============================================ */
14 #define PRECISION_c
15 #if (defined(PRECISION_s) || defined(PRECISION_d))
16  #define magma_dgemm magmablas_dgemm
17  #define magma_dtrsm magmablas_dtrsm
18 #endif
19 
20 #if (GPUSHMEM >= 200)
21 #if (defined(PRECISION_s))
22  #undef magma_sgemm
23  #define magma_sgemm magmablas_sgemm_fermi80
24  #endif
25 #endif
26 /* === End defining what BLAS to use ====================================== */
27 // Flops formula
28 #include "../testing/flops.h"
29 #define PRECISION_c
30 #if defined(PRECISION_z) || defined(PRECISION_c)
31 #define FLOPS(n) ( 6. * FMULS_POTRF(n) + 2. * FADDS_POTRF(n) )
32 #else
33 #define FLOPS(n) ( FMULS_POTRF(n) + FADDS_POTRF(n) )
34 #endif
35 
36 extern "C" magma_int_t
37 magma_chtodpo(int num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
38  cuFloatComplex *h_A, magma_int_t lda, cuFloatComplex **d_lA, magma_int_t ldda, cudaStream_t **stream,
39  magma_int_t *info);
40 
41 extern "C" magma_int_t
42 magma_cdtohpo(int num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magma_int_t NB,
43  cuFloatComplex *a, magma_int_t lda, cuFloatComplex **work, magma_int_t ldda, cudaStream_t **stream,
44  magma_int_t *info);
45 
46 extern "C" magma_int_t
47 magma_cpotrf3_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
48  cuFloatComplex **d_lA, magma_int_t ldda, cuFloatComplex **d_lP, magma_int_t lddlp,
49  cuFloatComplex *work, magma_int_t ldwrk, cudaStream_t **streaml, magma_int_t *info);
50 
51 #define A(i, j) (a +(j)*lda + (i))
52 #define dA(d, i, j) (dwork[(d)]+(j)*lddla + (i))
53 #define dT(d, i, j) (dt[(d)] +(j)*ldda + (i))
54 #define dAup(d, i, j) (dwork[(d)]+(j)*NB + (i))
55 #define dTup(d, i, j) (dt[(d)] +(j)*nb + (i))
56 
57 extern "C" magma_int_t
59  cuFloatComplex *a, magma_int_t lda, magma_int_t *info)
60 {
61 /* -- MAGMA (version 1.2.0) --
62  Univ. of Tennessee, Knoxville
63  Univ. of California, Berkeley
64  Univ. of Colorado, Denver
65  May 2012
66 
67  Purpose
68  =======
69 
70  CPOTRF_OOC computes the Cholesky factorization of a complex Hermitian
71  positive definite matrix A. This version does not require work
72  space on the GPU passed as input. GPU memory is allocated in the
73  routine. The matrix A may not fit entirely in the GPU memory.
74 
75  The factorization has the form
76  A = U**H * U, if UPLO = 'U', or
77  A = L * L**H, if UPLO = 'L',
78  where U is an upper triangular matrix and L is lower triangular.
79 
80  This is the block version of the algorithm, calling Level 3 BLAS.
81 
82  Arguments
83  =========
84 
85  UPLO (input) CHARACTER*1
86  = 'U': Upper triangle of A is stored;
87  = 'L': Lower triangle of A is stored.
88 
89  N (input) INTEGER
90  The order of the matrix A. N >= 0.
91 
92  A (input/output) COMPLEX array, dimension (LDA,N)
93  On entry, the symmetric matrix A. If UPLO = 'U', the leading
94  N-by-N upper triangular part of A contains the upper
95  triangular part of the matrix A, and the strictly lower
96  triangular part of A is not referenced. If UPLO = 'L', the
97  leading N-by-N lower triangular part of A contains the lower
98  triangular part of the matrix A, and the strictly upper
99  triangular part of A is not referenced.
100 
101  On exit, if INFO = 0, the factor U or L from the Cholesky
102  factorization A = U**H * U or A = L * L**H.
103 
104  Higher performance is achieved if A is in pinned memory, e.g.
105  allocated using magma_malloc_host.
106 
107  LDA (input) INTEGER
108  The leading dimension of the array A. LDA >= max(1,N).
109 
110  INFO (output) INTEGER
111  = 0: successful exit
112  < 0: if INFO = -i, the i-th argument had an illegal value
113  or another error occured, such as memory allocation failed.
114  > 0: if INFO = i, the leading minor of order i is not
115  positive definite, and the factorization could not be
116  completed.
117 
118  ===================================================================== */
119 
120 
121  /* Local variables */
122  cuFloatComplex c_one = MAGMA_C_ONE;
123  cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
124  cuFloatComplex *dwork[4], *dt[4], *work;
125 
126  char uplo_[2] = {uplo, 0};
127  magma_int_t ldda, lddla, ldwrk, nb, iinfo, n_local[4], J2, d, num_gpus;
128  static magma_int_t j, jj, jb, jb1, jb2, jb3, J, JB, NB, MB;
129  float d_one = 1.0;
130  float d_neg_one = -1.0;
131  long int upper = lapackf77_lsame(uplo_, "U");
132 #if CUDA_VERSION > 3010
133  size_t totalMem;
134 #else
135  unsigned int totalMem;
136 #endif
137  CUdevice dev;
138  static cudaStream_t stream[4][3];
139 //#define ROW_MAJOR_PROFILE
140 #ifdef ROW_MAJOR_PROFILE
141  magma_timestr_t start, end, start0, end0;
142  float chol_time = 1.0;
143 #endif
144  *info = 0;
145  if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) {
146  *info = -1;
147  } else if (n < 0) {
148  *info = -2;
149  } else if (lda < max(1,n)) {
150  *info = -4;
151  }
152  if (*info != 0) {
153  magma_xerbla( __func__, -(*info) );
154  return *info;
155  }
156 
157  /* Quick return */
158  if ( n == 0 )
159  return *info;
160 
161  nb = magma_get_dpotrf_nb(n);
162  if( num_gpus0 > n/nb ) {
163  num_gpus = n/nb;
164  if( n%nb != 0 ) num_gpus ++;
165  } else {
166  num_gpus = num_gpus0;
167  }
168  ldda = n/(nb*num_gpus);
169  if( n%(nb*num_gpus) != 0 ) ldda++;
170  ldda = num_gpus*((nb*ldda+31)/32)*32;
171 
172  /* figure out NB */
173  cuDeviceGet( &dev, 0);
174  cuDeviceTotalMem( &totalMem, dev );
175  totalMem /= sizeof(cuFloatComplex);
176  MB = n; /* number of rows in the big panel */
177  NB = (magma_int_t)(num_gpus*(0.8*totalMem/ldda-2*nb)); /* number of columns in the big panel */
178  if( NB >= n ) {
179 #ifdef CHECK_CPOTRF_OOC
180  printf( " * still fit in GPU memory.\n" );
181 #endif
182  NB = n;
183  } else {
184 #ifdef CHECK_CPOTRF_OOC
185  printf( " * don't fit in GPU memory.\n" );
186 #endif
187  NB = (NB / nb) * nb; /* making sure it's devisable by nb */
188  }
189 #ifdef CHECK_CPOTRF_OOC
190  if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
191  else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
192  fflush(stdout);
193 #endif
194  ldda = ((n+31)/32)*32;
195  lddla = ((nb*(1+n/(nb*num_gpus))+31)/32)*32;
196  for (d=0; d<num_gpus; d++ ) {
197  magma_setdevice(d);
198  if (MAGMA_SUCCESS != magma_cmalloc( &dt[d], NB*lddla + 2*nb*ldda )) {
199  *info = MAGMA_ERR_DEVICE_ALLOC;
200  return *info;
201  }
202  dwork[d] = &dt[d][2*nb*ldda];
203  magma_queue_create( &stream[d][0] );
204  magma_queue_create( &stream[d][1] );
205  magma_queue_create( &stream[d][2] );
206  }
207 #ifdef ROW_MAJOR_PROFILE
208  start0 = get_current_time();
209 #endif
210  magma_setdevice(0);
211  ldwrk = n;
212  if (MAGMA_SUCCESS != magma_cmalloc_host( &work, ldwrk*nb )) {
213  *info = MAGMA_ERR_HOST_ALLOC;
214  return *info;
215  }
216 
217  if (nb <= 1 || nb >= n) {
218  lapackf77_cpotrf(uplo_, &n, a, &lda, info);
219  } else {
220 
221  /* Use hybrid blocked code. */
222  if (upper) {
223  /* =========================================================== *
224  * Compute the Cholesky factorization A = U'*U. *
225  * big panel is divided by block-row and distributed in block *
226  * column cyclic format */
227 
228  /* for each big-panel */
229  for( J=0; J<n; J+=NB ) {
230  JB = min(NB,n-J);
231  jb = min(JB,nb);
232  if( num_gpus0 > (n-J)/nb ) {
233  num_gpus = (n-J)/nb;
234  if( (n-J)%nb != 0 ) num_gpus ++;
235  } else {
236  num_gpus = num_gpus0;
237  }
238 
239  /* load the new big-panel by block-rows */
240  magma_chtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
241 
242 #ifdef ROW_MAJOR_PROFILE
243  start = get_current_time();
244 #endif
245  /* update with the previous big-panels */
246  for( j=0; j<J; j+=nb ) {
247  /* upload the diagonal of big panel */
248  for( d=0; d<num_gpus; d++ ) {
249  magma_setdevice(d);
250  magma_csetmatrix_async( nb, JB,
251  A(j, J), lda,
252  dTup(d, 0, J), nb, stream[d][0] );
253  n_local[d] = 0;
254  }
255 
256  /* upload off-diagonals */
257  for( jj=J+JB; jj<n; jj+=nb ) {
258  d = ((jj-J)/nb)%num_gpus;
259  magma_setdevice(d);
260 
261  jb2 = min(nb, n-jj);
262  magma_csetmatrix_async( nb, jb2,
263  A(j, jj), lda,
264  dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] );
265  n_local[d] += jb2;
266  }
267 
268  /* update the current big-panel using the previous block-row */
269  jb3 = nb; //min(nb,J-j); // number of columns in this previous block-column (nb)
270  for( jj=0; jj<JB; jj+=nb ) { /* diagonal */
271  d = (jj/nb)%num_gpus;
272  magma_setdevice(d);
273 
274  J2 = (jj/(nb*num_gpus))*nb;
275  jb1 = min(JB,jj+nb); // first row in the next block-row
276  jb2 = min(nb,JB-jj); // number of rows in this current block-row
277  jb = jj; //jb1-jb2; // number of columns in the off-diagona blocks (jj)
279  jb, jb2, nb,
280  c_neg_one, dTup(d, 0, J ), nb,
281  dTup(d, 0, J+jb), nb,
282  c_one, dAup(d, 0, J2), NB);
283 
285  d_neg_one, dTup(d, 0, J+jb), nb,
286  d_one, dAup(d, jb, J2), NB);
287  }
288 
289  if( n > J+JB ) { /* off-diagonal */
290  for( d=0; d<num_gpus; d++ ) {
291  magma_setdevice(d);
292  /* local number of columns in the big panel */
293  n_local[d] = (((n-J)/nb)/num_gpus)*nb;
294  if (d < ((n-J)/nb)%num_gpus)
295  n_local[d] += nb;
296  else if (d == ((n-J)/nb)%num_gpus)
297  n_local[d] += (n-J)%nb;
298 
299  /* local number of columns in diagonal */
300  n_local[d] -= ((JB/nb)/num_gpus)*nb;
301  if (d < (JB/nb)%num_gpus)
302  n_local[d] -= nb;
303 
304  J2 = nb*(JB/(nb*num_gpus));
305  if( d < (JB/nb)%num_gpus ) J2+=nb;
306 
308  JB, n_local[d], nb,
309  c_neg_one, dTup(d, 0, J ), nb,
310  dTup(d, 0, J+JB), nb,
311  c_one, dAup(d, 0, J2), NB);
312  }
313  }
314  } /* end of updates with previous rows */
315 
316  /* factor the big panel */
317  magma_cpotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
318  if( iinfo != 0 ) {
319  *info = J+iinfo;
320  break;
321  }
322 #ifdef ROW_MAJOR_PROFILE
323  end = get_current_time();
324  chol_time += GetTimerValue(start, end);
325 #endif
326 
327  /* upload the off-diagonal (and diagonal!!!) big panel */
328  magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
329 
330  }
331  } else {
332  /* ========================================================= *
333  * Compute the Cholesky factorization A = L*L'. */
334 
335  /* for each big-panel */
336  for( J=0; J<n; J+=NB ) {
337  JB = min(NB,n-J);
338  if( num_gpus0 > (n-J)/nb ) {
339  num_gpus = (n-J)/nb;
340  if( (n-J)%nb != 0 ) num_gpus ++;
341  } else {
342  num_gpus = num_gpus0;
343  }
344 
345  /* load the new big-panel by block-columns */
346  magma_chtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
347 
348  /* update with the previous big-panels */
349 #ifdef ROW_MAJOR_PROFILE
350  start = get_current_time();
351 #endif
352  for( j=0; j<J; j+=nb ) {
353  /* upload the diagonal of big panel */
354  for( d=0; d<num_gpus; d++ ) {
355  magma_setdevice(d);
356  magma_csetmatrix_async( JB, nb,
357  A(J, j), lda,
358  dT(d, J, 0), ldda, stream[d][0] );
359  n_local[d] = 0;
360  }
361 
362  /* upload off-diagonals */
363  for( jj=J+JB; jj<n; jj+=nb ) {
364  d = ((jj-J)/nb)%num_gpus;
365  magma_setdevice(d);
366 
367  jb2 = min(nb, n-jj);
368  magma_csetmatrix_async( jb2, nb,
369  A(jj, j), lda,
370  dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] );
371  n_local[d] += jb2;
372  }
373 
374  /* update the current big-panel using the previous block-row */
375  jb3 = nb; //min(nb,J-j);
376  for( jj=0; jj<JB; jj+=nb ) { /* diagonal */
377  d = (jj/nb)%num_gpus;
378  magma_setdevice(d);
379 
380  J2 = (jj/(nb*num_gpus))*nb;
381  jb1 = min(JB,jj+nb);
382  jb2 = min(nb,JB-jj);
383  jb = jj; //jb1-jb2;
385  jb2, jb, nb,
386  c_neg_one, dT(d, J+jb, 0), ldda,
387  dT(d, J, 0), ldda,
388  c_one, dA(d, J2, 0), lddla);
389 
391  d_neg_one, dT(d, J+jb, 0), ldda,
392  d_one, dA(d, J2, jb ), lddla);
393  }
394 
395  if( n > J+JB ) { /* off-diagonal */
396  for( d=0; d<num_gpus; d++ ) {
397  magma_setdevice(d);
398  /* local number of columns in the big panel */
399  n_local[d] = (((n-J)/nb)/num_gpus)*nb;
400  if (d < ((n-J)/nb)%num_gpus)
401  n_local[d] += nb;
402  else if (d == ((n-J)/nb)%num_gpus)
403  n_local[d] += (n-J)%nb;
404 
405  /* local number of columns in diagonal */
406  n_local[d] -= ((JB/nb)/num_gpus)*nb;
407  if (d < (JB/nb)%num_gpus)
408  n_local[d] -= nb;
409 
410  J2 = nb*(JB/(nb*num_gpus));
411  if( d < (JB/nb)%num_gpus ) J2+=nb;
412 
414  n_local[d], JB, nb,
415  c_neg_one, dT(d, J+JB, 0), ldda,
416  dT(d, J, 0), ldda,
417  c_one, dA(d, J2, 0), lddla);
418  }
419  }
420  }
421  /* factor the big panel */
422  magma_cpotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
423  if( iinfo != 0 ) {
424  *info = J+iinfo;
425  break;
426  }
427 #ifdef ROW_MAJOR_PROFILE
428  end = get_current_time();
429  chol_time += GetTimerValue(start, end);
430 #endif
431  /* upload the off-diagonal big panel */
432  //magma_cdtohpo( num_gpus, &uplo, n, JB, J, J, nb, NB, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
433  magma_cdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
434 
435  } /* end of for J */
436  } /* if upper */
437  } /* if nb */
438 #ifdef ROW_MAJOR_PROFILE
439  end0 = get_current_time();
440 #endif
441  if( num_gpus0 > n/nb ) {
442  num_gpus = n/nb;
443  if( n%nb != 0 ) num_gpus ++;
444  } else {
445  num_gpus = num_gpus0;
446  }
447  for (d=0; d<num_gpus; d++ ) {
448  magma_setdevice(d);
449 
450  magma_free( dt[d] );
451  magma_queue_destroy( stream[d][0] );
452  magma_queue_destroy( stream[d][1] );
453  magma_queue_destroy( stream[d][2] );
454  }
455  magma_setdevice(0);
456  magma_free_host( work );
457 
458 #ifdef ROW_MAJOR_PROFILE
459  printf("\n n=%d NB=%d nb=%d\n",n,NB,nb);
460  printf(" Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS((float)n)/1000000, GetTimerValue(start0, end0),
461  FLOPS((float)n)/(1000000*GetTimerValue(start0, end0)));
462  printf(" Performance %f / %f = %f GFlop/s\n", FLOPS((float)n)/1000000, chol_time, FLOPS( (float)n ) / (1000000*chol_time));
463 #endif
464  return *info;
465 } /* magma_cpotrf_ooc */
466 
467 #undef A
468 #undef dA
469 #undef dT
470 #undef dAup
471 #undef dTup