11 #include "common_magma.h"
/* When building for PRECISION_s or PRECISION_d, route the magma_dgemm and
 * magma_dtrsm names to their magmablas_ counterparts. */
15 #if (defined(PRECISION_s) || defined(PRECISION_d))
16 #define magma_dgemm magmablas_dgemm
17 #define magma_dtrsm magmablas_dtrsm
/* For PRECISION_s only, magma_sgemm is additionally mapped to
 * magmablas_sgemm_fermi80.
 * NOTE(review): the directives closing these conditionals are not visible
 * here -- confirm the nesting before editing. */
21 #if (defined(PRECISION_s))
23 #define magma_sgemm magmablas_sgemm_fermi80
28 #include "../testing/flops.h"
/* FLOPS(n): operation count used by the timing printouts, built from
 * FMULS_POTRF(n) and FADDS_POTRF(n) (presumably supplied by
 * ../testing/flops.h -- TODO confirm).
 * For PRECISION_z / PRECISION_c each multiply is weighted by 6 and each add
 * by 2; the second definition weights both by 1.
 * NOTE(review): the directive separating the two definitions is not visible
 * here -- confirm that the second one is the alternative branch. */
30 #if defined(PRECISION_z) || defined(PRECISION_c)
31 #define FLOPS(n) ( 6. * FMULS_POTRF(n) + 2. * FADDS_POTRF(n) )
33 #define FLOPS(n) ( FMULS_POTRF(n) + FADDS_POTRF(n) )
/* Pointer-arithmetic helpers: each yields base + (j)*stride + (i), so i is
 * the unit-stride index and j is scaled by the stride.
 *   A(i, j)       base a,        stride lda
 *   dA(d, i, j)   base dwork[d], stride lddla
 *   dT(d, i, j)   base dt[d],    stride ldda
 *   dAup(d, i, j) base dwork[d], stride NB
 *   dTup(d, i, j) base dt[d],    stride nb
 * The macros capture a, dwork, dt, lda, lddla, ldda, NB and nb from the
 * scope where they are expanded. At the visible call sites the stride used
 * by each macro is the same value passed as the following leading-dimension
 * argument (nb with dTup, NB with dAup, ldda with dT, lddla with dA). */
51 #define A(i, j) (a +(j)*lda + (i))
52 #define dA(d, i, j) (dwork[(d)]+(j)*lddla + (i))
53 #define dT(d, i, j) (dt[(d)] +(j)*ldda + (i))
54 #define dAup(d, i, j) (dwork[(d)]+(j)*NB + (i))
55 #define dTup(d, i, j) (dt[(d)] +(j)*nb + (i))
126 char uplo_[2] = {
uplo, 0};
127 magma_int_t ldda, lddla, ldwrk, nb, iinfo, n_local[4], J2, d, num_gpus;
130 float d_neg_one = -1.0;
132 #if CUDA_VERSION > 3010
135 unsigned int totalMem;
138 static cudaStream_t stream[4][3];
140 #ifdef ROW_MAJOR_PROFILE
142 float chol_time = 1.0;
149 }
else if (lda <
max(1,n)) {
162 if( num_gpus0 > n/nb ) {
164 if( n%nb != 0 ) num_gpus ++;
166 num_gpus = num_gpus0;
168 ldda = n/(nb*num_gpus);
169 if( n%(nb*num_gpus) != 0 ) ldda++;
170 ldda = num_gpus*((nb*ldda+31)/32)*32;
173 cuDeviceGet( &dev, 0);
174 cuDeviceTotalMem( &totalMem, dev );
175 totalMem /=
sizeof(cuFloatComplex);
177 NB = (
magma_int_t)(num_gpus*(0.8*totalMem/ldda-2*nb));
179 #ifdef CHECK_CPOTRF_OOC
180 printf(
" * still fit in GPU memory.\n" );
184 #ifdef CHECK_CPOTRF_OOC
185 printf(
" * don't fit in GPU memory.\n" );
189 #ifdef CHECK_CPOTRF_OOC
190 if( NB != n ) printf(
" * running in out-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
191 else printf(
" * running in in-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
194 ldda = ((n+31)/32)*32;
195 lddla = ((nb*(1+n/(nb*num_gpus))+31)/32)*32;
196 for (d=0; d<num_gpus; d++ ) {
202 dwork[d] = &dt[d][2*nb*ldda];
207 #ifdef ROW_MAJOR_PROFILE
217 if (nb <= 1 || nb >= n) {
229 for( J=0; J<n; J+=
NB ) {
232 if( num_gpus0 > (n-J)/nb ) {
234 if( (n-J)%nb != 0 ) num_gpus ++;
236 num_gpus = num_gpus0;
240 magma_chtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
242 #ifdef ROW_MAJOR_PROFILE
246 for( j=0; j<J; j+=nb ) {
248 for( d=0; d<num_gpus; d++ ) {
252 dTup(d, 0, J), nb, stream[d][0] );
257 for( jj=J+JB; jj<n; jj+=nb ) {
258 d = ((jj-J)/nb)%num_gpus;
264 dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] );
270 for( jj=0; jj<JB; jj+=nb ) {
271 d = (jj/nb)%num_gpus;
274 J2 = (jj/(nb*num_gpus))*nb;
280 c_neg_one,
dTup(d, 0, J ), nb,
281 dTup(d, 0, J+jb), nb,
282 c_one,
dAup(d, 0, J2), NB);
285 d_neg_one,
dTup(d, 0, J+jb), nb,
286 d_one,
dAup(d, jb, J2), NB);
290 for( d=0; d<num_gpus; d++ ) {
293 n_local[d] = (((n-J)/nb)/num_gpus)*nb;
294 if (d < ((n-J)/nb)%num_gpus)
296 else if (d == ((n-J)/nb)%num_gpus)
297 n_local[d] += (n-J)%nb;
300 n_local[d] -= ((JB/nb)/num_gpus)*nb;
301 if (d < (JB/nb)%num_gpus)
304 J2 = nb*(JB/(nb*num_gpus));
305 if( d < (JB/nb)%num_gpus ) J2+=nb;
309 c_neg_one,
dTup(d, 0, J ), nb,
310 dTup(d, 0, J+JB), nb,
311 c_one,
dAup(d, 0, J2), NB);
317 magma_cpotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
322 #ifdef ROW_MAJOR_PROFILE
328 magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
336 for( J=0; J<n; J+=
NB ) {
338 if( num_gpus0 > (n-J)/nb ) {
340 if( (n-J)%nb != 0 ) num_gpus ++;
342 num_gpus = num_gpus0;
346 magma_chtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
349 #ifdef ROW_MAJOR_PROFILE
352 for( j=0; j<J; j+=nb ) {
354 for( d=0; d<num_gpus; d++ ) {
358 dT(d, J, 0), ldda, stream[d][0] );
363 for( jj=J+JB; jj<n; jj+=nb ) {
364 d = ((jj-J)/nb)%num_gpus;
370 dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] );
376 for( jj=0; jj<JB; jj+=nb ) {
377 d = (jj/nb)%num_gpus;
380 J2 = (jj/(nb*num_gpus))*nb;
386 c_neg_one,
dT(d, J+jb, 0), ldda,
388 c_one,
dA(d, J2, 0), lddla);
391 d_neg_one,
dT(d, J+jb, 0), ldda,
392 d_one,
dA(d, J2, jb ), lddla);
396 for( d=0; d<num_gpus; d++ ) {
399 n_local[d] = (((n-J)/nb)/num_gpus)*nb;
400 if (d < ((n-J)/nb)%num_gpus)
402 else if (d == ((n-J)/nb)%num_gpus)
403 n_local[d] += (n-J)%nb;
406 n_local[d] -= ((JB/nb)/num_gpus)*nb;
407 if (d < (JB/nb)%num_gpus)
410 J2 = nb*(JB/(nb*num_gpus));
411 if( d < (JB/nb)%num_gpus ) J2+=nb;
415 c_neg_one,
dT(d, J+JB, 0), ldda,
417 c_one,
dA(d, J2, 0), lddla);
422 magma_cpotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
427 #ifdef ROW_MAJOR_PROFILE
433 magma_cdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
438 #ifdef ROW_MAJOR_PROFILE
441 if( num_gpus0 > n/nb ) {
443 if( n%nb != 0 ) num_gpus ++;
445 num_gpus = num_gpus0;
447 for (d=0; d<num_gpus; d++ ) {
458 #ifdef ROW_MAJOR_PROFILE
459 printf(
"\n n=%d NB=%d nb=%d\n",n,NB,nb);
460 printf(
" Without memory allocation: %f / %f = %f GFlop/s\n",
FLOPS((
float)n)/1000000,
GetTimerValue(start0, end0),
462 printf(
" Performance %f / %f = %f GFlop/s\n",
FLOPS((
float)n)/1000000, chol_time,
FLOPS( (
float)n ) / (1000000*chol_time));