MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
dsyr2k_mgpu.cpp
/*
    -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

       @generated d Tue Aug 13 16:45:24 2013
       @author Mark Gates
       @author Azzam Haidar
*/
#include "common_magma.h"

/*
    Purpose
    =======
    DSYR2K performs one of the symmetric rank 2k operations

        C := alpha*A*B**T + alpha*B*A**T + beta*C,

    or

        C := alpha*A**T*B + alpha*B**T*A + beta*C,

    where alpha and beta are scalars, C is an n by n symmetric matrix,
    and A and B are n by k matrices in the first case and k by n
    matrices in the second case.

    Arguments
    ==========

    UPLO    (input) CHARACTER*1.
            On entry, UPLO specifies whether the upper or lower
            triangular part of the array C is to be referenced as
            follows:

               UPLO = 'U' or 'u'   Only the upper triangular part of C
                                   is to be referenced.

               UPLO = 'L' or 'l'   Only the lower triangular part of C
                                   is to be referenced.

            **** Currently only the Lower case is implemented.

    TRANS   (input) CHARACTER*1.
            On entry, TRANS specifies the operation to be performed as
            follows:

               TRANS = 'N' or 'n'
                  C := alpha*A*B**T + alpha*B*A**T + beta*C.

               TRANS = 'C' or 'c'
                  C := alpha*A**T*B + alpha*B**T*A + beta*C.

            **** Currently only the NoTrans case is implemented.

    N       (input) INTEGER.
            On entry, N specifies the order of the matrix C. N must be
            at least zero.

    K       (input) INTEGER.
            On entry with TRANS = 'N' or 'n', K specifies the number
            of columns of the matrices A and B, and on entry with
            TRANS = 'C' or 'c', K specifies the number of rows of the
            matrices A and B. K must be at least zero.

    ALPHA   (input) DOUBLE PRECISION.
            On entry, ALPHA specifies the scalar alpha.

    dA      (input) DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
            k when TRANS = 'N' or 'n', and is n otherwise.
            Before entry with TRANS = 'N' or 'n', the leading n by k
            part of the array A must contain the matrix A, otherwise
            the leading k by n part of the array A must contain the
            matrix A.

            [TODO: describe distribution: duplicated on all GPUs.]

    LDA     (input) INTEGER.
            On entry, LDA specifies the first dimension of A as declared
            in the calling (sub) program. When TRANS = 'N' or 'n'
            then LDA must be at least max( 1, n ), otherwise LDA must
            be at least max( 1, k ).

    AOFFSET (input) INTEGER
            Row offset to start sub-matrix of dA. Uses dA(aoffset:aoffset+n, :).
            0 <= aoffset < lda.

    dB      (input) DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
            k when TRANS = 'N' or 'n', and is n otherwise.
            Before entry with TRANS = 'N' or 'n', the leading n by k
            part of the array B must contain the matrix B, otherwise
            the leading k by n part of the array B must contain the
            matrix B.

            [TODO: describe distribution: duplicated on all GPUs.]

    LDB     (input) INTEGER.
            On entry, LDB specifies the first dimension of B as declared
            in the calling (sub) program. When TRANS = 'N' or 'n'
            then LDB must be at least max( 1, n ), otherwise LDB must
            be at least max( 1, k ).

    BOFFSET (input) INTEGER
            Row offset to start sub-matrix of dB. Uses dB(boffset:boffset+n, :).
            0 <= boffset < ldb.

    BETA    (input) DOUBLE PRECISION.
            On entry, BETA specifies the scalar beta.

    dC      (input/output) DOUBLE PRECISION array of DIMENSION ( LDC, n ).
            Before entry with UPLO = 'U' or 'u', the leading n by n
            upper triangular part of the array C must contain the upper
            triangular part of the symmetric matrix and the strictly
            lower triangular part of C is not referenced. On exit, the
            upper triangular part of the array C is overwritten by the
            upper triangular part of the updated matrix.

            Before entry with UPLO = 'L' or 'l', the leading n by n
            lower triangular part of the array C must contain the lower
            triangular part of the symmetric matrix and the strictly
            upper triangular part of C is not referenced. On exit, the
            lower triangular part of the array C is overwritten by the
            lower triangular part of the updated matrix.

            (Note carried over from the complex Hermitian variant: the
            imaginary parts of the diagonal elements need not be set,
            they are assumed to be zero, and on exit they are set to
            zero. [TODO: verify])

            [TODO: describe distribution: 1D column block-cyclic across
            GPUs; see the mapping sketch after this header.]

    LDC     (input) INTEGER.
            On entry, LDC specifies the first dimension of C as declared
            in the calling (sub) program. LDC must be at least max( 1, n ).

    COFFSET (input) INTEGER.
            Row and column offset to start sub-matrix of dC.
            Uses dC(coffset:coffset+n, coffset:coffset+n).
            0 <= coffset < ldc.

    NGPU    (input) INTEGER.
            Number of GPUs over which matrix C is distributed.

    NB      (input) INTEGER.
            Block size used for the distribution of C.

    STREAMS (input) array of CUDA streams, of dimension NGPU by 20.
            Streams to use for running multiple GEMMs in parallel.
            Only up to NSTREAM streams are used on each GPU.

    NSTREAM (input) INTEGER.
            Number of streams to use on each device.

*/

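/*
    Illustrative sketch (not part of the original routine): the 1D column
    block-cyclic layout of dC described above can be summarized by a small
    helper that maps a global column of C to the GPU that owns it and to the
    local column index in that GPU's piece of C.  The helper name and struct
    are hypothetical; the arithmetic mirrors the iblock/idev/di computation
    inside the loops below.
*/
typedef struct {
    magma_int_t dev;   // GPU that owns this column of C
    magma_int_t dcol;  // column index within that GPU's local storage
} column_owner_t;

static inline column_owner_t column_owner( magma_int_t jglobal, magma_int_t nb, magma_int_t ngpu )
{
    column_owner_t o;
    magma_int_t block = jglobal / nb;              // global block-column index
    o.dev  = block % ngpu;                         // block columns are dealt round-robin to GPUs
    o.dcol = (block / ngpu)*nb + (jglobal % nb);   // local block slot, plus offset within the block
    return o;
}
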
extern "C"
void magmablas_dsyr2k_mgpu2(
    char uplo, char trans, magma_int_t n, magma_int_t k,
    double alpha, double *dA[], magma_int_t lda, magma_int_t aoffset,
                  double *dB[], magma_int_t ldb, magma_int_t boffset,
    double beta,  double *dC[], magma_int_t ldc, magma_int_t coffset,
    magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream )
{
    #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) )
    #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) )
    #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc)
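    // Note: aoffset and boffset are row offsets, so they are added directly to the
    // device pointers (one element per row).  dC carries no built-in offset: the
    // loops below pass the global row index (ioff) and the local column index (di).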

    /* Check arguments */
    magma_int_t info = 0;
    if      ( ! (uplo == 'l' || uplo == 'L')) {
        info = -1;  // 'u' not yet handled
    } else if ( ! (trans == 'n' || trans == 'N')) {
        info = -2;  // 'c' not yet handled
    } else if ( n < 0 ) {
        info = -3;
    } else if ( k < 0 ) {
        info = -4;
    } else if ( ((trans == 'n' || trans == 'N') && lda < max(1,n)) ||
                ((trans == 'c' || trans == 'C') && lda < max(1,k)) ) {
        info = -7;
    } else if ( aoffset < 0 || aoffset > lda ) {
        info = -8;
    } else if ( ((trans == 'n' || trans == 'N') && ldb < max(1,n)) ||
                ((trans == 'c' || trans == 'C') && ldb < max(1,k)) ) {
        info = -10;
    } else if ( boffset < 0 || boffset > ldb ) {
        info = -11;
    } else if ( ldc < max(1,n) ) {
        info = -13;
    } else if ( coffset < 0 || coffset > ldc ) {
        info = -14;
    } else if ( ngpu <= 0 ) {
        info = -15;
    } else if ( nb <= 0 ) {
        info = -16;
    } else if ( nstream <= 0 ) {
        info = -18;
    }
    if ( info != 0 ) {
        magma_xerbla( __func__, -(info) );
        return;
    }

    const double c_one = MAGMA_D_ONE;
    double cbeta = MAGMA_D_MAKE( beta, 0. );
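    // In this real (d) precision, MAGMA_D_MAKE( beta, 0. ) above reduces to beta;
    // it is kept so the code matches the complex template it was generated from.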

    magma_int_t ib, ioff, iblock, idev, di, s;

    magma_device_t cdev;
    magma_queue_t cqueue;
    magma_getdevice( &cdev );
    magmablasGetKernelStream( &cqueue );

    // loop over all blocks
    // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C
    // blockoffset is offset within first block; for subsequent blocks it is 0
    magma_int_t blockoffset = coffset % nb;
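    // Example of the block-cyclic mapping computed below: with nb=64, ngpu=2 and
    // coffset=0, the block starting at global column 128 has 128/nb = 2, so it is
    // owned by device idev = 2%2 = 0 as its local block iblock = 2/2 = 1, and is
    // written starting at local column di = 1*64 + 0 = 64 of that device's piece of C.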
    for( magma_int_t i = 0; i < n; i += ib ) {
        ib     = min( nb-blockoffset, n-i );  // block size
        ioff   = i + coffset;                 // global index in parent matrix
        iblock = (ioff / nb) / ngpu;          // local block id
        idev   = (ioff / nb) % ngpu;          // device with this block
        di     = iblock*nb + blockoffset;     // local index in parent matrix

        magma_setdevice( idev );
        s = iblock % nstream;
        magmablasSetKernelStream( streams[ idev ][ s ] );

        // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i]
        //printf( "dgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
        magma_dgemm( MagmaNoTrans, MagmaTrans, n-i, ib, k,
                     alpha, dA(idev,i,0), lda,
                            dB(idev,i,0), ldb,
                     cbeta, dC(idev,ioff,di), ldc );
        blockoffset = 0;
    }

    // second loop does C = alpha*B*A' + C_hat
    alpha = MAGMA_D_CNJG( alpha );
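    // MAGMA_D_CNJG is the identity for a real alpha; the call above is kept only to
    // mirror the complex (her2k) template this file was generated from.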
    blockoffset = coffset % nb;
    for( magma_int_t i = 0; i < n; i += ib ) {
        ib     = min( nb-blockoffset, n-i );  // block size
        ioff   = i + coffset;                 // global index in parent matrix
        iblock = (ioff / nb) / ngpu;          // local block id
        idev   = (ioff / nb) % ngpu;          // device with this block
        di     = iblock*nb + blockoffset;     // local index in parent matrix

        magma_setdevice( idev );
        s = iblock % nstream;
        magmablasSetKernelStream( streams[ idev ][ s ] );

        // C[i:n,i] += alpha * B[i:n,0] * A[i,0]'
        //printf( "dgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
        magma_dgemm( MagmaNoTrans, MagmaTrans, n-i, ib, k,
                     alpha, dB(idev,i,0), ldb,
                            dA(idev,i,0), lda,
                     c_one, dC(idev,ioff,di), ldc );
        blockoffset = 0;
    }

    magma_setdevice( cdev );
    magmablasSetKernelStream( cqueue );
}
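
/*
    Hypothetical standalone driver sketch (not part of MAGMA): shows one way a
    caller could set up the duplicated dA/dB, the 1D column block-cyclic dC, and
    the per-GPU streams before calling magmablas_dsyr2k_mgpu2.  It assumes that
    magma_queue_t is a CUDA stream in this MAGMA version, uses a hypothetical
    function name, and omits error checking and data initialization for brevity.
*/
#include <cuda_runtime.h>

void example_dsyr2k_mgpu_call( magma_int_t n, magma_int_t k,
                               magma_int_t nb, magma_int_t ngpu )
{
    const magma_int_t nstream = 3;            // streams actually used per GPU (at most 20)
    magma_int_t lda = n, ldb = n, ldc = n;    // rows are not distributed, so ldc >= n
    magma_int_t nblocks = (n + nb - 1) / nb;  // number of block columns of C

    double *dA[MagmaMaxGPUs], *dB[MagmaMaxGPUs], *dC[MagmaMaxGPUs];
    magma_queue_t streams[MagmaMaxGPUs][20];

    for( magma_int_t d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );

        // A and B (n-by-k) are duplicated in full on every GPU.
        magma_dmalloc( &dA[d], lda*k );
        magma_dmalloc( &dB[d], ldb*k );

        // C is distributed: device d owns block columns d, d+ngpu, d+2*ngpu, ...
        magma_int_t nblocks_local = 0;
        for( magma_int_t b = d; b < nblocks; b += ngpu )
            ++nblocks_local;
        magma_dmalloc( &dC[d], ldc * nblocks_local * nb );

        // Streams used to overlap the per-block-column GEMMs.
        for( magma_int_t s = 0; s < nstream; ++s )
            cudaStreamCreate( &streams[d][s] );   // assumes magma_queue_t == cudaStream_t
    }

    // ... upload the host A, B and the local block columns of C to each GPU here
    //     (e.g. with magma_dsetmatrix) before performing the update ...

    // Only the Lower / NoTrans combination is implemented (see the header above);
    // all offsets are zero because the full matrices are used.
    magmablas_dsyr2k_mgpu2( 'L', 'N', n, k,
                            1.0, dA, lda, 0,
                                 dB, ldb, 0,
                            0.0, dC, ldc, 0,
                            ngpu, nb, streams, nstream );

    for( magma_int_t d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        cudaDeviceSynchronize();
        for( magma_int_t s = 0; s < nstream; ++s )
            cudaStreamDestroy( streams[d][s] );
        magma_free( dA[d] );
        magma_free( dB[d] );
        magma_free( dC[d] );
    }
}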