MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
dsyr2k_mgpu_spec.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.4.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  August 2013
7 
8  @generated d Tue Aug 13 16:45:25 2013
9  @author Mark Gates
10  @author Azzam Haidar
11 */
12 #include "common_magma.h"
13 
14 /*
15  Purpose
16  =======
17  DSYR2K performs one of the symmetric rank 2k operations
18 
19  C := alpha*A*B**T + conjg( alpha )*B*A**T + beta*C,
20 
21  or
22 
23  C := alpha*A**T*B + conjg( alpha )*B**T*A + beta*C,
24 
25  where alpha and beta are scalars with beta real, C is an n by n
26  symmetric matrix and A and B are n by k matrices in the first case
27  and k by n matrices in the second case.
28 
29  This version assumes C has been symmetrized, so both upper and lower are
30  stored, and it maintains the symmetry, doing twice the operations.
31 
32  Arguments
33  ==========
34 
35  UPLO (input) CHARACTER*1.
36  On entry, UPLO specifies whether the upper or lower
37  triangular part of the array C is to be referenced as
38  follows:
39 
40  UPLO = 'U' or 'u' Only the upper triangular part of C
41  is to be referenced.
42 
43  UPLO = 'L' or 'l' Only the lower triangular part of C
44  is to be referenced.
45 
46  **** current only Lower case is implemented.
47 
48  TRANS (input) CHARACTER*1.
49  On entry, TRANS specifies the operation to be performed as
50  follows:
51 
52  TRANS = 'N' or 'n'
53  C := alpha*A*B**T + conjg( alpha )*B*A**T + beta*C.
54 
55  TRANS = 'C' or 'c'
56  C := alpha*A**T*B + conjg( alpha )*B**T*A + beta*C.
57 
58  **** current only NoTrans case is implemented.
59 
60  N (input) INTEGER.
61  On entry, N specifies the order of the matrix C. N must be
62  at least zero.
63 
64  K (input) INTEGER.
65  On entry with TRANS = 'N' or 'n', K specifies the number
66  of columns of the matrices A and B, and on entry with
67  TRANS = 'C' or 'c', K specifies the number of rows of the
68  matrices A and B. K must be at least zero.
69 
70  ALPHA (input) COMPLEX*16.
71  On entry, ALPHA specifies the scalar alpha.
72 
73  dA (input) COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is
74  k when TRANS = 'N' or 'n', and is n otherwise.
75  Before entry with TRANS = 'N' or 'n', the leading n by k
76  part of the array A must contain the matrix A, otherwise
77  the leading k by n part of the array A must contain the
78  matrix A.
79 
80  [TODO: describe distribution: duplicated on all GPUs.]
81 
82  LDA (input) INTEGER.
83  On entry, LDA specifies the first dimension of A as declared
84  in the calling (sub) program. When TRANS = 'N' or 'n'
85  then LDA must be at least max( 1, n ), otherwise LDA must
86  be at least max( 1, k ).
87 
88  AOFFSET (input) INTEGER
89  Row offset to start sub-matrix of dA. Uses dA(aoffset:aoffset+n, :).
90  0 <= aoffset < lda.
91 
92  dB (input) COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is
93  k when TRANS = 'N' or 'n', and is n otherwise.
94  Before entry with TRANS = 'N' or 'n', the leading n by k
95  part of the array B must contain the matrix B, otherwise
96  the leading k by n part of the array B must contain the
97  matrix B.
98 
99  [TODO: describe distribution: duplicated on all GPUs.]
100 
101  LDB (input) INTEGER.
102  On entry, LDB specifies the first dimension of B as declared
103  in the calling (sub) program. When TRANS = 'N' or 'n'
104  then LDB must be at least max( 1, n ), otherwise LDB must
105  be at least max( 1, k ).
106 
107  BOFFSET (input) INTEGER
108  Row offset to start sub-matrix of dB. Uses dB(boffset:boffset+n, :).
109  0 <= boffset < ldb.
110 
111  BETA (input) DOUBLE PRECISION.
112  On entry, BETA specifies the scalar beta.
113 
114  dC (input/output) COMPLEX*16 array of DIMENSION ( LDC, n ).
115  Before entry with UPLO = 'U' or 'u', the leading n by n
116  upper triangular part of the array C must contain the upper
117  triangular part of the symmetric matrix and the strictly
118  lower triangular part of C is not referenced. On exit, the
119  upper triangular part of the array C is overwritten by the
120  upper triangular part of the updated matrix.
121 
122  Before entry with UPLO = 'L' or 'l', the leading n by n
123  lower triangular part of the array C must contain the lower
124  triangular part of the symmetric matrix and the strictly
125  upper triangular part of C is not referenced. On exit, the
126  lower triangular part of the array C is overwritten by the
127  lower triangular part of the updated matrix.
128 
129  Note that the imaginary parts of the diagonal elements need
130  not be set, they are assumed to be zero, and on exit they
131  are set to zero. [TODO: verify]
132 
133  [TODO: describe distribution: 1D column block-cyclic across GPUs.]
134 
135  LDC (input) INTEGER.
136  On entry, LDC specifies the first dimension of C as declared
137  in the calling (sub) program. LDC must be at least max( 1, n ).
138 
139  COFFSET (input) INTEGER.
140  Row and column offset to start sub-matrix of dC.
141  Uses dC(coffset:coffset+n, coffset:coffset+n).
142  0 <= coffset < ldc.
143 
144  NGPU (input) INTEGER.
145  Number of GPUs over which matrix C is distributed.
146 
147  NB (input) INTEGER.
148  Block size used for distribution of C.
149 
150  STREAMS (input) array of CUDA streams, of dimension NGPU by 20.
151  Streams to use for running multiple GEMMs in parallel.
152  Only up to NSTREAM streams are used on each GPU.
153 
154  NSTREAM (input) INTEGER.
155  Number of streams to use on each device
156 
157 */
158 
159 extern "C"
161  char uplo, char trans, magma_int_t n, magma_int_t k,
162  double alpha, double *dA[], magma_int_t lda, magma_int_t aoffset,
163  double *dB[], magma_int_t ldb, magma_int_t boffset,
164  double beta, double *dC[], magma_int_t ldc, magma_int_t coffset,
165  magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream )
166 {
167  #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) )
168  #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) )
169  #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc)
170 
171  /* Check arguments */
172  magma_int_t info = 0;
173  if ( ! (uplo == 'l' || uplo == 'L')) {
174  info = -1; // 'u' not yet handled
175  } else if ( ! (trans == 'n' || trans == 'N')) {
176  info = -2; // 'c' not yet handled
177  } else if ( n < 0 ) {
178  info = -3;
179  } else if ( k < 0 ) {
180  info = -4;
181  } else if ( ((trans == 'n' || trans == 'N') && lda < max(1,n)) ||
182  ((trans == 'c' || trans == 'C') && lda < max(1,k)) ) {
183  info = -7;
184  } else if ( aoffset < 0 || aoffset > lda ) {
185  info = -8;
186  } else if ( ((trans == 'n' || trans == 'N') && ldb < max(1,n)) ||
187  ((trans == 'c' || trans == 'C') && ldb < max(1,k)) ) {
188  info = -10;
189  } else if ( boffset < 0 || boffset > ldb ) {
190  info = -11;
191  } else if ( ldc < max(1,n) ) {
192  info = -13;
193  } else if ( coffset < 0 || coffset > ldc ) {
194  info = -14;
195  } else if ( ngpu <= 0 ) {
196  info = -15;
197  } else if ( nb <= 0 ) {
198  info = -16;
199  } else if ( nstream <= 0 ) {
200  info = -18;
201  }
202  if ( info != 0 ) {
203  magma_xerbla( __func__, -(info) );
204  return;
205  }
206 
207  const double c_one = MAGMA_D_ONE;
208  double cbeta = MAGMA_D_MAKE( beta, 0. );
209 
210  magma_int_t ib, ioff, iblock, idev, di, s;
211 
212  magma_device_t cdev;
213  magma_queue_t cqueue;
214  magma_getdevice( &cdev );
215  magmablasGetKernelStream( &cqueue );
216 
217  // loop over all blocks
218  // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C
219  // blockoffset is offset within first block; for subsequent blocks it is 0
220  magma_int_t blockoffset = coffset % nb;
221  for( magma_int_t i = 0; i < n; i += ib ) {
222  ib = min( nb-blockoffset, n-i ); // block size
223  ioff = i + coffset; // global index in parent matrix
224  iblock = (ioff / nb) / ngpu; // local block id
225  idev = (ioff / nb) % ngpu; // device with this block
226  di = iblock*nb + blockoffset; // local index in parent matrix
227 
228  magma_setdevice( idev );
229  s = iblock % nstream;
230  magmablasSetKernelStream( streams[ idev ][ s ] );
231 
232  // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i]
233  //printf( "dgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
234  magma_dgemm( MagmaNoTrans, MagmaTrans, n, ib, k,
235  alpha, dA(idev,0,0), lda,
236  dB(idev,i,0), ldb,
237  cbeta, dC(idev,coffset,di), ldc );
238  blockoffset = 0;
239  }
240 
241  // second loop does C = (alpha)*B*A' + C_hat
242  alpha = MAGMA_D_CNJG( alpha );
243  blockoffset = coffset % nb;
244  for( magma_int_t i = 0; i < n; i += ib ) {
245  ib = min( nb-blockoffset, n-i ); // block size
246  ioff = i + coffset; // global index in parent matrix
247  iblock = (ioff / nb) / ngpu; // local block id
248  idev = (ioff / nb) % ngpu; // device with this block
249  di = iblock*nb + blockoffset; // local index in parent matrix
250 
251  magma_setdevice( idev );
252  s = iblock % nstream;
253  magmablasSetKernelStream( streams[ idev ][ s ] );
254 
255  // C[i:n,i] += (alpha) * B[i:n,0] * A[i,0]'
256  //printf( "dgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
257  magma_dgemm( MagmaNoTrans, MagmaTrans, n, ib, k,
258  alpha, dB(idev,0,0), ldb,
259  dA(idev,i,0), lda,
260  c_one, dC(idev,coffset,di), ldc );
261  blockoffset = 0;
262  }
263 
264  magma_setdevice( cdev );
265  magmablasSetKernelStream( cqueue );
266 }
#define min(a, b)
Definition: common_magma.h:86
#define MAGMA_D_ONE
Definition: magma.h:176
#define __func__
Definition: common_magma.h:65
#define MAGMA_D_CNJG(v, t)
Definition: magma.h:164
int magma_int_t
Definition: magmablas.h:12
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
#define MAGMA_D_MAKE(r, i)
Definition: magma.h:167
void magma_getdevice(magma_device_t *dev)
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
void magmablas_dsyr2k_mgpu_spec(magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_ptr dA[], magma_int_t lda, magma_int_t aoff, magmaDouble_ptr dB[], magma_int_t ldb, magma_int_t boff, double beta, magmaDouble_ptr dC[], magma_int_t ldc, magma_int_t offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream)
#define dB(dev, i, j)
#define MagmaTrans
Definition: magma.h:58
void magma_dgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc)
#define MagmaNoTrans
Definition: magma.h:57
#define max(a, b)
Definition: common_magma.h:82
cublasStatus_t magmablasGetKernelStream(magma_queue_t *stream)
#define dC(dev, i, j)
#define dA(dev, i, j)