MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
magmablas_c.h File Reference
#include "magma_types.h"
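Include dependency graph for magmablas_c.h: [graph image not reproduced in this text version]
This graph shows which files directly or indirectly include this file: [graph image not reproduced in this text version]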

Go to the source code of this file.

Macros

#define COMPLEX
 
#define magma_csetvector(n, hx_src, incx, dy_dst, incy)   magma_csetvector_internal( n, hx_src, incx, dy_dst, incy, __func__, __FILE__, __LINE__ )
 
#define magma_cgetvector(n, dx_src, incx, hy_dst, incy)   magma_cgetvector_internal( n, dx_src, incx, hy_dst, incy, __func__, __FILE__, __LINE__ )
 
#define magma_csetvector_async(n, hx_src, incx, dy_dst, incy, queue)   magma_csetvector_async_internal( n, hx_src, incx, dy_dst, incy, queue, __func__, __FILE__, __LINE__ )
 
#define magma_cgetvector_async(n, dx_src, incx, hy_dst, incy, queue)   magma_cgetvector_async_internal( n, dx_src, incx, hy_dst, incy, queue, __func__, __FILE__, __LINE__ )
 
#define magma_ccopyvector_async(n, dx_src, incx, dy_dst, incy, queue)   magma_ccopyvector_async_internal( n, dx_src, incx, dy_dst, incy, queue, __func__, __FILE__, __LINE__ )
 
#define magma_ccopyvector_async(n, dx_src, incx, dy_dst, incy, queue)   magma_ccopyvector_async_internal( n, dx_src, incx, dy_dst, incy, queue, __func__, __FILE__, __LINE__ )
 
#define magma_csetmatrix(m, n, hA_src, lda, dB_dst, lddb)   magma_csetmatrix_internal( m, n, hA_src, lda, dB_dst, lddb, __func__, __FILE__, __LINE__ )
 
#define magma_cgetmatrix(m, n, dA_src, ldda, hB_dst, ldb)   magma_cgetmatrix_internal( m, n, dA_src, ldda, hB_dst, ldb, __func__, __FILE__, __LINE__ )
 
#define magma_ccopymatrix(m, n, dA_src, ldda, dB_dst, lddb)   magma_ccopymatrix_internal( m, n, dA_src, ldda, dB_dst, lddb, __func__, __FILE__, __LINE__ )
 
#define magma_csetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)   magma_csetmatrix_async_internal( m, n, hA_src, lda, dB_dst, lddb, queue, __func__, __FILE__, __LINE__ )
 
#define magma_cgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)   magma_cgetmatrix_async_internal( m, n, dA_src, ldda, hB_dst, ldb, queue, __func__, __FILE__, __LINE__ )
 
#define magma_ccopymatrix_async(m, n, dA_src, ldda, dB_dst, lddb, queue)   magma_ccopymatrix_async_internal( m, n, dA_src, ldda, dB_dst, lddb, queue, __func__, __FILE__, __LINE__ )
 

Functions

float cpu_gpu_cdiff (magma_int_t m, magma_int_t n, const magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_const_ptr dA, magma_int_t ldda)
 
void czero_32x32_block (magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void czero_nbxnb_block (magma_int_t nb, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magmablas_cpermute_long2 (magma_int_t n, magmaFloatComplex_ptr dAT, magma_int_t ldda, magma_int_t *ipiv, magma_int_t nb, magma_int_t ind)
 
void magmablas_cpermute_long3 (magmaFloatComplex_ptr dAT, magma_int_t ldda, const magma_int_t *ipiv, magma_int_t nb, magma_int_t ind)
 
void magmablas_ctranspose_inplace (magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magmablas_ctranspose (magmaFloatComplex_ptr odata, magma_int_t ldo, magmaFloatComplex_const_ptr idata, magma_int_t ldi, magma_int_t m, magma_int_t n)
 
void magmablas_ctranspose2 (magmaFloatComplex_ptr odata, magma_int_t ldo, magmaFloatComplex_const_ptr idata, magma_int_t ldi, magma_int_t m, magma_int_t n)
 
void magmablas_ctranspose2s (magmaFloatComplex_ptr odata, magma_int_t ldo, magmaFloatComplex_const_ptr idata, magma_int_t ldi, magma_int_t m, magma_int_t n, magma_queue_t stream)
 
void magmablas_cgetmatrix_transpose (magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dAT, magma_int_t ldda, magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dwork, magma_int_t lddwork, magma_int_t nb)
 
void magmablas_csetmatrix_transpose (magma_int_t m, magma_int_t n, const magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dAT, magma_int_t ldda, magmaFloatComplex_ptr dwork, magma_int_t lddwork, magma_int_t nb)
 
void magmablas_cgetmatrix_transpose_mgpu (magma_int_t ngpu, magma_queue_t stream[][2], magmaFloatComplex_ptr dAT[], magma_int_t ldda, magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dB[], magma_int_t lddb, magma_int_t m, magma_int_t n, magma_int_t nb)
 
void magmablas_csetmatrix_transpose_mgpu (magma_int_t ngpu, magma_queue_t stream[][2], const magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dAT[], magma_int_t ldda, magmaFloatComplex_ptr dB[], magma_int_t lddb, magma_int_t m, magma_int_t n, magma_int_t nb)
 
void magma_cgetmatrix_1D_col_bcyclic (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA[], magma_int_t ldda, magmaFloatComplex *hA, magma_int_t lda, magma_int_t ngpu, magma_int_t nb)
 
void magma_csetmatrix_1D_col_bcyclic (magma_int_t m, magma_int_t n, const magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t ngpu, magma_int_t nb)
 
void magma_cgetmatrix_1D_row_bcyclic (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA[], magma_int_t ldda, magmaFloatComplex *hA, magma_int_t lda, magma_int_t ngpu, magma_int_t nb)
 
void magma_csetmatrix_1D_row_bcyclic (magma_int_t m, magma_int_t n, const magmaFloatComplex *hA, magma_int_t lda, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t ngpu, magma_int_t nb)
 
magma_int_t magma_chtodhe (magma_int_t num_gpus, char *uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex **dwork, magma_int_t ldda, magma_queue_t stream[][10], magma_int_t *info)
 
magma_int_t magma_chtodpo (magma_int_t num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaFloatComplex *h_A, magma_int_t lda, magmaFloatComplex *d_lA[], magma_int_t ldda, magma_queue_t stream[][3], magma_int_t *info)
 
magma_int_t magma_cdtohpo (magma_int_t num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magma_int_t NB, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *work[], magma_int_t ldda, magma_queue_t stream[][3], magma_int_t *info)
 
magma_int_t magmablas_chemv_mgpu_offset (char uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex **A, magma_int_t lda, magmaFloatComplex **X, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex **Y, magma_int_t incy, magmaFloatComplex **work, magma_int_t lwork, magma_int_t num_gpus, magma_int_t nb, magma_int_t offset, magma_queue_t stream[][10])
 
magma_int_t magmablas_chemv_mgpu_32_offset (char uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex **A, magma_int_t lda, magmaFloatComplex **X, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex **Y, magma_int_t incy, magmaFloatComplex **work, magma_int_t lwork, magma_int_t num_gpus, magma_int_t nb, magma_int_t offset, magma_queue_t stream[][10])
 
magma_int_t magmablas_chemv_mgpu (magma_int_t num_gpus, magma_int_t k, char uplo, magma_int_t n, magma_int_t nb, magmaFloatComplex alpha, magmaFloatComplex **da, magma_int_t ldda, magma_int_t offset, magmaFloatComplex **dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex **dy, magma_int_t incy, magmaFloatComplex **dwork, magma_int_t ldwork, magmaFloatComplex *work, magmaFloatComplex *w, magma_queue_t stream[][10])
 
magma_int_t magmablas_chemv_sync (magma_int_t num_gpus, magma_int_t k, magma_int_t n, magmaFloatComplex *work, magmaFloatComplex *w, magma_queue_t stream[][10])
 
void magmablas_chemm_1gpu_old (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex_ptr dB[], magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magmaFloatComplex *C, magma_int_t ldc, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream)
 
void magmablas_chemm_1gpu (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex_ptr dB[], magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magmaFloatComplex *C, magma_int_t ldc, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream)
 
void magmablas_chemm_mgpu (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex_ptr dB[], magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magmaFloatComplex_ptr dwork[], magma_int_t lddwork, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][20], magma_int_t nbevents)
 
void magmablas_chemm_mgpu_com (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex_ptr dB[], magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magmaFloatComplex_ptr dwork[], magma_int_t lddwork, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx)
 
void magmablas_chemm_mgpu_spec (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex_ptr dB[], magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magmaFloatComplex_ptr dwork[], magma_int_t lddwork, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx)
 
void magmablas_chemm_mgpu_spec33 (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloatComplex_ptr dB[], magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magmaFloatComplex_ptr dVIN[], magma_int_t lddv, magma_int_t voffst, magmaFloatComplex_ptr dwork[], magma_int_t lddwork, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx)
 
void magma_cher2k_mgpu (magma_int_t num_gpus, char uplo, char trans, magma_int_t nb, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex **db, magma_int_t lddb, magma_int_t boffset, float beta, magmaFloatComplex **dc, magma_int_t lddc, magma_int_t offset, magma_int_t num_streams, magma_queue_t streams[][10])
 
void magmablas_cher2k_mgpu2 (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t aoff, magmaFloatComplex_ptr dB[], magma_int_t lddb, magma_int_t boff, float beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magma_int_t offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream)
 
void magmablas_cher2k_mgpu_spec (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t lda, magma_int_t aoff, magmaFloatComplex_ptr dB[], magma_int_t ldb, magma_int_t boff, float beta, magmaFloatComplex_ptr dC[], magma_int_t ldc, magma_int_t offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream)
 
void magmablas_cher2k_mgpu_spec324 (magma_uplo_t uplo, magma_trans_t trans, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dVIN[], magma_int_t lddv, magma_int_t voff, magmaFloatComplex_ptr dWIN[], magma_int_t lddw, magma_int_t woff, float beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magma_int_t offset, magmaFloatComplex_ptr dwork[], magma_int_t lndwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10], magma_int_t nbevents)
 
void magmablas_cher2k_mgpu_spec325 (magma_uplo_t uplo, magma_trans_t trans, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dVIN[], magma_int_t lddv, magma_int_t voff, magmaFloatComplex_ptr dWIN[], magma_int_t lddw, magma_int_t woff, float beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magma_int_t offset, magmaFloatComplex_ptr dwork[], magma_int_t lndwork, magma_int_t ngpu, magma_int_t nb, magmaFloatComplex **harray[], magmaFloatComplex_ptr *darray[], magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10], magma_int_t nbevents)
 
void magmablas_cgeadd (magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb)
 
void magmablas_cgeadd_batched (magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr const *dAarray, magma_int_t ldda, magmaFloatComplex_ptr *dBarray, magma_int_t lddb, magma_int_t batchCount)
 
void magmablas_clacpy (magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb)
 
void magmablas_clacpy_batched (magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr const *dAarray, magma_int_t ldda, magmaFloatComplex_ptr *dBarray, magma_int_t lddb, magma_int_t batchCount)
 
float magmablas_clange (magma_norm_t norm, magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloat_ptr dwork)
 
float magmablas_clanhe (magma_norm_t norm, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloat_ptr dwork)
 
float magmablas_clansy (magma_norm_t norm, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloat_ptr dwork)
 
void magmablas_clascl (char type, magma_int_t kl, magma_int_t ku, float cfrom, float cto, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *info)
 
void magmablas_claset (magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magmablas_claset_identity (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magmablas_claswp (magma_int_t n, magmaFloatComplex_ptr dAT, magma_int_t ldda, magma_int_t i1, magma_int_t i2, const magma_int_t *ipiv, magma_int_t inci)
 
void magmablas_claswpx (magma_int_t n, magmaFloatComplex_ptr dAT, magma_int_t ldx, magma_int_t ldy, magma_int_t i1, magma_int_t i2, const magma_int_t *ipiv, magma_int_t inci)
 
void magmablas_claswp2 (magma_int_t n, magmaFloatComplex_ptr dAT, magma_int_t ldda, magma_int_t i1, magma_int_t i2, const magma_int_t *d_ipiv)
 
void magmablas_csymmetrize (magma_uplo_t uplo, magma_int_t m, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magmablas_csymmetrize_tiles (magma_uplo_t uplo, magma_int_t m, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t ntile, magma_int_t mstride, magma_int_t nstride)
 
void magma_clarfgx_gpu (magma_int_t n, magmaFloatComplex *dx0, magmaFloatComplex *dx, magmaFloatComplex *dtau, float *dxnorm, magmaFloatComplex *ddx0, magma_int_t iter)
 
void magma_clarfx_gpu (magma_int_t m, magma_int_t n, magmaFloatComplex *v, magmaFloatComplex *tau, magmaFloatComplex *c, magma_int_t ldc, float *xnorm, magmaFloatComplex *dT, magma_int_t iter, magmaFloatComplex *work)
 
void magma_clarfbx_gpu (magma_int_t m, magma_int_t k, magmaFloatComplex *V, magma_int_t ldv, magmaFloatComplex *dT, magma_int_t ldt, magmaFloatComplex *c, magmaFloatComplex *dwork)
 
void magma_clarfgtx_gpu (magma_int_t n, magmaFloatComplex *dx0, magmaFloatComplex *dx, magmaFloatComplex *dtau, float *dxnorm, magmaFloatComplex *dA, magma_int_t it, magmaFloatComplex *V, magma_int_t ldv, magmaFloatComplex *T, magma_int_t ldt, magmaFloatComplex *dwork)
 
void magmablas_scnrm2_adjust (magma_int_t k, float *xnorm, magmaFloatComplex *c)
 
void magmablas_scnrm2_cols (magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloat_ptr dxnorm)
 
void magmablas_scnrm2_row_check_adjust (magma_int_t k, float tol, float *xnorm, float *xnorm2, magmaFloatComplex *c, magma_int_t ldc, float *lsticc)
 
void magmablas_scnrm2_check (magma_int_t m, magma_int_t num, magmaFloatComplex *da, magma_int_t ldda, float *dxnorm, float *lsticc)
 
void magmablas_cswap (magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb)
 
void magmablas_cswapblk (magma_storev_t storev, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb, magma_int_t i1, magma_int_t i2, const magma_int_t *ipiv, magma_int_t inci, magma_int_t offset)
 
void magmablas_cswapdblk (magma_int_t n, magma_int_t nb, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t inca, magmaFloatComplex_ptr dB, magma_int_t lddb, magma_int_t incb)
 
void magmablas_cgemv (magma_trans_t trans, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, magma_int_t incy)
 
magma_int_t magmablas_chemv (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, magma_int_t incy)
 
magma_int_t magmablas_chemv2 (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dX, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dY, magma_int_t incy, magmaFloatComplex_ptr dwork, magma_int_t lwork)
 
magma_int_t magmablas_csymv (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, magma_int_t incy)
 
void magmablas_cgemm (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magmablas_cgemm_reduce (magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, const magmaFloatComplex *dA, magma_int_t lda, const magmaFloatComplex *dB, magma_int_t ldb, magmaFloatComplex beta, magmaFloatComplex *dC, magma_int_t ldc)
 
void magmablas_cgemm_fermi80 (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magmablas_cgemm_fermi64 (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magmablas_chemm (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magmablas_csymm (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magmablas_csyrk (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magmablas_cherk (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, float alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, float beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magmablas_csyr2k (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magmablas_cher2k (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, float beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magma_csetvector_internal (magma_int_t n, magmaFloatComplex const *hx_src, magma_int_t incx, magmaFloatComplex_ptr dy_dst, magma_int_t incy, const char *func, const char *file, int line)
 
void magma_cgetvector_internal (magma_int_t n, magmaFloatComplex_const_ptr dx_src, magma_int_t incx, magmaFloatComplex *hy_dst, magma_int_t incy, const char *func, const char *file, int line)
 
magma_err_t magma_ccopyvector_internal (magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dx_src, magma_int_t incx, magmaFloatComplex_ptr dy_dst, magma_int_t incy, const char *func, const char *file, int line)
 
void magma_csetvector_async_internal (magma_int_t n, magmaFloatComplex const *hx_src, magma_int_t incx, magmaFloatComplex_ptr dy_dst, magma_int_t incy, magma_queue_t queue, const char *func, const char *file, int line)
 
void magma_cgetvector_async_internal (magma_int_t n, magmaFloatComplex_const_ptr dx_src, magma_int_t incx, magmaFloatComplex *hy_dst, magma_int_t incy, magma_queue_t queue, const char *func, const char *file, int line)
 
magma_err_t magma_ccopyvector_async_internal (magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dx_src, magma_int_t incx, magmaFloatComplex_ptr dy_dst, magma_int_t incy, magma_queue_t queue, const char *func, const char *file, int line)
 
void magma_csetmatrix_internal (magma_int_t m, magma_int_t n, magmaFloatComplex const *hA_src, magma_int_t ldha, magmaFloatComplex_ptr dB_dst, magma_int_t lddb, const char *func, const char *file, int line)
 
void magma_cgetmatrix_internal (magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dA_src, magma_int_t ldda, magmaFloatComplex *hB_dst, magma_int_t ldhb, const char *func, const char *file, int line)
 
void magma_ccopymatrix_internal (magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dA_src, magma_int_t ldda, magmaFloatComplex_ptr dB_dst, magma_int_t lddb, const char *func, const char *file, int line)
 
void magma_csetmatrix_async_internal (magma_int_t m, magma_int_t n, magmaFloatComplex const *hA_src, magma_int_t ldha, magmaFloatComplex_ptr dB_dst, magma_int_t lddb, magma_queue_t queue, const char *func, const char *file, int line)
 
void magma_cgetmatrix_async_internal (magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dA_src, magma_int_t ldda, magmaFloatComplex *hB_dst, magma_int_t ldhb, magma_queue_t queue, const char *func, const char *file, int line)
 
void magma_ccopymatrix_async_internal (magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dA_src, magma_int_t ldda, magmaFloatComplex_ptr dB_dst, magma_int_t lddb, magma_queue_t queue, const char *func, const char *file, int line)
 
magma_int_t magma_icamax (magma_int_t n, magmaFloatComplex_const_ptr dx, magma_int_t incx)
 
magma_int_t magma_icamin (magma_int_t n, magmaFloatComplex_const_ptr dx, magma_int_t incx)
 
float magma_scasum (magma_int_t n, magmaFloatComplex_const_ptr dx, magma_int_t incx)
 
void magma_caxpy (magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex_ptr dy, magma_int_t incy)
 
void magma_ccopy (magma_int_t n, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex_ptr dy, magma_int_t incy)
 
magmaFloatComplex magma_cdotc (magma_int_t n, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex_const_ptr dy, magma_int_t incy)
 
magmaFloatComplex magma_cdotu (magma_int_t n, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex_const_ptr dy, magma_int_t incy)
 
float magma_scnrm2 (magma_int_t n, magmaFloatComplex_const_ptr dx, magma_int_t incx)
 
void magma_crot (magma_int_t n, magmaFloatComplex_ptr dx, magma_int_t incx, magmaFloatComplex_ptr dy, magma_int_t incy, float dc, magmaFloatComplex ds)
 
void magma_csrot (magma_int_t n, magmaFloatComplex_ptr dx, magma_int_t incx, magmaFloatComplex_ptr dy, magma_int_t incy, float dc, float ds)
 
void magma_cscal (magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_ptr dx, magma_int_t incx)
 
void magma_csscal (magma_int_t n, float alpha, magmaFloatComplex_ptr dx, magma_int_t incx)
 
void magma_cswap (magma_int_t n, magmaFloatComplex_ptr dx, magma_int_t incx, magmaFloatComplex_ptr dy, magma_int_t incy)
 
void magma_cgemv (magma_trans_t transA, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, magma_int_t incy)
 
void magma_cgerc (magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex_const_ptr dy, magma_int_t incy, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magma_cgeru (magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex_const_ptr dy, magma_int_t incy, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magma_chemv (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, magma_int_t incy)
 
void magma_cher (magma_uplo_t uplo, magma_int_t n, float alpha, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magma_cher2 (magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dx, magma_int_t incx, magmaFloatComplex_const_ptr dy, magma_int_t incy, magmaFloatComplex_ptr dA, magma_int_t ldda)
 
void magma_ctrmv (magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t n, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dx, magma_int_t incx)
 
void magma_ctrsv (magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t n, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dx, magma_int_t incx)
 
void magma_cgemm (magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magma_csymm (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magma_csyrk (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magma_csyr2k (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magma_chemm (magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magma_cherk (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, float alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, float beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magma_cher2k (magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, float beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
 
void magma_ctrmm (magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb)
 
void magma_ctrsm (magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb)
 

Macro Definition Documentation

#define COMPLEX

Definition at line 16 of file magmablas_c.h.

#define magma_ccopymatrix (   m,
  n,
  dA_src,
  ldda,
  dB_dst,
  lddb 
)    magma_ccopymatrix_internal( m, n, dA_src, ldda, dB_dst, lddb, __func__, __FILE__, __LINE__ )

Definition at line 708 of file magmablas_c.h.

#define magma_ccopymatrix_async (   m,
  n,
  dA_src,
  ldda,
  dB_dst,
  lddb,
  queue 
)    magma_ccopymatrix_async_internal( m, n, dA_src, ldda, dB_dst, lddb, queue, __func__, __FILE__, __LINE__ )

Definition at line 717 of file magmablas_c.h.

#define magma_ccopyvector_async (   n,
  dx_src,
  incx,
  dy_dst,
  incy,
  queue 
)    magma_ccopyvector_async_internal( n, dx_src, incx, dy_dst, incy, queue, __func__, __FILE__, __LINE__ )

Definition at line 649 of file magmablas_c.h.

#define magma_ccopyvector_async (   n,
  dx_src,
  incx,
  dy_dst,
  incy,
  queue 
)    magma_ccopyvector_async_internal( n, dx_src, incx, dy_dst, incy, queue, __func__, __FILE__, __LINE__ )

Definition at line 649 of file magmablas_c.h.

#define magma_cgetmatrix (   m,
  n,
  dA_src,
  ldda,
  hB_dst,
  ldb 
)    magma_cgetmatrix_internal( m, n, dA_src, ldda, hB_dst, ldb, __func__, __FILE__, __LINE__ )

Definition at line 705 of file magmablas_c.h.

#define magma_cgetmatrix_async (   m,
  n,
  dA_src,
  ldda,
  hB_dst,
  ldb,
  queue 
)    magma_cgetmatrix_async_internal( m, n, dA_src, ldda, hB_dst, ldb, queue, __func__, __FILE__, __LINE__ )

Definition at line 714 of file magmablas_c.h.

#define magma_cgetvector (   n,
  dx_src,
  incx,
  hy_dst,
  incy 
)    magma_cgetvector_internal( n, dx_src, incx, hy_dst, incy, __func__, __FILE__, __LINE__ )

Definition at line 637 of file magmablas_c.h.

#define magma_cgetvector_async (   n,
  dx_src,
  incx,
  hy_dst,
  incy,
  queue 
)    magma_cgetvector_async_internal( n, dx_src, incx, hy_dst, incy, queue, __func__, __FILE__, __LINE__ )

Definition at line 643 of file magmablas_c.h.

#define magma_csetmatrix (   m,
  n,
  hA_src,
  lda,
  dB_dst,
  lddb 
)    magma_csetmatrix_internal( m, n, hA_src, lda, dB_dst, lddb, __func__, __FILE__, __LINE__ )

Definition at line 702 of file magmablas_c.h.

#define magma_csetmatrix_async (   m,
  n,
  hA_src,
  lda,
  dB_dst,
  lddb,
  queue 
)    magma_csetmatrix_async_internal( m, n, hA_src, lda, dB_dst, lddb, queue, __func__, __FILE__, __LINE__ )

Definition at line 711 of file magmablas_c.h.

#define magma_csetvector (   n,
  hx_src,
  incx,
  dy_dst,
  incy 
)    magma_csetvector_internal( n, hx_src, incx, dy_dst, incy, __func__, __FILE__, __LINE__ )

Definition at line 634 of file magmablas_c.h.

#define magma_csetvector_async (   n,
  hx_src,
  incx,
  dy_dst,
  incy,
  queue 
)    magma_csetvector_async_internal( n, hx_src, incx, dy_dst, incy, queue, __func__, __FILE__, __LINE__ )

Definition at line 640 of file magmablas_c.h.

Function Documentation

float cpu_gpu_cdiff ( magma_int_t  m,
magma_int_t  n,
const magmaFloatComplex *  hA,
magma_int_t  lda,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda 
)
void czero_32x32_block ( magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)
void czero_nbxnb_block ( magma_int_t  nb,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)

Here is the caller graph for this function:

void magma_caxpy ( magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_ptr  dy,
magma_int_t  incy 
)
void magma_ccopy ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_ptr  dy,
magma_int_t  incy 
)
void magma_ccopymatrix_async_internal ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA_src,
magma_int_t  ldda,
magmaFloatComplex_ptr  dB_dst,
magma_int_t  lddb,
magma_queue_t  queue,
const char *  func,
const char *  file,
int  line 
)
void magma_ccopymatrix_internal ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA_src,
magma_int_t  ldda,
magmaFloatComplex_ptr  dB_dst,
magma_int_t  lddb,
const char *  func,
const char *  file,
int  line 
)
magma_err_t magma_ccopyvector_async_internal ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dx_src,
magma_int_t  incx,
magmaFloatComplex_ptr  dy_dst,
magma_int_t  incy,
magma_queue_t  queue,
const char *  func,
const char *  file,
int  line 
)
magma_err_t magma_ccopyvector_internal ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dx_src,
magma_int_t  incx,
magmaFloatComplex_ptr  dy_dst,
magma_int_t  incy,
const char *  func,
const char *  file,
int  line 
)
magmaFloatComplex magma_cdotc ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_const_ptr  dy,
magma_int_t  incy 
)
magmaFloatComplex magma_cdotu ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_const_ptr  dy,
magma_int_t  incy 
)
magma_int_t magma_cdtohpo ( magma_int_t  num_gpus,
char *  uplo,
magma_int_t  m,
magma_int_t  n,
magma_int_t  off_i,
magma_int_t  off_j,
magma_int_t  nb,
magma_int_t  NB,
magmaFloatComplex *  a,
magma_int_t  lda,
magmaFloatComplex *  work[],
magma_int_t  ldda,
magma_queue_t  stream[][3],
magma_int_t *  info 
)

Definition at line 766 of file cpotrf3_mgpu.cpp.

References A, dA, lapackf77_lsame, magma_cgetmatrix_async, magma_queue_sync, magma_setdevice(), and min.

771 {
772  magma_int_t k;
773  if( lapackf77_lsame(uplo, "U") ) {
774  magma_int_t j, jj, jb, mj;
775 
776  /* go through each column */
777  for (j=off_j+NB; j<n; j+=nb) {
778  jj = (j-off_j)/(nb*num_gpus);
779  k = ((j-off_j)/nb)%num_gpus;
780 
781  jb = min(nb, (n-j));
782  if(j+jb < off_j+m)
783  mj = (j-off_i)+jb;
784  else
785  mj = m;
786 
787  magma_setdevice(k);
788  magma_cgetmatrix_async( mj, jb,
789  dA(k, 0, jj*nb), ldda,
790  A(off_i, j), lda,
791  stream[k][0] );
792  magma_queue_sync( stream[k][0] );
793  }
794  } else {
795  magma_int_t i, ii, ib, ni;
796 
797  /* go through each row */
798  for(i=off_i+NB; i<m; i+=nb){
799  ii = (i-off_i)/(nb*num_gpus);
800  k = ((i-off_i)/nb)%num_gpus;
801 
802  ib = min(nb, (m-i));
803  if(i+ib < off_i+n)
804  ni = (i-off_i)+ib;
805  else
806  ni = n;
807 
808  magma_setdevice(k);
809  magma_cgetmatrix_async( ib, ni,
810  dA(k, ii*nb, 0), ldda,
811  A(i, off_j), lda,
812  stream[k][0] );
813  magma_queue_sync( stream[k][0] );
814  }
815  }
816  /*for( k=0; k<num_gpus; k++ ) {
817  magma_setdevice(k);
818  magma_queue_sync( stream[k][0] );
819  }*/
820  magma_setdevice(0);
821 
822  return *info;
823 }
#define min(a, b)
Definition: common_magma.h:86
#define magma_cgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_c.h:714
int magma_int_t
Definition: magmablas.h:12
magma_int_t ldda
void magma_setdevice(magma_device_t dev)
#define dA(d, i, j)
#define lapackf77_lsame
Definition: magma_lapack.h:23
#define A(i, j)
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function:

Here is the caller graph for this function:

void magma_cgemm ( magma_trans_t  transA,
magma_trans_t  transB,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)

Here is the caller graph for this function:

void magma_cgemv ( magma_trans_t  transA,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dy,
magma_int_t  incy 
)

Here is the caller graph for this function:

void magma_cgerc ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_const_ptr  dy,
magma_int_t  incy,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)
void magma_cgeru ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_const_ptr  dy,
magma_int_t  incy,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)
void magma_cgetmatrix_1D_col_bcyclic ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magmaFloatComplex *  hA,
magma_int_t  lda,
magma_int_t  ngpu,
magma_int_t  nb 
)

Here is the caller graph for this function:

void magma_cgetmatrix_1D_row_bcyclic ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magmaFloatComplex *  hA,
magma_int_t  lda,
magma_int_t  ngpu,
magma_int_t  nb 
)

Here is the caller graph for this function:

void magma_cgetmatrix_async_internal ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA_src,
magma_int_t  ldda,
magmaFloatComplex *  hB_dst,
magma_int_t  ldhb,
magma_queue_t  queue,
const char *  func,
const char *  file,
int  line 
)
void magma_cgetmatrix_internal ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA_src,
magma_int_t  ldda,
magmaFloatComplex *  hB_dst,
magma_int_t  ldhb,
const char *  func,
const char *  file,
int  line 
)
void magma_cgetvector_async_internal ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx_src,
magma_int_t  incx,
magmaFloatComplex *  hy_dst,
magma_int_t  incy,
magma_queue_t  queue,
const char *  func,
const char *  file,
int  line 
)
void magma_cgetvector_internal ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx_src,
magma_int_t  incx,
magmaFloatComplex *  hy_dst,
magma_int_t  incy,
const char *  func,
const char *  file,
int  line 
)
void magma_chemm ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)

Here is the caller graph for this function:

void magma_chemv ( magma_uplo_t  uplo,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dy,
magma_int_t  incy 
)

Here is the caller graph for this function:

void magma_cher ( magma_uplo_t  uplo,
magma_int_t  n,
float  alpha,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)
void magma_cher2 ( magma_uplo_t  uplo,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_const_ptr  dy,
magma_int_t  incy,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)
void magma_cher2k ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
float  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)

Here is the caller graph for this function:

void magma_cher2k_mgpu ( magma_int_t  num_gpus,
char  uplo,
char  trans,
magma_int_t  nb,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex **  db,
magma_int_t  lddb,
magma_int_t  boffset,
float  beta,
magmaFloatComplex **  dc,
magma_int_t  lddc,
magma_int_t  offset,
magma_int_t  num_streams,
magma_queue_t  streams[][10] 
)

Here is the caller graph for this function:

void magma_cherk ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
float  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
float  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)

Here is the caller graph for this function:

magma_int_t magma_chtodhe ( magma_int_t  num_gpus,
char *  uplo,
magma_int_t  n,
magma_int_t  nb,
magmaFloatComplex *  a,
magma_int_t  lda,
magmaFloatComplex **  dwork,
magma_int_t  ldda,
magma_queue_t  stream[][10],
magma_int_t *  info 
)
magma_int_t magma_chtodpo ( magma_int_t  num_gpus,
char *  uplo,
magma_int_t  m,
magma_int_t  n,
magma_int_t  off_i,
magma_int_t  off_j,
magma_int_t  nb,
magmaFloatComplex *  h_A,
magma_int_t  lda,
magmaFloatComplex *  d_lA[],
magma_int_t  ldda,
magma_queue_t  stream[][3],
magma_int_t *  info 
)

Definition at line 707 of file cpotrf3_mgpu.cpp.

References A, dA, lapackf77_lsame, magma_csetmatrix_async, magma_queue_sync, magma_setdevice(), and min.

712 {
713  magma_int_t k;
714  if( lapackf77_lsame(uplo, "U") ) {
715  magma_int_t j, jj, jb, mj;
716 
717  /* go through each column */
718  for (j=off_j; j<n; j+=nb) {
719  jj = (j-off_j)/(nb*num_gpus);
720  k = ((j-off_j)/nb)%num_gpus;
721 
722  jb = min(nb, (n-j));
723  if(j+jb < off_j+m)
724  mj = (j-off_i)+jb;
725  else
726  mj = m;
727 
728  magma_setdevice(k);
729  magma_csetmatrix_async( mj, jb,
730  A(off_i, j), lda,
731  dA(k, 0, jj*nb), ldda,
732  stream[k][0] );
733  }
734  }
735  else {
736  magma_int_t i, ii, ib, ni;
737 
738  /* go through each row */
739  for(i=off_i; i<m; i+=nb){
740  ii = (i-off_i)/(nb*num_gpus);
741  k = ((i-off_i)/nb)%num_gpus;
742 
743  ib = min(nb, (m-i));
744  if(i+ib < off_i+n)
745  ni = (i-off_i)+ib;
746  else
747  ni = n;
748 
749  magma_setdevice(k);
750  magma_csetmatrix_async( ib, ni,
751  A(i, off_j), lda,
752  dA(k, ii*nb, 0), ldda,
753  stream[k][0] );
754  }
755  }
756  for( k=0; k<num_gpus; k++ ) {
757  magma_setdevice(k);
758  magma_queue_sync( stream[k][0] );
759  }
760  magma_setdevice(0);
761 
762  return *info;
763 }
#define min(a, b)
Definition: common_magma.h:86
int magma_int_t
Definition: magmablas.h:12
magma_int_t ldda
void magma_setdevice(magma_device_t dev)
#define dA(d, i, j)
#define lapackf77_lsame
Definition: magma_lapack.h:23
#define A(i, j)
#define magma_csetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_c.h:711
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function:

Here is the caller graph for this function:

void magma_clarfbx_gpu ( magma_int_t  m,
magma_int_t  k,
magmaFloatComplex *  V,
magma_int_t  ldv,
magmaFloatComplex *  dT,
magma_int_t  ldt,
magmaFloatComplex *  c,
magmaFloatComplex *  dwork 
)

Here is the caller graph for this function:

void magma_clarfgtx_gpu ( magma_int_t  n,
magmaFloatComplex *  dx0,
magmaFloatComplex *  dx,
magmaFloatComplex *  dtau,
float *  dxnorm,
magmaFloatComplex *  dA,
magma_int_t  it,
magmaFloatComplex *  V,
magma_int_t  ldv,
magmaFloatComplex *  T,
magma_int_t  ldt,
magmaFloatComplex *  dwork 
)

Here is the caller graph for this function:

void magma_clarfgx_gpu ( magma_int_t  n,
magmaFloatComplex *  dx0,
magmaFloatComplex *  dx,
magmaFloatComplex *  dtau,
float *  dxnorm,
magmaFloatComplex *  ddx0,
magma_int_t  iter 
)

Here is the caller graph for this function:

void magma_clarfx_gpu ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex *  v,
magmaFloatComplex *  tau,
magmaFloatComplex *  c,
magma_int_t  ldc,
float *  xnorm,
magmaFloatComplex *  dT,
magma_int_t  iter,
magmaFloatComplex *  work 
)

Here is the caller graph for this function:

void magma_crot ( magma_int_t  n,
magmaFloatComplex_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_ptr  dy,
magma_int_t  incy,
float  dc,
magmaFloatComplex  ds 
)
void magma_cscal ( magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dx,
magma_int_t  incx 
)
void magma_csetmatrix_1D_col_bcyclic ( magma_int_t  m,
magma_int_t  n,
const magmaFloatComplex *  hA,
magma_int_t  lda,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  ngpu,
magma_int_t  nb 
)

Here is the caller graph for this function:

void magma_csetmatrix_1D_row_bcyclic ( magma_int_t  m,
magma_int_t  n,
const magmaFloatComplex *  hA,
magma_int_t  lda,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  ngpu,
magma_int_t  nb 
)

Here is the caller graph for this function:

void magma_csetmatrix_async_internal ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex const *  hA_src,
magma_int_t  ldha,
magmaFloatComplex_ptr  dB_dst,
magma_int_t  lddb,
magma_queue_t  queue,
const char *  func,
const char *  file,
int  line 
)
void magma_csetmatrix_internal ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex const *  hA_src,
magma_int_t  ldha,
magmaFloatComplex_ptr  dB_dst,
magma_int_t  lddb,
const char *  func,
const char *  file,
int  line 
)
void magma_csetvector_async_internal ( magma_int_t  n,
magmaFloatComplex const *  hx_src,
magma_int_t  incx,
magmaFloatComplex_ptr  dy_dst,
magma_int_t  incy,
magma_queue_t  queue,
const char *  func,
const char *  file,
int  line 
)
void magma_csetvector_internal ( magma_int_t  n,
magmaFloatComplex const *  hx_src,
magma_int_t  incx,
magmaFloatComplex_ptr  dy_dst,
magma_int_t  incy,
const char *  func,
const char *  file,
int  line 
)
void magma_csrot ( magma_int_t  n,
magmaFloatComplex_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_ptr  dy,
magma_int_t  incy,
float  dc,
float  ds 
)
void magma_csscal ( magma_int_t  n,
float  alpha,
magmaFloatComplex_ptr  dx,
magma_int_t  incx 
)
void magma_cswap ( magma_int_t  n,
magmaFloatComplex_ptr  dx,
magma_int_t  incx,
magmaFloatComplex_ptr  dy,
magma_int_t  incy 
)

Here is the caller graph for this function:

void magma_csymm ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magma_csyr2k ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magma_csyrk ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magma_ctrmm ( magma_side_t  side,
magma_uplo_t  uplo,
magma_trans_t  trans,
magma_diag_t  diag,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_ptr  dB,
magma_int_t  lddb 
)

Here is the caller graph for this function:

void magma_ctrmv ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_diag_t  diag,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_ptr  dx,
magma_int_t  incx 
)
void magma_ctrsm ( magma_side_t  side,
magma_uplo_t  uplo,
magma_trans_t  trans,
magma_diag_t  diag,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_ptr  dB,
magma_int_t  lddb 
)

Here is the caller graph for this function:

void magma_ctrsv ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_diag_t  diag,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_ptr  dx,
magma_int_t  incx 
)

Here is the caller graph for this function:

magma_int_t magma_icamax ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx 
)

Here is the caller graph for this function:

magma_int_t magma_icamin ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx 
)
float magma_scasum ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx 
)
float magma_scnrm2 ( magma_int_t  n,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx 
)

Here is the caller graph for this function:

void magmablas_cgeadd ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_ptr  dB,
magma_int_t  lddb 
)

Here is the caller graph for this function:

void magmablas_cgeadd_batched ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr const *  dAarray,
magma_int_t  ldda,
magmaFloatComplex_ptr *  dBarray,
magma_int_t  lddb,
magma_int_t  batchCount 
)

Here is the caller graph for this function:

void magmablas_cgemm ( magma_trans_t  transA,
magma_trans_t  transB,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)

Here is the caller graph for this function:

void magmablas_cgemm_fermi64 ( magma_trans_t  transA,
magma_trans_t  transB,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magmablas_cgemm_fermi80 ( magma_trans_t  transA,
magma_trans_t  transB,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magmablas_cgemm_reduce ( magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
const magmaFloatComplex *  dA,
magma_int_t  lda,
const magmaFloatComplex *  dB,
magma_int_t  ldb,
magmaFloatComplex  beta,
magmaFloatComplex *  dC,
magma_int_t  ldc 
)

Here is the caller graph for this function:

void magmablas_cgemv ( magma_trans_t  trans,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dy,
magma_int_t  incy 
)

Here is the caller graph for this function:

void magmablas_cgetmatrix_transpose ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dAT,
magma_int_t  ldda,
magmaFloatComplex *  hA,
magma_int_t  lda,
magmaFloatComplex_ptr  dwork,
magma_int_t  lddwork,
magma_int_t  nb 
)
void magmablas_cgetmatrix_transpose_mgpu ( magma_int_t  ngpu,
magma_queue_t  stream[][2],
magmaFloatComplex_ptr  dAT[],
magma_int_t  ldda,
magmaFloatComplex *  hA,
magma_int_t  lda,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magma_int_t  m,
magma_int_t  n,
magma_int_t  nb 
)

Here is the caller graph for this function:

void magmablas_chemm ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magmablas_chemm_1gpu ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  offset,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magmaFloatComplex *  C,
magma_int_t  ldc,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream 
)
void magmablas_chemm_1gpu_old ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  offset,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magmaFloatComplex *  C,
magma_int_t  ldc,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream 
)
void magmablas_chemm_mgpu ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  offset,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magmaFloatComplex_ptr  dwork[],
magma_int_t  lddwork,
magmaFloatComplex *  C,
magma_int_t  ldc,
magmaFloatComplex *  work[],
magma_int_t  ldwork,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream,
magma_event_t  redevents[][20],
magma_int_t  nbevents 
)
void magmablas_chemm_mgpu_com ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  offset,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magmaFloatComplex_ptr  dwork[],
magma_int_t  lddwork,
magmaFloatComplex *  C,
magma_int_t  ldc,
magmaFloatComplex *  work[],
magma_int_t  ldwork,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream,
magma_event_t  redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10],
magma_int_t  nbevents,
magma_int_t  gnode[MagmaMaxGPUs][MagmaMaxGPUs+2],
magma_int_t  nbcmplx 
)

Definition at line 20 of file chemm_mgpu.cpp.

References dA, dB, dC, dwork, MAGMA_C_ONE, magma_ceildiv(), magma_cgemm(), magma_event_record(), magma_getdevice(), magma_queue_wait_event(), magma_setdevice(), magmablas_cgeadd(), magmablas_csymmetrize(), magmablas_csymmetrize_tiles(), magmablasGetKernelStream(), magmablasSetKernelStream(), MagmaConjTrans, MagmaLower, MagmaMaxGPUs, MagmaNoTrans, and min.

33 {
34  #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)
35  #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb)
36  #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc)
37  #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
38  #define C(i, j) (C + (i) + (j)*ldc)
39  //printf("####################################################\n");
40  //printf(" start chemm \n");
41  //printf("####################################################\n");
42 
43  assert( ldda >= m );
44  assert( lddb >= m );
45  assert( lddc >= m );
46  assert( nstream >= ngpu );
47  assert( nbevents >= ngpu*ngpu );
48 
49 
50  magmaFloatComplex c_one = MAGMA_C_ONE;
51 
52  magmaFloatComplex *dwork1[MagmaMaxGPUs];
53  magmaFloatComplex *dwork2[MagmaMaxGPUs];
54 
55 
56  magma_int_t maxgsize = n*m;
57  magma_int_t lddwork = lddc;
58  magma_int_t ldwork = m;
59  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
60  dwork1[dev] = dwork[dev]; // size of dwork1 is n*lddwork
61  dwork2[dev] = dwork[dev]+n*lddwork; // size of dwork2 is maxgsize*ngpu
62  }
63  assert( dworksiz >= (n*lddwork+maxgsize*ngpu) );
64  assert( worksiz >= (n*ldwork) );
65 
66 
67  magma_device_t cdev;
68  magma_getdevice( &cdev );
69  magma_queue_t cstream;
70  magmablasGetKernelStream(&cstream);
71 
72 
73  magma_int_t dev, devperm, myblk, mycolsize, myblkoffst;
74  magma_int_t gmaster;
75  magma_int_t masterdev, lcdev, lccolsize, myngpu;
76 
77  magma_int_t stdev = (offset/nb)%ngpu;
78  magma_int_t blockoffset = offset % nb;
79  magma_int_t fstblksiz = 0;
80  if(blockoffset>0){
81  fstblksiz = min(m, (nb - blockoffset));
82  }
83  //magma_int_t nbblk = magma_ceildiv(m, nb);
84  magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb);
85  magma_int_t remm = m- fstblksiz;
86  magma_int_t nbblkoffst = offset/nb;
87 
88 
89  magma_int_t nblstblks = -1;
90  magma_int_t devlstblk = -1;
91  magma_int_t lstblksiz = remm%nb;
92  if(lstblksiz>0){
93  nblstblks = nbblk%ngpu;
94  devlstblk = (nblstblks-1+ngpu)%ngpu;
95  }
96 
97  magma_int_t nbcmplxactive = 0;
98  magma_int_t cmplxisactive[MagmaMaxGPUs];
99  magma_int_t gpuisactive[MagmaMaxGPUs];
100  memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
101  memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
102 
103 
104  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
105  magma_setdevice( dev );
106  magmablasSetKernelStream( streams[ dev ][ 0 ] );
107  cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(magmaFloatComplex) );
108  // put all dC on all dev to 0 except the one which
109  // hold i==0 because this one has to multiply by beta.
110  if(dev!=stdev){
111  cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(magmaFloatComplex) );
112  }
113  }
114 
115  magma_int_t newoffset = offset;
116  // 1. symmetrize
117  if(blockoffset>0){
118  newoffset = offset+fstblksiz; // newoffset is adjusted over nb
119  magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0);
120  //printf("STDEV %d voici offset %d remm %d myblockoffset %d siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz);
121  magma_setdevice( stdev );
122  magmablasSetKernelStream( streams[ stdev ][ 0 ] );
123  magmablas_csymmetrize_tiles( MagmaLower, fstblksiz, dA(stdev, offset, myblkoffst*nb+blockoffset), ldda, 1, ngpu*nb, nb );
124  }
125 
126  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
127  magma_int_t newstdev = (newoffset/nb)%ngpu;
128  magma_int_t nbblk = remm/nb; // number of block of size nb. if m%nb>0 then a last block exist and is of size ib=m%nb
129  magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ? 1:0 );
130  magma_int_t devperm = (dev-newstdev+ngpu)%ngpu;
131  magma_int_t nbblkoffst = newoffset/nb;
132  magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
133  //printf("dev %d devperm %d newoffset %d rowoff %d coloff %d myblk %d \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk);
134  magma_setdevice( dev );
135  magmablasSetKernelStream( streams[ dev ][ 0 ] );
136  magmablas_csymmetrize_tiles( MagmaLower, nb, dA(dev, newoffset+devperm*nb, myblkoffst*nb), ldda, myblk, ngpu*nb, nb );
137  if(remm%nb>0){
138  magma_int_t nblstblks = (nbblk+1)%ngpu;
139  magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu;
140  //printf("==> siz %d devperm %d, devlstblk %d, newoffset+nbblk*nb %d, myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb);
141  if(devperm==devlstblk)
142  magmablas_csymmetrize( MagmaLower, remm % nb, dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb), ldda ); // last partial tile
143  }
144  }
145 
146 
147 
148 
149 /*
150  magma_int_t siz = m+offset;
151  magmaFloatComplex *R=(magmaFloatComplex *) malloc(siz*siz*sizeof(magmaFloatComplex));
152  // collecte back A
153  magmablas_cgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb );
154  magma_setdevice( 0 );
155  magmablasSetKernelStream( streams[ dev ][ 0 ] );
156  //magma_cgetmatrix( siz, siz, dA[0], ldda, R, siz );
157  FILE *trace_file;
158  trace_file = fopen("AJETE/Aafter", "w");
159  for (int j = 0; j < siz ; j++)
160  for (int i = 0; i < siz ; i++)
161  fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]);
162  fclose(trace_file);
163 return;
164 */
165 
166 
167  // ROW GEMM transpose a row and make a gemm with a block
168  // if only 1 GPU used the ROW GEMM is integrated with the
169  // COL GEMM (better accuracy observed) and better perf
170  if(ngpu>1){
171  for( magma_int_t i = fstblksiz; i < m; i += nb ) {
172  magma_int_t ib = min( nb, m-i ); // block size
173  magma_int_t ioff = i + offset; // start global index in parent matrix
174  //magma_int_t dev = (ioff / nb) % ngpu;
175  magma_int_t nbblkoffst = offset/nb;
176  magma_int_t nbblk = magma_ceildiv(i, nb);
177  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
178 
179 
180  magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ? 1:0 );
181  magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
182 
183  magma_int_t myrowsize = myblk * nb;
184  magma_int_t coloffset = myblkoffst*nb;
185  if(dev==stdev) {
186  myrowsize = myrowsize -blockoffset;
187  coloffset = myblkoffst*nb+blockoffset;
188  }
189  //printf("ROW GEMM: voici i %d ib %d ioff %d nbblkoffst %d stdev %d dev %d myblk %d myblkoffset %d coloffset %d rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize);
190  if(myrowsize>0){
191  magma_setdevice( dev );
192  magmablasSetKernelStream( streams[ dev ][ 1 ] );
193  magma_cgemm( MagmaConjTrans, MagmaNoTrans, myrowsize, n, ib,
194  alpha, dA(dev,ioff,coloffset), ldda,
195  dB(dev,i,0), lddb,
196  c_one, dwork(dev,0,0), lddwork );
197  }
198  }
199  }
200  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
201  magma_setdevice( dev );
202  magma_event_record(redevents[dev][1], streams[dev][1]);
203  }
204  }
205 
206 
207  // COL GEMM
208  // blockoffset is offset within first block; for subsequent blocks it is 0
209  if(blockoffset>0){
210  magma_int_t ib = min( nb-blockoffset, m ); // block size
211  magma_int_t iblock = (offset / nb) / ngpu; // local block id
212  magma_int_t di = iblock*nb+blockoffset; // local index in parent matrix
213  magma_setdevice( stdev );
214  magmablasSetKernelStream( streams[ stdev ][ 0 ] );
215  //printf("DEV %d COL GEMM first ioff %d di %d m %d n %d ib %d \n", stdev, offset, di, m, n, ib);
216  magma_cgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib,
217  alpha, dA(stdev,offset,di), ldda,
218  dB(stdev,0,0), lddb,
219  beta, dC(stdev,0,0), lddc );
220  }
221 
222 
223 
224  // COL GEMM
225  for( magma_int_t i = fstblksiz; i < m; i += nb ) {
226  magma_int_t ib = min( nb, m-i ); // block size
227  magma_int_t ioff = i + offset; // start global index in parent matrix
228  magma_int_t iblock = (ioff / nb) / ngpu; // local block id
229  magma_int_t dev = (ioff / nb) % ngpu;
230  magma_int_t di = iblock*nb; // local index in parent matrix
231 
232  //printf("DEV %d COL GEMM i %d ioff %d di %d m-i %d n %d ib %d \n", dev, i, ioff, di, m-i, n, ib);
233 
234  magma_setdevice( dev );
235  magmablasSetKernelStream( streams[ dev ][ 0 ] );
236  if(i==0){
237  magma_cgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
238  alpha, dA(dev,ioff,di), ldda,
239  dB(dev,i,0), lddb,
240  beta, dC(dev,i,0), lddc );
241  }else{
242  magma_cgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
243  alpha, dA(dev,ioff,di), ldda,
244  dB(dev,i,0), lddb,
245  c_one, dC(dev,i,0), lddc );
246  }
247  magma_event_record(redevents[dev][0], streams[dev][0]);
248  // if only 1 GPU is used, do the ROW GEMM
249  if(ngpu==1){
250  // NOTE: the COL GEMM writes dC below the diagonal (i)
251  // and the ROW GEMM writes dC from 0 to diag-1, so in principle they could
252  // run in parallel on different streams.
253  //
254  // However, this is NOT safe, because
255  // the COL GEMM might finish i and start i+1 while the ROW GEMM is still at i
256  // magmablasSetKernelStream( streams[ dev ][ 0 ] );
257  magma_cgemm( MagmaConjTrans, MagmaNoTrans, i, n, ib,
258  alpha, dA(dev,ioff,offset), ldda,
259  dB(dev,i,0), lddb,
260  c_one, dC(dev,0,0), lddc );
261  }
262  }
263 
264 
265 
266  if(ngpu>1){
267  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
268  magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb);
269  magma_int_t nbblkrow = nbblk-1;
270  magma_int_t devperm = (dev-stdev+ngpu)%ngpu;
271  magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ? 1:0 );
272  magma_int_t myrowsize = myblk * nb;
273  if(dev==stdev) {
274  myrowsize = myrowsize - blockoffset;
275  }
276 
277  //printf("blockoffset %d nbblkrow %d devperm %d DEV %d RECEIVING myblk %d myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize);
278  if(myrowsize>0){
279  magma_setdevice( dev );
280  magmablasSetKernelStream( streams[ dev ][ 0 ] );
281  magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][1]);
282  //magma_queue_sync( streams[ dev ][ 1 ] );
283  // for each dev, add each computed ROW block at its placement within dC
284  for( magma_int_t blki = 0; blki < myblk; ++blki){
285  magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset;
286  magma_int_t lcblki = blki*nb;
287  magma_int_t ib = nb;// min(nb, m-gbblki);
288  if(dev==stdev){
289  lcblki = blki*nb-blockoffset;
290  if(blki==0){
291  gbblki = 0;
292  lcblki = 0;
293  ib = nb-blockoffset;
294  }
295  }
296  magmablas_cgeadd(ib, n, c_one,
297  &dwork[dev][lcblki], lddwork,
298  &dC[dev][gbblki] , lddc );
299  }
300  magma_event_record(redevents[dev][0], streams[dev][0]);
301  }
302  }
303  }
304 
305 
306 
307 
308  // ===========================================================
309  // COMMUNICATION ALL_REDUCE_SUM
310  // ===========================================================
311  if(ngpu==1){
312  return;
313  }
314  // INITIALIZE COMM
315  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
316  masterdev = -1;
317  gnode[cmplxid][MagmaMaxGPUs+1] = -1;
318  myngpu = gnode[cmplxid][MagmaMaxGPUs];
319  for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
320  dev = gnode[cmplxid][idev];
321  devperm = (dev-stdev+ngpu)%ngpu;
322  myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 );
323  mycolsize = myblk*nb;
324  myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0));
325  if(dev==stdev){
326  mycolsize -= blockoffset;
327  myblkoffst += blockoffset; // local index in parent matrix
328  }
329  if((devperm==devlstblk)&&(lstblksiz>0)){
330  mycolsize -= (nb-(remm%nb));
331  }
332  mycolsize = min(mycolsize, m);
333  if(mycolsize>0){
334  gpuisactive[dev] = mycolsize;
335  if(masterdev==-1) {
336  masterdev = dev;
337  nbcmplxactive = nbcmplxactive +1;
338  cmplxisactive[cmplxid] = 1;
339  gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
340  }
341  }
342  }
343  }
344 /*
345  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
346  magma_setdevice( dev );
347  cudaDeviceSynchronize();
348  }
349 */
350  //*******************************
351  // Each GPU sends its result
352  // to its master. The master does
353  // the addition and then sends it
354  // to the masters of the other complexes,
355  // receives from the masters of the
356  // other complexes, does the addition,
357  // and broadcasts the final result
358  // locally.
359  //*******************************
360  //printf("=======================================================================\n");
361  //printf(" sending to my master \n");
362  //printf("=======================================================================\n");
363  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
364  myngpu = gnode[cmplxid][MagmaMaxGPUs];
365  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
366  //check if complex is active
367  if(masterdev!=-1){
368  for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
369  dev = gnode[cmplxid][idev];
370  mycolsize = gpuisactive[dev];
371  if(mycolsize>0){
372  // I am an active GPU. if I am not the master, then send my result to my master.
373  // store result on dwork[masterdev][dev*maxgsize]
374  if(dev!=masterdev){
375  magma_setdevice( dev );
376  //printf(" GPU %d sending to my master %d\n", dev, masterdev);
377  // wait until the geadd of my ROW and COL GEMM is done
378  magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][0]);
379  // sending to the master of my complex
380  cudaMemcpy2DAsync(&dwork2[masterdev][maxgsize*dev], m*sizeof(magmaFloatComplex),
381  &dC[dev][0], lddc*sizeof(magmaFloatComplex),
382  m*sizeof(magmaFloatComplex), n,
383  cudaMemcpyDeviceToDevice, streams[dev][0]);
384  magma_event_record(redevents[dev][masterdev], streams[dev][0]);
385  } // end I am not the masterdev
386  }// end if mycolsize>0
387  }// for idev
388  }// end of if masterdev!=-1 maening complex is active
389  }// for cmplxid
390 /*
391  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
392  magma_setdevice( dev );
393  cudaDeviceSynchronize();
394  }
395 */
396 
397  //printf("=======================================================================\n");
398  //printf(" each master do addition of local result and broadcast to other masters \n");
399  //printf("=======================================================================\n");
400  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
401  myngpu = gnode[cmplxid][MagmaMaxGPUs];
402  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
403  //check if complex is active
404  if(masterdev!=-1){
405  magma_setdevice( masterdev );
406  // addition is done on stream 0 sequentially
407  magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
408  // wait the geadd of my ROW and COL GEMM is done
409  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][0]);
410  // ========================================
411  // local addition
412  // ========================================
413  for( magma_int_t l = 0; l < myngpu; ++l ) {
414  lcdev = gnode[cmplxid][l];
415  lccolsize = gpuisactive[lcdev];
416  if((lcdev!=masterdev)&&(lccolsize>0)){
417  //printf(" master %d receiving from %d and adding \n", masterdev, lcdev);
418  // this is an active GPU of my complex.
419  // wait until I have received what it sent to me, then do the addition.
420  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[lcdev][masterdev]);
421  magmablas_cgeadd(m, n, c_one,
422  &dwork2[masterdev][maxgsize*lcdev], m,
423  &dC[masterdev][0] , lddc );
424  }
425  }// for l=1:myngpu
426  // because addition is done sequentially on stream 0,
427  // I have to record this to be able to synch using it
428  magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
429  // ========================================
430  //
431  // ========================================
432  // send to other masters
433  // ========================================
434  for( magma_int_t k = 0; k < nbcmplx; ++k ) {
435  if(k!=cmplxid){
436  gmaster = gnode[k][MagmaMaxGPUs+1];
437  if(gmaster!=-1){ //complex is active
438  // The master has to wait until the local addition is finished, then send using the gmaster stream.
439  // Use stream 0 to make it sequential, or stream gmaster to make it parallel.
440  // For now both are the same.
441  //printf(" master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k);
442  magma_queue_wait_event(streams[ masterdev ][ gmaster ], redevents[masterdev][masterdev]);
443  cudaMemcpy2DAsync(&dwork2[gmaster][maxgsize*masterdev], m*sizeof(magmaFloatComplex),
444  &dC[masterdev][0], lddc*sizeof(magmaFloatComplex),
445  m*sizeof(magmaFloatComplex), n,
446  cudaMemcpyDeviceToDevice, streams[masterdev][gmaster]);
447  magma_event_record(redevents[masterdev][gmaster], streams[masterdev][gmaster]);
448  magma_event_record(redevents[masterdev][masterdev], streams[masterdev][gmaster]);
449  } // end of gmaster!=-1
450  } // end of k!=cmplxid
451  }// for k = 0: nbcmplx
452  // ========================================
453  }// end of if masterdev!=-1 maening complex is active
454  }// for cmplxid
455 /*
456  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
457  magma_setdevice( dev );
458  cudaDeviceSynchronize();
459  }
460 */
461  //printf("=======================================================================\n");
462  //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n");
463  //printf("=======================================================================\n");
464  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
465  myngpu = gnode[cmplxid][MagmaMaxGPUs];
466  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
467  //check if complex is active
468  if(masterdev!=-1){
469  magma_setdevice( masterdev );
470  // addition is done on stream 0 sequentially
471  magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
472  // master has to wait until finishing all the send to other masters.
473  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
474  // ========================================
475  // addition of results from other masters
476  // ========================================
477  for( magma_int_t k = 0; k < nbcmplx; ++k ) {
478  if(k!=cmplxid){
479  gmaster = gnode[k][MagmaMaxGPUs+1];
480  if(gmaster!=-1){ //complex is active
481  //Master has to wait until receiving from gmaster, then do addition using stream 0
482  //printf(" master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k);
483  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[gmaster][masterdev]);
484  magmablas_cgeadd(m, n, c_one,
485  &dwork2[masterdev][maxgsize*gmaster], m,
486  &dC[masterdev][0] , lddc );
487  } // end of gmaster!=-1
488  } // end of k!=cmplxid
489  }// for k = 0: nbcmplx
490  // because addition is done sequentially on stream 0,
491  // I have to record this to be able to synch using it
492  magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
493  // ========================================
494  // ========================================
495  // local broadcast of final results
496  // ========================================
497  for( magma_int_t l = 0; l < myngpu; ++l ) {
498  lcdev = gnode[cmplxid][l];
499  lccolsize = gpuisactive[lcdev];
500  if((lcdev!=masterdev)&&(lccolsize>0)){
501  // this is an active GPU of my complex.
502  // Wait until the previous addition is done (meaning stream 0 has finished), then broadcast sequentially for now.
503  // To make it parallel, use stream lcdev instead of stream 0.
504  //printf(" master %d broadcasting local to %d \n", masterdev, lcdev);
505  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
506  cudaMemcpy2DAsync(&dC[lcdev][0], lddc*sizeof(magmaFloatComplex),
507  &dC[masterdev][0], lddc*sizeof(magmaFloatComplex),
508  m*sizeof(magmaFloatComplex), n,
509  cudaMemcpyDeviceToDevice, streams[masterdev][0]);
510  magma_event_record(redevents[masterdev][lcdev], streams[masterdev][0]);
511  }
512  }// for l=1:myngpu
513  // ========================================
514  }// end of if masterdev!=-1 maening complex is active
515  }// for cmplxid
516 /*
517  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
518  magma_setdevice( dev );
519  cudaDeviceSynchronize();
520  }
521 */
522 
523 
524  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
525  myngpu = gnode[cmplxid][MagmaMaxGPUs];
526  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
527  //check if complex is active
528  if(masterdev!=-1){
529  for( magma_int_t l = 0; l < myngpu; ++l ) {
530  lcdev = gnode[cmplxid][l];
531  lccolsize = gpuisactive[lcdev];
532  if(lccolsize>0){
533  magma_setdevice( lcdev );
534  magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[lcdev][0]);
535  magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[masterdev][lcdev]);
536  }
537  }// for l=1:myngpu
538  }// end of if masterdev!=-1 maening complex is active
539  }// for cmplxid
540 
541 
542 
543  //printf("****************************************************\n");
544  //printf(" finish chemm \n");
545  //printf("****************************************************\n");
546 
547  magma_setdevice( cdev );
548  magmablasSetKernelStream( cstream );
549 
550 }
#define min(a, b)
Definition: common_magma.h:86
void magmablas_csymmetrize(magma_uplo_t uplo, magma_int_t m, magmaFloatComplex_ptr dA, magma_int_t ldda)
magma_queue_t streams[MagmaMaxGPUs]
int magma_int_t
Definition: magmablas.h:12
void magmablas_cgeadd(magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb)
void magma_cgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
#define dB(dev, i, j)
magma_int_t ldda
#define dwork(dev, i, j)
void magma_queue_wait_event(magma_queue_t queue, magma_event_t event)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
#define MagmaLower
Definition: magma.h:62
void magma_getdevice(magma_device_t *dev)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
#define MagmaMaxGPUs
Definition: magma_types.h:255
void magma_event_record(magma_event_t event, magma_queue_t queue)
#define MagmaConjTrans
Definition: magma.h:59
#define MAGMA_C_ONE
Definition: magma.h:154
#define dC(dev, i, j)
#define MagmaNoTrans
Definition: magma.h:57
cublasStatus_t magmablasGetKernelStream(magma_queue_t *stream)
#define dA(dev, i, j)
void magmablas_csymmetrize_tiles(magma_uplo_t uplo, magma_int_t m, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t ntile, magma_int_t mstride, magma_int_t nstride)

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_chemm_mgpu_spec ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  offset,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magmaFloatComplex_ptr  dwork[],
magma_int_t  lddwork,
magmaFloatComplex *  C,
magma_int_t  ldc,
magmaFloatComplex *  work[],
magma_int_t  ldwork,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream,
magma_event_t  redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10],
magma_int_t  nbevents,
magma_int_t  gnode[MagmaMaxGPUs][MagmaMaxGPUs+2],
magma_int_t  nbcmplx 
)

Definition at line 17 of file chemm_mgpu_spec.cpp.

References dA, dB, magma_ceildiv(), magma_cgemm(), magma_event_record(), magma_getdevice(), magma_queue_wait_event(), magma_setdevice(), magmablas_clacpy(), magmablasGetKernelStream(), magmablasSetKernelStream(), MagmaConjTrans, MagmaMaxGPUs, MagmaNoTrans, and min.

31 {
32  #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)
33  #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb)
34  #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc)
35  #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
36  #define C(i, j) (C + (i) + (j)*ldc)
37 
38  assert( ldda >= m );
39  assert( lddb >= m );
40  assert( lddc >= m );
41  assert( nstream >= ngpu );
42  assert( nbevents >= ngpu*ngpu );
43 
44  magmaFloatComplex *dwork1[MagmaMaxGPUs];
45  magmaFloatComplex *dwork2[MagmaMaxGPUs];
46 
47 
48  magma_int_t lddwork = lddc;
49  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
50  dwork1[dev] = dwork[dev];
51  dwork2[dev] = dwork[dev]+n*lddwork;
52  }
53  assert( dworksiz >= (2*n*lddwork) );
54 
55 
56 
57 
58 
59  magma_device_t cdev;
60  magma_getdevice( &cdev );
61  magma_queue_t cstream;
62  magmablasGetKernelStream(&cstream);
63 
64 
65  magma_int_t dev,devperm,myblk,mycolsize,myblkoffst;
66  magma_int_t gdev,gcolsize,gmaster,gngpu;
67  magma_int_t masterdev,lcdev,lccolsize,myngpu;
68 
69  magma_int_t stdev = (offset/nb)%ngpu;
70  magma_int_t blockoffset = offset % nb;
71  magma_int_t fstblksiz = 0;
72  if(blockoffset>0){
73  fstblksiz = min(m, (nb - blockoffset));
74  }
75  //magma_int_t nbblk = magma_ceildiv(m,nb);
76  magma_int_t nbblk = magma_ceildiv((m+blockoffset),nb);
77  magma_int_t maxgsize = n*nb*magma_ceildiv(nbblk,ngpu);
78  magma_int_t remm = m- fstblksiz;
79  magma_int_t nbblkoffst = offset/nb;
80 
81 
82  magma_int_t nblstblks = -1;
83  magma_int_t devlstblk = -1;
84  magma_int_t lstblksiz = remm%nb;
85  if(lstblksiz>0){
86  nblstblks = nbblk%ngpu;
87  devlstblk = (nblstblks-1+ngpu)%ngpu;
88  }
89 
90  magma_int_t nbcmplxactive = 0;
91  magma_int_t cmplxisactive[MagmaMaxGPUs];
92  magma_int_t gpuisactive[MagmaMaxGPUs];
93  memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
94  memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
95 
96 
97  //*******************************
98  // each GPU performs a GEMM with the
99  // transpose of its blocks to compute
100  // a final portion of X=A*VT
101  //*******************************
102  /* dB = V*T already ==> dB' = T'*V'
103  * compute T'*V'*X is equal to compute locally (VT)'_i*X_i
104  * then each GPU broadcast its X_i to assemble the full X which is used
105  * to compute W = X - 0.5 * V * T'*V'*X = X - 0.5 * V *dwork3
106  */
107  if(ngpu ==1){
108  magma_setdevice( 0 );
109  magmablasSetKernelStream( streams[ 0 ][ 0 ] );
110  // compute X[me] = A*VT = A[me]^tr *VT;
112  alpha, dA(0,offset,offset), ldda,
113  dB[0], lddb,
114  beta, dC[0], lddc );
115  return;
116  }
117  //ngpu>1
118  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
119  masterdev = -1;
120  gnode[cmplxid][MagmaMaxGPUs+1] = -1;
121  myngpu = gnode[cmplxid][MagmaMaxGPUs];
122  for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
123  dev = gnode[cmplxid][idev];
124  devperm = (dev-stdev+ngpu)%ngpu;
125  myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 );
126  mycolsize = myblk*nb;
127  myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0));
128  if(dev==stdev){
129  mycolsize -= blockoffset;
130  myblkoffst += blockoffset; // local index in parent matrix
131  }
132  if((devperm==devlstblk)&&(lstblksiz>0)){
133  mycolsize -= (nb-(remm%nb));
134  }
135  mycolsize = min(mycolsize,m);
136 
137 
138  if(mycolsize>0){
139  if(masterdev==-1) masterdev = dev;
140  //printf("dev %d devperm %d on cmplx %d master %d nbblk %d myblk %d m %d n %d mycolsize %d stdev %d fstblksize %d lastdev %d lastsize %d dA(%d,%d,%d) ==> dwork(%d,%d)\n",dev,devperm,cmplxid,masterdev,nbblk,myblk,m,n,mycolsize,stdev,fstblksiz,devlstblk,remm%nb,dev,offset,myblkoffst,dev,maxgsize*dev);
141  gpuisactive[dev] = mycolsize;
142  magma_setdevice( dev );
143  magmablasSetKernelStream( streams[ dev ][ dev ] );
144 
145  magma_cgemm( MagmaConjTrans, MagmaNoTrans, mycolsize, n, m,
146  alpha, dA(dev,offset,myblkoffst), ldda,
147  dB(dev,0,0), lddb,
148  beta, &dwork[dev][maxgsize*dev], mycolsize );
149  magma_event_record(redevents[dev][dev*ngpu+dev], streams[dev][dev]);
150  }
151  if(dev == masterdev){
152  nbcmplxactive = nbcmplxactive +1;
153  cmplxisactive[cmplxid] = 1;
154  gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
155  }
156  }
157  }
158 
159 
160 
161 /*
162  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
163  magma_setdevice( dev );
164  magma_queue_sync( streams[ dev ][ dev ] );
165  }
166 */
167 
168 
169  //*******************************
170  // Each master GPU has the final
171  // result, either by receiving it
172  // from the CPU or by doing the addition
173  // itself, so now it is time
174  // to broadcast it over the GPUs of
175  // its board.
176  //*******************************
177  //printf("=======================================================================\n");
178  //printf(" sending \n");
179  //printf("=======================================================================\n");
180 
181  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
182  myngpu = gnode[cmplxid][MagmaMaxGPUs];
183  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
184  for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
185  dev = gnode[cmplxid][idev];
186  mycolsize = gpuisactive[dev];
187  if(mycolsize>0){
188  // I am an active GPU: send my portion locally
189  // to all active GPUs of my complex, and globally to the
190  // active masters of the other complexes, which should
191  // then send it out to their active slaves.
192  magma_setdevice( dev );
193  //==============================================
194  // sending to the master of the active complex
195  //==============================================
196  //printf ("\n\n**************GPU %d\n ",dev);
197  //printf (" GPU %d sending to cmplx masters\n",dev);
198  for( magma_int_t k = 0; k < nbcmplx; ++k ) {
199  if(k!=cmplxid){
200  gmaster = gnode[k][MagmaMaxGPUs+1];
201  if(gmaster!=-1){ //complex is active
202  //printf (" device %d from cmplx %d is sending to master %d on cmplx %d block of size %d event %d\n",dev,cmplxid,gmaster,k,mycolsize,redevents[dev][gmaster*ngpu+dev]);
203  magma_queue_wait_event(streams[ dev ][ gmaster ], redevents[dev][dev*ngpu+dev]);
204  cudaMemcpy2DAsync(&dwork[gmaster][maxgsize*dev], mycolsize*sizeof(magmaFloatComplex),
205  &dwork[dev][maxgsize*dev], mycolsize*sizeof(magmaFloatComplex),
206  mycolsize*sizeof(magmaFloatComplex), n,
207  cudaMemcpyDeviceToDevice, streams[dev][gmaster]);
208  magma_event_record(redevents[dev][gmaster*ngpu+dev], streams[dev][gmaster]);
209  }
210  }
211  }
212  //==============================================
213  //
214  //==============================================
215  // sending to the active GPUs of my complex
216  //==============================================
217  //printf (" GPU %d sending internal\n",dev);
218  for( magma_int_t l = 0; l < myngpu; ++l ) {
219  lcdev = gnode[cmplxid][l];
220  lccolsize = gpuisactive[lcdev];
221  if((lcdev!=dev)&&(lccolsize>0)){
222  //printf (" device %d from cmplx %d is sending internal to dev %d block of size %d event %d\n",dev,cmplxid,lcdev,mycolsize,redevents[dev][lcdev*ngpu+dev]);
223  magma_queue_wait_event(streams[ dev ][ lcdev ], redevents[dev][dev*ngpu+dev]);
224  cudaMemcpy2DAsync(&dwork[lcdev][maxgsize*dev], mycolsize*sizeof(magmaFloatComplex),
225  &dwork[dev][maxgsize*dev], mycolsize*sizeof(magmaFloatComplex),
226  mycolsize*sizeof(magmaFloatComplex), n,
227  cudaMemcpyDeviceToDevice, streams[dev][lcdev]);
228  magma_event_record(redevents[dev][lcdev*ngpu+dev], streams[dev][lcdev]);
229  }
230  }
231  //==============================================
232  }// end if mycolsize>0
233  }// for idev
234  }// for cmplxid
235 
236 
237  //printf("=======================================================================\n");
238  //printf(" master wait and resend internally \n");
239  //printf("=======================================================================\n");
240 
241  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
242  myngpu = gnode[cmplxid][MagmaMaxGPUs];
243  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
244  //==============================================
245  // if I am an active master, wait to receive the contributions
246  // of the GPUs of the other complexes and send them out locally
247  //==============================================
248  if(masterdev != -1){
249  mycolsize = gpuisactive[masterdev];
250  magma_setdevice( masterdev );
251  //printf(" GPU %d distributing internal\n",masterdev);
252  for( magma_int_t k = 0; k < nbcmplx; ++k ) {
253  if(k!=cmplxid){
254  gngpu = gnode[k][MagmaMaxGPUs];
255  for( magma_int_t g = 0; g < gngpu; ++g ) {
256  gdev = gnode[k][g];
257  gcolsize = gpuisactive[gdev];
258  // check if I received from this GPU,
259  // if yes send it to my group
260  if(gcolsize>0){
261  magma_queue_wait_event(streams[ masterdev ][ gdev ], redevents[gdev][masterdev*ngpu+gdev]);
262  for( magma_int_t l = 0; l < myngpu; ++l ) {
263  lcdev = gnode[cmplxid][l];
264  lccolsize = gpuisactive[lcdev];
265  if((lcdev!=masterdev)&&(lccolsize>0)){
266  //printf(" Master %d on cmplx %d waiting on event %d is distributing internal results of %d to lcdev %d block of size %d event %d\n", masterdev,cmplxid,redevents[gdev][masterdev*ngpu+gdev],gdev,lcdev,gcolsize,redevents[masterdev][lcdev*ngpu+gdev]);
267  cudaMemcpy2DAsync(&dwork[lcdev][maxgsize*gdev], gcolsize*sizeof(magmaFloatComplex),
268  &dwork[masterdev][maxgsize*gdev], gcolsize*sizeof(magmaFloatComplex),
269  gcolsize*sizeof(magmaFloatComplex), n,
270  cudaMemcpyDeviceToDevice, streams[masterdev][gdev]);
271  magma_event_record(redevents[masterdev][lcdev*ngpu+gdev], streams[masterdev][gdev]);
272  }
273  }
274  }
275  }
276  }
277  }
278  }// if active master
279  //==============================================
280  }// for cmplxid
281 
282 
283 
284 
285 
286 /*
287 
288  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
289  magma_setdevice( dev );
290  magma_queue_sync( streams[ dev ][ 0 ] );
291  for( magma_int_t s = 0; s < ngpu; ++s ) {
292  magma_queue_sync( streams[ dev ][ s ] );
293  }
294  }
295 */
296  //printf("=======================================================================\n");
297  //printf(" distributing \n");
298  //printf("=======================================================================\n");
299 
300  magma_int_t lcblki,gbblki,gblk,ib;
301 
302  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
303  myngpu = gnode[cmplxid][MagmaMaxGPUs];
304  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
305  for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
306  dev = gnode[cmplxid][idev];
307  mycolsize = gpuisactive[dev];
308  if(mycolsize>0){ // I am an active GPU
309  //printf("\n\n==============GPU %d collecting\n",dev);
310  magma_setdevice( dev );
311  // Collect my own results first, as there is no need to wait to
312  // receive anything; just wait until my GEMMs are done.
313  // In theory this should be inside the loop, but CUDA was not
314  // able to run it first for all GPUs, and on gpu>0 it was waiting,
315  // even though it was on a different stream and so should have run. Maybe
316  // this is because there are too many function calls and this makes
317  // CUDA not handle it so well. Anyway, it could be removed once CUDA
318  // is able to launch it first without waiting.
319  gdev = dev;
320  gcolsize = gpuisactive[gdev];
321  if(gcolsize>0){
322  devperm = (gdev-stdev+ngpu)%ngpu;
323  gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 );
324  magmablasSetKernelStream( streams[ dev ][ gdev ] );
325  magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
326  //printf (" GPU %d stream %d doing clacpy\n",dev,streams[ dev ][ gdev ]);
327  for( magma_int_t blki = 0; blki < gblk; ++blki){
328  gbblki = (blki*ngpu + devperm)*nb - blockoffset;
329  lcblki = blki*nb;
330  ib = nb;//min(nb,m-gbblki);
331  if(gdev==stdev){
332  lcblki = blki*nb-blockoffset;
333  if(blki==0){
334  gbblki = 0;
335  lcblki = 0;
336  ib = nb-blockoffset;
337  }
338  }
339  ib = min(ib,m-gbblki);
340  //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset,nbblk,stdev,gdev,gblk,gcolsize,blki,ib,n,lcblki,gbblki);
341  magmablas_clacpy( 'A', ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc);
342  }// end blki
343  }
344 
345 
346 
347  for( magma_int_t k = 0; k < nbcmplx; ++k ) {
348  gngpu = gnode[k][MagmaMaxGPUs];
349  for( magma_int_t g = 0; g < gngpu; ++g ) {
350  gdev = gnode[k][g];
351  gcolsize = gpuisactive[gdev];
352  // if gcolsize>0, ==> gpu gdev was active and so
353  // I received from him/computed a portion of dwork,
354  // so go over its gblk and distribute it on dC.
355  if(gdev!=dev){
356  if(gcolsize>0){
357  devperm = (gdev-stdev+ngpu)%ngpu;
358  gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 );
359  magmablasSetKernelStream( streams[ dev ][ gdev ] );
360  if(k==cmplxid){
361  // we are in the same group, so wait on the event issued by gdev for me, citing its id
362  magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
363  //printf (" GPU %d stream %d waiting on event %d to collecte from %d the size of gcolsize %d\n",dev,streams[ dev ][ gdev ],redevents[gdev][dev*ngpu+gdev],gdev,gcolsize);
364  }else{
365  // we are in different groups, so:
366  // if I am the master, wait on the event issued by gdev for me, citing its id;
367  // otherwise, wait on the event issued by my master for me on behalf of gdev
368  //printf (" GPU %d stream %d waiting on event %d to collecte from %d the size of gcolsize %d\n",dev,streams[ dev ][ gdev ],redevents[masterdev][dev*ngpu+gdev],gdev,gcolsize);
369  if(dev==masterdev)
370  magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
371  else
372  magma_queue_wait_event(streams[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev]);
373  }
374  //printf (" GPU %d stream %d doing clacpy\n",dev,streams[ dev ][ gdev ]);
375  for( magma_int_t blki = 0; blki < gblk; ++blki){
376  gbblki = (blki*ngpu + devperm)*nb - blockoffset;
377  lcblki = blki*nb;
378  ib = nb;//min(nb,m-gbblki);
379  if(gdev==stdev){
380  lcblki = blki*nb-blockoffset;
381  if(blki==0){
382  gbblki = 0;
383  lcblki = 0;
384  ib = nb-blockoffset;
385  }
386  }
387  ib = min(ib,m-gbblki);
388  //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset,nbblk,stdev,gdev,gblk,gcolsize,blki,ib,n,lcblki,gbblki);
389  magmablas_clacpy( 'A', ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc);
390  }// end blki
391  }// en gcolsize>0 meaning gdev is active
392  } // end if gdev != dev
393  }// end loop over the g gpus of the cmplx k
394  }//end loop over the complex k
395  }// end mycolsize>0 meaning that I am active
396  }// end loop over idev of cmplxid
397  }// end loop of the cmplx
398 
399 
400 
401 
402 
403 
404 
405  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
406  magma_setdevice( dev );
407  cudaDeviceSynchronize();
408  }
409 
410  // put back the input gpu and its input stream
411  magma_setdevice( cdev );
412  magmablasSetKernelStream( cstream );
413 
414 }
#define min(a, b)
Definition: common_magma.h:86
#define dC(dev, i, j)
magma_queue_t streams[MagmaMaxGPUs]
int magma_int_t
Definition: magmablas.h:12
void magma_cgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
magma_int_t ldda
void magma_queue_wait_event(magma_queue_t queue, magma_event_t event)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
#define dA(dev, i, j)
void magma_getdevice(magma_device_t *dev)
magma_int_t magma_ceildiv(magma_int_t a, magma_int_t b)
Definition: magma_bulge.h:16
#define MagmaMaxGPUs
Definition: magma_types.h:255
void magma_event_record(magma_event_t event, magma_queue_t queue)
#define MagmaConjTrans
Definition: magma.h:59
void magmablas_clacpy(magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb)
#define MagmaNoTrans
Definition: magma.h:57
#define dB(dev, i, j)
cublasStatus_t magmablasGetKernelStream(magma_queue_t *stream)
#define dwork(dev, i, j)

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_chemm_mgpu_spec33 ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  offset,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magmaFloatComplex_ptr  dVIN[],
magma_int_t  lddv,
magma_int_t  voffst,
magmaFloatComplex_ptr  dwork[],
magma_int_t  lddwork,
magmaFloatComplex *  C,
magma_int_t  ldc,
magmaFloatComplex *  work[],
magma_int_t  ldwork,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream,
magma_event_t  redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10],
magma_int_t  nbevents,
magma_int_t  gnode[MagmaMaxGPUs][MagmaMaxGPUs+2],
magma_int_t  nbcmplx 
)
magma_int_t magmablas_chemv ( magma_uplo_t  uplo,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dy,
magma_int_t  incy 
)

Here is the caller graph for this function:

magma_int_t magmablas_chemv2 ( magma_uplo_t  uplo,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dX,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dY,
magma_int_t  incy,
magmaFloatComplex_ptr  dwork,
magma_int_t  lwork 
)

Here is the caller graph for this function:

magma_int_t magmablas_chemv_mgpu ( magma_int_t  num_gpus,
magma_int_t  k,
char  uplo,
magma_int_t  n,
magma_int_t  nb,
magmaFloatComplex  alpha,
magmaFloatComplex **  da,
magma_int_t  ldda,
magma_int_t  offset,
magmaFloatComplex **  dx,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex **  dy,
magma_int_t  incy,
magmaFloatComplex **  dwork,
magma_int_t  ldwork,
magmaFloatComplex *  work,
magmaFloatComplex *  w,
magma_queue_t  stream[][10] 
)
magma_int_t magmablas_chemv_mgpu_32_offset ( char  uplo,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex **  A,
magma_int_t  lda,
magmaFloatComplex **  X,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex **  Y,
magma_int_t  incy,
magmaFloatComplex **  work,
magma_int_t  lwork,
magma_int_t  num_gpus,
magma_int_t  nb,
magma_int_t  offset,
magma_queue_t  stream[][10] 
)
magma_int_t magmablas_chemv_mgpu_offset ( char  uplo,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex **  A,
magma_int_t  lda,
magmaFloatComplex **  X,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex **  Y,
magma_int_t  incy,
magmaFloatComplex **  work,
magma_int_t  lwork,
magma_int_t  num_gpus,
magma_int_t  nb,
magma_int_t  offset,
magma_queue_t  stream[][10] 
)
magma_int_t magmablas_chemv_sync ( magma_int_t  num_gpus,
magma_int_t  k,
magma_int_t  n,
magmaFloatComplex *  work,
magmaFloatComplex *  w,
magma_queue_t  stream[][10] 
)
void magmablas_cher2k ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
float  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magmablas_cher2k_mgpu2 ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dA[],
magma_int_t  ldda,
magma_int_t  aoff,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magma_int_t  boff,
float  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magma_int_t  offset,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream 
)

Definition at line 157 of file cher2k_mgpu.cpp.

References __func__, dA, dB, dC, MAGMA_C_CNJG, MAGMA_C_MAKE, MAGMA_C_ONE, magma_cgemm(), magma_getdevice(), magma_setdevice(), magma_xerbla(), magmablasGetKernelStream(), magmablasSetKernelStream(), MagmaConjTrans, MagmaNoTrans, max, and min.

163 {
164  #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) )
165  #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) )
166  #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc)
167 
168  /* Check arguments */
169  magma_int_t info = 0;
170  if ( ! (uplo == 'l' || uplo == 'L')) {
171  info = -1; // 'u' not yet handled
172  } else if ( ! (trans == 'n' || trans == 'N')) {
173  info = -2; // 'c' not yet handled
174  } else if ( n < 0 ) {
175  info = -3;
176  } else if ( k < 0 ) {
177  info = -4;
178  } else if ( ((trans == 'n' || trans == 'N') && lda < max(1,n)) ||
179  ((trans == 'c' || trans == 'C') && lda < max(1,k)) ) {
180  info = -7;
181  } else if ( aoffset < 0 || aoffset > lda ) {
182  info = -8;
183  } else if ( ((trans == 'n' || trans == 'N') && ldb < max(1,n)) ||
184  ((trans == 'c' || trans == 'C') && ldb < max(1,k)) ) {
185  info = -10;
186  } else if ( boffset < 0 || boffset > ldb ) {
187  info = -11;
188  } else if ( ldc < max(1,n) ) {
189  info = -13;
190  } else if ( coffset < 0 || coffset > ldc ) {
191  info = -14;
192  } else if ( ngpu <= 0 ) {
193  info = -15;
194  } else if ( nb <= 0 ) {
195  info = -16;
196  } else if ( nstream <= 0 ) {
197  info = -18;
198  }
199  if ( info != 0 ) {
200  magma_xerbla( __func__, -(info) );
201  return;
202  }
203 
204  const magmaFloatComplex c_one = MAGMA_C_ONE;
205  magmaFloatComplex cbeta = MAGMA_C_MAKE( beta, 0. );
206 
207  magma_int_t ib, ioff, iblock, idev, di, s;
208 
209  magma_device_t cdev;
210  magma_queue_t cqueue;
211  magma_getdevice( &cdev );
212  magmablasGetKernelStream( &cqueue );
213 
214  // loop over all blocks
215  // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C
216  // blockoffset is offset within first block; for subsequent blocks it is 0
217  magma_int_t blockoffset = coffset % nb;
218  for( magma_int_t i = 0; i < n; i += ib ) {
219  ib = min( nb-blockoffset, n-i ); // block size
220  ioff = i + coffset; // global index in parent matrix
221  iblock = (ioff / nb) / ngpu; // local block id
222  idev = (ioff / nb) % ngpu; // device with this block
223  di = iblock*nb + blockoffset; // local index in parent matrix
224 
225  magma_setdevice( idev );
226  s = iblock % nstream;
227  magmablasSetKernelStream( streams[ idev ][ s ] );
228 
229  // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i]
230  //printf( "cgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
231  magma_cgemm( MagmaNoTrans, MagmaConjTrans, n-i, ib, k,
232  alpha, dA(idev,i,0), lda,
233  dB(idev,i,0), ldb,
234  cbeta, dC(idev,ioff,di), ldc );
235  blockoffset = 0;
236  }
237 
238  // second loop does C = conjf(alpha)*B*A' + C_hat
239  alpha = MAGMA_C_CNJG( alpha );
240  blockoffset = coffset % nb;
241  for( magma_int_t i = 0; i < n; i += ib ) {
242  ib = min( nb-blockoffset, n-i ); // block size
243  ioff = i + coffset; // global index in parent matrix
244  iblock = (ioff / nb) / ngpu; // local block id
245  idev = (ioff / nb) % ngpu; // device with this block
246  di = iblock*nb + blockoffset; // local index in parent matrix
247 
248  magma_setdevice( idev );
249  s = iblock % nstream;
250  magmablasSetKernelStream( streams[ idev ][ s ] );
251 
252  // C[i:n,i] += conjf(alpha) * B[i:n,0] * A[i,0]'
253  //printf( "cgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
254  magma_cgemm( MagmaNoTrans, MagmaConjTrans, n-i, ib, k,
255  alpha, dB(idev,i,0), ldb,
256  dA(idev,i,0), lda,
257  c_one, dC(idev,ioff,di), ldc );
258  blockoffset = 0;
259  }
260 
261  magma_setdevice( cdev );
262  magmablasSetKernelStream( cqueue );
263 }
#define min(a, b)
Definition: common_magma.h:86
#define __func__
Definition: common_magma.h:65
#define dA(dev, i, j)
magma_queue_t streams[MagmaMaxGPUs]
int magma_int_t
Definition: magmablas.h:12
#define MAGMA_C_CNJG(v, t)
Definition: magma.h:142
void magma_cgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
void magma_getdevice(magma_device_t *dev)
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define MagmaConjTrans
Definition: magma.h:59
#define dC(dev, i, j)
#define MAGMA_C_ONE
Definition: magma.h:154
#define MagmaNoTrans
Definition: magma.h:57
#define max(a, b)
Definition: common_magma.h:82
cublasStatus_t magmablasGetKernelStream(magma_queue_t *stream)
#define MAGMA_C_MAKE(r, i)
Definition: magma.h:145
#define dB(dev, i, j)

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_cher2k_mgpu_spec ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dA[],
magma_int_t  lda,
magma_int_t  aoff,
magmaFloatComplex_ptr  dB[],
magma_int_t  ldb,
magma_int_t  boff,
float  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  ldc,
magma_int_t  offset,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream 
)

Definition at line 160 of file cher2k_mgpu_spec.cpp.

References __func__, dA, dB, dC, MAGMA_C_CNJG, MAGMA_C_MAKE, MAGMA_C_ONE, magma_cgemm(), magma_getdevice(), magma_setdevice(), magma_xerbla(), magmablasGetKernelStream(), magmablasSetKernelStream(), MagmaConjTrans, MagmaNoTrans, max, and min.

166 {
167  #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) )
168  #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) )
169  #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc)
170 
171  /* Check arguments */
172  magma_int_t info = 0;
173  if ( ! (uplo == 'l' || uplo == 'L')) {
174  info = -1; // 'u' not yet handled
175  } else if ( ! (trans == 'n' || trans == 'N')) {
176  info = -2; // 'c' not yet handled
177  } else if ( n < 0 ) {
178  info = -3;
179  } else if ( k < 0 ) {
180  info = -4;
181  } else if ( ((trans == 'n' || trans == 'N') && lda < max(1,n)) ||
182  ((trans == 'c' || trans == 'C') && lda < max(1,k)) ) {
183  info = -7;
184  } else if ( aoffset < 0 || aoffset > lda ) {
185  info = -8;
186  } else if ( ((trans == 'n' || trans == 'N') && ldb < max(1,n)) ||
187  ((trans == 'c' || trans == 'C') && ldb < max(1,k)) ) {
188  info = -10;
189  } else if ( boffset < 0 || boffset > ldb ) {
190  info = -11;
191  } else if ( ldc < max(1,n) ) {
192  info = -13;
193  } else if ( coffset < 0 || coffset > ldc ) {
194  info = -14;
195  } else if ( ngpu <= 0 ) {
196  info = -15;
197  } else if ( nb <= 0 ) {
198  info = -16;
199  } else if ( nstream <= 0 ) {
200  info = -18;
201  }
202  if ( info != 0 ) {
203  magma_xerbla( __func__, -(info) );
204  return;
205  }
206 
207  const magmaFloatComplex c_one = MAGMA_C_ONE;
208  magmaFloatComplex cbeta = MAGMA_C_MAKE( beta, 0. );
209 
210  magma_int_t ib, ioff, iblock, idev, di, s;
211 
212  magma_device_t cdev;
213  magma_queue_t cqueue;
214  magma_getdevice( &cdev );
215  magmablasGetKernelStream( &cqueue );
216 
217  // loop over all blocks
218  // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C
219  // blockoffset is offset within first block; for subsequent blocks it is 0
220  magma_int_t blockoffset = coffset % nb;
221  for( magma_int_t i = 0; i < n; i += ib ) {
222  ib = min( nb-blockoffset, n-i ); // block size
223  ioff = i + coffset; // global index in parent matrix
224  iblock = (ioff / nb) / ngpu; // local block id
225  idev = (ioff / nb) % ngpu; // device with this block
226  di = iblock*nb + blockoffset; // local index in parent matrix
227 
228  magma_setdevice( idev );
229  s = iblock % nstream;
230  magmablasSetKernelStream( streams[ idev ][ s ] );
231 
232  // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i]
233  //printf( "cgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
234  magma_cgemm( MagmaNoTrans, MagmaConjTrans, n, ib, k,
235  alpha, dA(idev,0,0), lda,
236  dB(idev,i,0), ldb,
237  cbeta, dC(idev,coffset,di), ldc );
238  blockoffset = 0;
239  }
240 
241  // second loop does C = conjf(alpha)*B*A' + C_hat
242  alpha = MAGMA_C_CNJG( alpha );
243  blockoffset = coffset % nb;
244  for( magma_int_t i = 0; i < n; i += ib ) {
245  ib = min( nb-blockoffset, n-i ); // block size
246  ioff = i + coffset; // global index in parent matrix
247  iblock = (ioff / nb) / ngpu; // local block id
248  idev = (ioff / nb) % ngpu; // device with this block
249  di = iblock*nb + blockoffset; // local index in parent matrix
250 
251  magma_setdevice( idev );
252  s = iblock % nstream;
253  magmablasSetKernelStream( streams[ idev ][ s ] );
254 
255  // C[i:n,i] += conjf(alpha) * B[i:n,0] * A[i,0]'
256  //printf( "cgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
257  magma_cgemm( MagmaNoTrans, MagmaConjTrans, n, ib, k,
258  alpha, dB(idev,0,0), ldb,
259  dA(idev,i,0), lda,
260  c_one, dC(idev,coffset,di), ldc );
261  blockoffset = 0;
262  }
263 
264  magma_setdevice( cdev );
265  magmablasSetKernelStream( cqueue );
266 }
#define min(a, b)
Definition: common_magma.h:86
#define __func__
Definition: common_magma.h:65
#define dB(dev, i, j)
magma_queue_t streams[MagmaMaxGPUs]
int magma_int_t
Definition: magmablas.h:12
#define MAGMA_C_CNJG(v, t)
Definition: magma.h:142
void magma_cgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex_const_ptr dB, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, magma_int_t lddc)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
#define dC(dev, i, j)
void magma_getdevice(magma_device_t *dev)
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define dA(dev, i, j)
#define MagmaConjTrans
Definition: magma.h:59
#define MAGMA_C_ONE
Definition: magma.h:154
#define MagmaNoTrans
Definition: magma.h:57
#define max(a, b)
Definition: common_magma.h:82
cublasStatus_t magmablasGetKernelStream(magma_queue_t *stream)
#define MAGMA_C_MAKE(r, i)
Definition: magma.h:145

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_cher2k_mgpu_spec324 ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dVIN[],
magma_int_t  lddv,
magma_int_t  voff,
magmaFloatComplex_ptr  dWIN[],
magma_int_t  lddw,
magma_int_t  woff,
float  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magma_int_t  offset,
magmaFloatComplex_ptr  dwork[],
magma_int_t  lndwork,
magma_int_t  ngpu,
magma_int_t  nb,
magma_queue_t  streams[][20],
magma_int_t  nstream,
magma_event_t  redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10],
magma_int_t  nbevents 
)
void magmablas_cher2k_mgpu_spec325 ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_ptr  dVIN[],
magma_int_t  lddv,
magma_int_t  voff,
magmaFloatComplex_ptr  dWIN[],
magma_int_t  lddw,
magma_int_t  woff,
float  beta,
magmaFloatComplex_ptr  dC[],
magma_int_t  lddc,
magma_int_t  offset,
magmaFloatComplex_ptr  dwork[],
magma_int_t  lndwork,
magma_int_t  ngpu,
magma_int_t  nb,
magmaFloatComplex **  harray[],
magmaFloatComplex_ptr *  darray[],
magma_queue_t  streams[][20],
magma_int_t  nstream,
magma_event_t  redevents[][MagmaMaxGPUs *MagmaMaxGPUs+10],
magma_int_t  nbevents 
)
void magmablas_cherk ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
float  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
float  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magmablas_clacpy ( magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_ptr  dB,
magma_int_t  lddb 
)

Here is the caller graph for this function:

void magmablas_clacpy_batched ( magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr const *  dAarray,
magma_int_t  ldda,
magmaFloatComplex_ptr *  dBarray,
magma_int_t  lddb,
magma_int_t  batchCount 
)

Here is the caller graph for this function:

float magmablas_clange ( magma_norm_t  norm,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloat_ptr  dwork 
)
float magmablas_clanhe ( magma_norm_t  norm,
magma_uplo_t  uplo,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloat_ptr  dwork 
)

Here is the caller graph for this function:

float magmablas_clansy ( magma_norm_t  norm,
magma_uplo_t  uplo,
magma_int_t  n,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloat_ptr  dwork 
)
void magmablas_clascl ( char  type,
magma_int_t  kl,
magma_int_t  ku,
float  cfrom,
float  cto,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda,
magma_int_t *  info 
)

Here is the caller graph for this function:

void magmablas_claset ( magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)

Here is the caller graph for this function:

void magmablas_claset_identity ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)

Here is the caller graph for this function:

void magmablas_claswp ( magma_int_t  n,
magmaFloatComplex_ptr  dAT,
magma_int_t  ldda,
magma_int_t  i1,
magma_int_t  i2,
const magma_int_t *  ipiv,
magma_int_t  inci 
)

Here is the caller graph for this function:

void magmablas_claswp2 ( magma_int_t  n,
magmaFloatComplex_ptr  dAT,
magma_int_t  ldda,
magma_int_t  i1,
magma_int_t  i2,
const magma_int_t *  d_ipiv 
)

Here is the caller graph for this function:

void magmablas_claswpx ( magma_int_t  n,
magmaFloatComplex_ptr  dAT,
magma_int_t  ldx,
magma_int_t  ldy,
magma_int_t  i1,
magma_int_t  i2,
const magma_int_t *  ipiv,
magma_int_t  inci 
)

Here is the caller graph for this function:

void magmablas_cpermute_long2 ( magma_int_t  n,
magmaFloatComplex_ptr  dAT,
magma_int_t  ldda,
magma_int_t *  ipiv,
magma_int_t  nb,
magma_int_t  ind 
)

Here is the caller graph for this function:

void magmablas_cpermute_long3 ( magmaFloatComplex_ptr  dAT,
magma_int_t  ldda,
const magma_int_t *  ipiv,
magma_int_t  nb,
magma_int_t  ind 
)

Here is the caller graph for this function:

void magmablas_csetmatrix_transpose ( magma_int_t  m,
magma_int_t  n,
const magmaFloatComplex *  hA,
magma_int_t  lda,
magmaFloatComplex_ptr  dAT,
magma_int_t  ldda,
magmaFloatComplex_ptr  dwork,
magma_int_t  lddwork,
magma_int_t  nb 
)
void magmablas_csetmatrix_transpose_mgpu ( magma_int_t  ngpu,
magma_queue_t  stream[][2],
const magmaFloatComplex *  hA,
magma_int_t  lda,
magmaFloatComplex_ptr  dAT[],
magma_int_t  ldda,
magmaFloatComplex_ptr  dB[],
magma_int_t  lddb,
magma_int_t  m,
magma_int_t  n,
magma_int_t  nb 
)

Here is the caller graph for this function:

void magmablas_cswap ( magma_int_t  n,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_ptr  dB,
magma_int_t  lddb 
)

Here is the caller graph for this function:

void magmablas_cswapblk ( magma_storev_t  storev,
magma_int_t  n,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_ptr  dB,
magma_int_t  lddb,
magma_int_t  i1,
magma_int_t  i2,
const magma_int_t *  ipiv,
magma_int_t  inci,
magma_int_t  offset 
)

Here is the caller graph for this function:

void magmablas_cswapdblk ( magma_int_t  n,
magma_int_t  nb,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda,
magma_int_t  inca,
magmaFloatComplex_ptr  dB,
magma_int_t  lddb,
magma_int_t  incb 
)

Here is the caller graph for this function:

void magmablas_csymm ( magma_side_t  side,
magma_uplo_t  uplo,
magma_int_t  m,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magmablas_csymmetrize ( magma_uplo_t  uplo,
magma_int_t  m,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)

Here is the caller graph for this function:

void magmablas_csymmetrize_tiles ( magma_uplo_t  uplo,
magma_int_t  m,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda,
magma_int_t  ntile,
magma_int_t  mstride,
magma_int_t  nstride 
)

Here is the caller graph for this function:

magma_int_t magmablas_csymv ( magma_uplo_t  uplo,
magma_int_t  n,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dx,
magma_int_t  incx,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dy,
magma_int_t  incy 
)

Here is the caller graph for this function:

void magmablas_csyr2k ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex_const_ptr  dB,
magma_int_t  lddb,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magmablas_csyrk ( magma_uplo_t  uplo,
magma_trans_t  trans,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex  alpha,
magmaFloatComplex_const_ptr  dA,
magma_int_t  ldda,
magmaFloatComplex  beta,
magmaFloatComplex_ptr  dC,
magma_int_t  lddc 
)
void magmablas_ctranspose ( magmaFloatComplex_ptr  odata,
magma_int_t  ldo,
magmaFloatComplex_const_ptr  idata,
magma_int_t  ldi,
magma_int_t  m,
magma_int_t  n 
)

Here is the caller graph for this function:

void magmablas_ctranspose2 ( magmaFloatComplex_ptr  odata,
magma_int_t  ldo,
magmaFloatComplex_const_ptr  idata,
magma_int_t  ldi,
magma_int_t  m,
magma_int_t  n 
)

Here is the caller graph for this function:

void magmablas_ctranspose2s ( magmaFloatComplex_ptr  odata,
magma_int_t  ldo,
magmaFloatComplex_const_ptr  idata,
magma_int_t  ldi,
magma_int_t  m,
magma_int_t  n,
magma_queue_t  stream 
)
void magmablas_ctranspose_inplace ( magma_int_t  n,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda 
)

Here is the caller graph for this function:

void magmablas_scnrm2_adjust ( magma_int_t  k,
float *  xnorm,
magmaFloatComplex *  c 
)

Here is the caller graph for this function:

void magmablas_scnrm2_check ( magma_int_t  m,
magma_int_t  num,
magmaFloatComplex *  da,
magma_int_t  ldda,
float *  dxnorm,
float *  lsticc 
)

Here is the caller graph for this function:

void magmablas_scnrm2_cols ( magma_int_t  m,
magma_int_t  n,
magmaFloatComplex_ptr  dA,
magma_int_t  ldda,
magmaFloat_ptr  dxnorm 
)

Here is the caller graph for this function:

void magmablas_scnrm2_row_check_adjust ( magma_int_t  k,
float  tol,
float *  xnorm,
float *  xnorm2,
magmaFloatComplex *  c,
magma_int_t  ldc,
float *  lsticc 
)

Here is the caller graph for this function: