MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zlahru_m.cpp File Reference
#include "common_magma.h"
Include dependency graph for zlahru_m.cpp:

Go to the source code of this file.

Macros

#define PRECISION_z
 
#define dA(d, i, j)   (data->A [d] + (i) + (j)*ldda)
 
#define dTi(d)   (data->Ti[d])
 
#define dV(d, i, j)   (data->V [d] + (i) + (j)*ldv )
 
#define dVd(d, i, j)   (data->Vd[d] + (i) + (j)*ldvd)
 
#define dW(d, i, j)   (data->W [d] + (i) + (j)*ldda)
 
#define dY(d, i, j)   (data->Y [d] + (i) + (j)*ldda)
 

Functions

magma_int_t magma_zlahru_m (magma_int_t n, magma_int_t ihi, magma_int_t k, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, struct zgehrd_data *data)
 

Macro Definition Documentation

#define dA (   d,
  i,
 
)    (data->A [d] + (i) + (j)*ldda)
#define dTi (   d)    (data->Ti[d])
#define dV (   d,
  i,
 
)    (data->V [d] + (i) + (j)*ldv )
#define dVd (   d,
  i,
 
)    (data->Vd[d] + (i) + (j)*ldvd)
#define dW (   d,
  i,
 
)    (data->W [d] + (i) + (j)*ldda)
#define dY (   d,
  i,
 
)    (data->Y [d] + (i) + (j)*ldda)
#define PRECISION_z

Definition at line 13 of file zlahru_m.cpp.

Function Documentation

magma_int_t magma_zlahru_m ( magma_int_t  n,
magma_int_t  ihi,
magma_int_t  k,
magma_int_t  nb,
magmaDoubleComplex *  A,
magma_int_t  lda,
struct zgehrd_data *  data 
)

Definition at line 16 of file zlahru_m.cpp.

References __func__, dA, dTi, dV, dVd, dW, dY, zgehrd_data::ldda, zgehrd_data::ldv, zgehrd_data::ldvd, magma_indices_1D_bcyclic(), magma_setdevice(), magma_xerbla(), MAGMA_Z_NEG_ONE, MAGMA_Z_ONE, MAGMA_Z_ZERO, magma_zgemm(), magmablasSetKernelStream(), MagmaConjTrans, MagmaNoTrans, max, zgehrd_data::ngpu, and zgehrd_data::streams.

20 {
21 /* -- MAGMA (version 1.4.0) --
22  Univ. of Tennessee, Knoxville
23  Univ. of California, Berkeley
24  Univ. of Colorado, Denver
25  August 2013
26 
27  Purpose
28  =======
29  ZLAHRU is an auxiliary MAGMA routine that is used in ZGEHRD to update
30  the trailing sub-matrices after the reductions of the corresponding
31  panels.
32  See further details below.
33 
34  Arguments
35  =========
36  N (input) INTEGER
37  The order of the matrix A. N >= 0.
38 
39  IHI (input) INTEGER
40  Last row to update. Same as IHI in zgehrd.
41 
42  K (input) INTEGER
43  Number of rows of the matrix Am (see details below)
44 
45  NB (input) INTEGER
46  Block size
47 
48  A (output) COMPLEX_16 array, dimension (LDA,N-K)
49  On entry, the N-by-(N-K) general matrix to be updated. The
50  computation is done on the GPU. After Am is updated on the GPU
51  only Am(1:NB) is transferred to the CPU - to update the
52  corresponding Am matrix. See Further Details below.
53 
54  LDA (input) INTEGER
55  The leading dimension of the array A. LDA >= max(1,N).
56 
57  DA (input/output) COMPLEX_16 array on the GPU, dimension
58  (N,N-K). On entry, the N-by-(N-K) general matrix to be updated.
59  On exit, the 1st K rows (matrix Am) of A are updated by
60  applying an orthogonal transformation from the right
61  Am = Am (I-V T V'), and sub-matrix Ag is updated by
62  Ag = (I - V T V') Ag (I - V T V(NB+1:)' )
63  where Q = I - V T V' represent the orthogonal matrix
64  (as a product of elementary reflectors V) used to reduce
65  the current panel of A to upper Hessenberg form. After Am
66  is updated Am(:,1:NB) is sent to the CPU.
67  See Further Details below.
68 
69  DY (input/workspace) COMPLEX_16 array on the GPU, dimension
70  (N, NB). On entry the (N-K)-by-NB Y = A V. It is used internally
71  as workspace, so its value is changed on exit.
72 
73  DV (input/workspace) COMPLEX_16 array on the GPU, dimension
74  (N, NB). On entry the (N-K)-by-NB matrix V of elementary reflectors
75  used to reduce the current panel of A to upper Hessenberg form.
76  The rest K-by-NB part is used as workspace. V is unchanged on
77  exit.
78 
79  DT (input) COMPLEX_16 array on the GPU, dimension (NB, NB).
80  On entry the NB-by-NB upper triangular matrix defining the
81  orthogonal Hessenberg reduction transformation matrix for
82  the current panel. The lower triangular part is 0.
83 
84  DWORK (workspace) COMPLEX_16 array on the GPU, dimension N*NB.
85 
86  Further Details
87  ===============
88  This implementation follows the algorithm and notations described in:
89 
90  S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
91  form through hybrid GPU-based computing," University of Tennessee Computer
92  Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
93  May 24, 2009.
94 
95  The difference is that here Am is computed on the GPU.
96  M is renamed Am, G is renamed Ag.
97  ===================================================================== */
98 
99  #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda)
100  #define dTi( d ) (data->Ti[d])
101  #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv )
102  #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
103  #define dW( d, i, j ) (data->W [d] + (i) + (j)*ldda)
104  #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda)
105 
106  magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
107  magmaDoubleComplex c_one = MAGMA_Z_ONE;
108  magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
109 
110  magma_int_t ngpu = data->ngpu;
111  magma_int_t ldda = data->ldda;
112  magma_int_t ldv = data->ldv;
113  magma_int_t ldvd = data->ldvd;
114 
115  magma_int_t d;
116  magma_int_t dk, dkhi, dknb, dn;
117 
118  magma_int_t info = 0;
119  if (n < 0) {
120  info = -1;
121  } else if (ihi < 0 || ihi > n) {
122  info = -2;
123  } else if (k < 0 || k > n) {
124  info = -3;
125  } else if (nb < 1 || nb > n) {
126  info = -4;
127  } else if (lda < max(1,n)) {
128  info = -6;
129  }
130  if (info != 0) {
131  magma_xerbla( __func__, -(info) );
132  return info;
133  }
134 
135  for( d = 0; d < ngpu; ++d ) {
136  magma_setdevice( d );
137  magmablasSetKernelStream( data->streams[d] );
138 
139  // convert global indices (k) to local indices (dk)
140  magma_indices_1D_bcyclic( nb, ngpu, d, k, ihi, &dk, &dkhi );
141  magma_indices_1D_bcyclic( nb, ngpu, d, k+nb, n, &dknb, &dn );
142 
143  // -----
144  // on right, A := A Q = A - A V T V'
145  // Update Am = Am - Am V T Vd' = Am - Ym Wd', with Wd = Vd T'
146  // Wd = Vd T' = V(k:ihi-1, 0:nb-1) * T(0:nb-1, 0:nb-1)'
147  // Vd and Wd are the portions corresponding to the block cyclic distribution
148  magma_zgemm( MagmaNoTrans, MagmaConjTrans, dkhi-dk, nb, nb,
149  c_one, dVd(d, dk, 0), ldvd,
150  dTi(d), nb,
151  c_zero, dW (d, dk, 0), ldda );
152 
153  // Am = Am - Ym Wd' = A(0:k-1, k:ihi-1) - Ym(0:k-1, 0:nb-1) * W(k:ihi-1, 0:nb-1)'
154  magma_zgemm( MagmaNoTrans, MagmaConjTrans, k, dkhi-dk, nb,
155  c_neg_one, dY(d, 0, 0), ldda,
156  dW(d, dk, 0), ldda,
157  c_one, dA(d, 0, dk), ldda );
158 
159  // -----
160  // on right, A := A Q = A - A V T V'
161  // Update Ag = Ag - Ag V T V' = Ag - Yg Wd'
162  // Ag = Ag - Yg Wd' = A(k:ihi-1, nb:ihi-k-1) - Y(k:ihi-1, 0:nb-1) * W(k+nb:ihi-1, 0:nb-1)'
163  magma_zgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, dkhi-dknb, nb,
164  c_neg_one, dY(d, k, 0), ldda,
165  dW(d, dknb, 0), ldda,
166  c_one, dA(d, k, dknb), ldda );
167 
168  // -----
169  // on left, A := Q' A = A - V T' V' A
170  // Ag2 = Ag2 - V T' V' Ag2 = W Yg, with W = V T' and Yg = V' Ag2
171  // Note that Ag is A(k:ihi, nb+1:ihi-k)
172  // while Ag2 is A(k:ihi, nb+1: n -k)
173 
174  // here V and W are the whole matrices, not just block cyclic portion
175  // W = V T' = V(k:ihi-1, 0:nb-1) * T(0:nb-1, 0:nb-1)'
176  // TODO would it be cheaper to compute the whole matrix and
177  // copy the block cyclic portions to another workspace?
178  magma_zgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, nb, nb,
179  c_one, dV (d, k, 0), ldv,
180  dTi(d), nb,
181  c_zero, dW (d, k, 0), ldda );
182 
183  // Z = V(k:ihi-1, 0:nb-1)' * A(k:ihi-1, nb:n-k-1); Z is stored over Y
184  magma_zgemm( MagmaConjTrans, MagmaNoTrans, nb, dn-dknb, ihi-k,
185  c_one, dV(d, k, 0), ldv,
186  dA(d, k, dknb), ldda,
187  c_zero, dY(d, 0, 0), nb );
188 
189  // Ag2 = Ag2 - W Z = A(k:ihi-1, k+nb:n-1) - W(k+nb:n-1, 0:nb-1) * Z(0:nb-1, k+nb:n-1)
190  magma_zgemm( MagmaNoTrans, MagmaNoTrans, ihi-k, dn-dknb, nb,
191  c_neg_one, dW(d, k, 0), ldda,
192  dY(d, 0, 0), nb,
193  c_one, dA(d, k, dknb), ldda );
194  }
195 
196  return 0;
197 }
#define __func__
Definition: common_magma.h:65
magma_int_t ldv
#define MAGMA_Z_NEG_ONE
Definition: magma.h:134
magma_int_t ldvd
int magma_int_t
Definition: magmablas.h:12
void magma_zgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dB, magma_int_t lddb, magmaDoubleComplex beta, magmaDoubleComplex_ptr dC, magma_int_t lddc)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
#define dW(d, i, j)
void magma_indices_1D_bcyclic(magma_int_t nb, magma_int_t ngpu, magma_int_t dev, magma_int_t j0, magma_int_t j1, magma_int_t *dj0, magma_int_t *dj1)
Definition: auxiliary.cpp:220
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define MagmaConjTrans
Definition: magma.h:59
#define dA(d, i, j)
#define MAGMA_Z_ZERO
Definition: magma.h:131
magma_int_t ldda
#define dTi(d)
#define dVd(d, i, j)
#define dV(d, i, j)
magma_queue_t streams[MagmaMaxGPUs]
#define MAGMA_Z_ONE
Definition: magma.h:132
#define MagmaNoTrans
Definition: magma.h:57
#define max(a, b)
Definition: common_magma.h:82
#define dY(d, i, j)

Here is the call graph for this function:

Here is the caller graph for this function: