MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
dgeqrf_mgpu-trace.cpp File Reference
#include "common_magma.h"
#include <sys/time.h>
#include <assert.h>
Include dependency graph for dgeqrf_mgpu-trace.cpp:

Go to the source code of this file.

Macros

#define MultiGPUs
#define MAX_THREADS   5
#define MAX_EVENTS   1048576
#define core_cpu_event_start(my_core_id)   event_start_time[my_core_id] = get_current_cpu_time(); \
#define core_cpu_event_end(my_core_id)   event_end_time[my_core_id] = get_current_cpu_time(); \
#define core_gpu_event_start(my_core_id, e1, e2)
#define core_gpu_event_end(my_core_id, e1, e2)
#define core_log_event(event, my_core_id)
#define dlA(gpu, a_1, a_2)   ( dlA[gpu]+(a_2)*(ldda) + (a_1))
#define work_ref(a_1)   ( work + (a_1))
#define hwork   ( work + (nb)*(m))
#define hwrk_ref(a_1)   ( local_work + (a_1))
#define lhwrk   ( local_work + (nb)*(m))

Functions

double get_current_cpu_time (void)
int event_num[MAX_THREADS__attribute__ ((aligned(128)))
void dump_trace (int cores_num)
magma_int_t magma_dgeqrf2_mgpu (int num_gpus, magma_int_t m, magma_int_t n, double **dlA, magma_int_t ldda, double *tau, magma_int_t *info)

Variables

int log_events = 1

Macro Definition Documentation

#define core_cpu_event_end (   my_core_id)    event_end_time[my_core_id] = get_current_cpu_time(); \

Definition at line 42 of file dgeqrf_mgpu-trace.cpp.

#define core_cpu_event_start (   my_core_id)    event_start_time[my_core_id] = get_current_cpu_time(); \

Definition at line 39 of file dgeqrf_mgpu-trace.cpp.

#define core_gpu_event_end (   my_core_id,
  e1,
  e2 
)
Value:
cudaEventElapsedTime(&ctime, e1, e2); \
event_end_time[my_core_id] = ctime/1000.+dtime; \

Definition at line 49 of file dgeqrf_mgpu-trace.cpp.

#define core_gpu_event_start (   my_core_id,
  e1,
  e2 
)
Value:
cudaEventElapsedTime(&ctime, e1, e2); \
event_start_time[my_core_id] = ctime/1000.+dtime; \

Definition at line 45 of file dgeqrf_mgpu-trace.cpp.

#define core_log_event (   event,
  my_core_id 
)
Value:
event_log[my_core_id][event_num[my_core_id]+0] = my_core_id;\
event_log[my_core_id][event_num[my_core_id]+1] = event_start_time[my_core_id];\
event_log[my_core_id][event_num[my_core_id]+2] = event_end_time[my_core_id];\
event_log[my_core_id][event_num[my_core_id]+3] = (event);\
event_num[my_core_id] += (log_events << 2); \
event_num[my_core_id] &= (MAX_EVENTS-1);

Definition at line 53 of file dgeqrf_mgpu-trace.cpp.

#define dlA (   gpu,
  a_1,
  a_2 
)    ( dlA[gpu]+(a_2)*(ldda) + (a_1))
#define hwork   ( work + (nb)*(m))
#define hwrk_ref (   a_1)    ( local_work + (a_1))
#define lhwrk   ( local_work + (nb)*(m))
#define MAX_EVENTS   1048576

Definition at line 31 of file dgeqrf_mgpu-trace.cpp.

#define MAX_THREADS   5

Definition at line 28 of file dgeqrf_mgpu-trace.cpp.

#define MultiGPUs

Definition at line 13 of file dgeqrf_mgpu-trace.cpp.

#define work_ref (   a_1)    ( work + (a_1))

Function Documentation

int event_num [MAX_THREADS] __attribute__ ( (aligned(128))  )
void dump_trace ( int  cores_num)

Definition at line 61 of file dgeqrf_mgpu-trace.cpp.

References fopen.

{
char trace_file_name[32];
FILE *trace_file;
int event;
int core;
double scale = 100000.0;
double large = 100.0;
sprintf(trace_file_name, "trace.svg");
trace_file = fopen(trace_file_name, "w");
assert(trace_file != NULL);
fprintf(trace_file,
"<?xml version=\"1.0\" standalone=\"no\"?>"
"<svg version=\"1.1\" baseProfile=\"full\" xmlns=\"http://www.w3.org/2000/svg\" "
"xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns:ev=\"http://www.w3.org/2001/xml-events\" "
">\n"
" <g font-size=\"20\">\n");
for (core = 0; core < cores_num; core++)
for (event = 0; event < event_num[core]; event += 4)
{
int thread = event_log[core][event+0];
double start = event_log[core][event+1];
double end = event_log[core][event+2];
int color = event_log[core][event+3];
start -= event_log[core][2];
end -= event_log[core][2];
/*
fprintf(trace_file,
" "
"<rect x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
"fill=\"#%06x\" stroke=\"#000000\" stroke-width=\"1\"/>\n",
start * scale,
thread * 100.0,
(end - start) * scale,
90.0,
color);
*/
fprintf(trace_file,
" "
"<rect x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
// "fill=\"#%06x\" />\n",
"fill=\"#%06x\" stroke=\"#000000\" stroke-width=\"1\"/>\n",
start * scale,
thread * (large+20.0),
(end - start) * scale,
large,
color);
}
fprintf(trace_file,
" </g>\n"
"</svg>\n");
fclose(trace_file);
}
double get_current_cpu_time ( void  )

Definition at line 19 of file dgeqrf_mgpu-trace.cpp.

{
struct timeval time_val;
gettimeofday(&time_val, NULL);
return (double)(time_val.tv_sec) + (double)(time_val.tv_usec) / 1000000.0;
}
magma_int_t magma_dgeqrf2_mgpu ( int  num_gpus,
magma_int_t  m,
magma_int_t  n,
double **  dlA,
magma_int_t  ldda,
double *  tau,
magma_int_t info 
)

Definition at line 125 of file dgeqrf_mgpu-trace.cpp.

References __func__, core_cpu_event_end, core_cpu_event_start, core_gpu_event_end, core_gpu_event_start, core_log_event, dpanel_to_q(), dq_to_panel(), dump_trace(), dwork, get_current_cpu_time(), hwrk_ref, lapackf77_dgeqrf(), lapackf77_dlarft(), lhwrk, magma_device_sync(), magma_dgetmatrix(), magma_dgetmatrix_async(), magma_dlarfb_gpu(), magma_dmalloc(), magma_dmalloc_host(), magma_dsetmatrix(), magma_dsetmatrix_async(), MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_event_create(), magma_event_record(), magma_free(), magma_get_dgeqrf_nb(), magma_getdevice(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_setdevice(), MAGMA_SUCCESS, magma_xerbla(), magmablas_dgetmatrix_1D_bcyclic(), magmablas_dsetmatrix_1D_bcyclic(), MagmaColumnwise, MagmaColumnwiseStr, MagmaForward, MagmaForwardStr, MagmaLeft, MagmaTrans, MagmaUpper, max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
DGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A:
A = Q * R. This is a GPU interface of the routine.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
On entry, the M-by-N matrix dA.
On exit, the elements on and above the diagonal of the array
contain the min(M,N)-by-N upper trapezoidal matrix R (R is
upper triangular if m >= n); the elements below the diagonal,
with the array TAU, represent the orthogonal matrix Q as a
product of min(m,n) elementary reflectors (see Further
Details).
LDDA (input) INTEGER
The leading dimension of the array dA. LDDA >= max(1,M).
To benefit from coalescent memory accesses LDDA must be
dividable by 16.
TAU (output) DOUBLE_PRECISION array, dimension (min(M,N))
The scalar factors of the elementary reflectors (see Further
Details).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occured, such as memory allocation failed.
Further Details
===============
The matrix Q is represented as a product of elementary reflectors
Q = H(1) H(2) . . . H(k), where k = min(m,n).
Each H(i) has the form
H(i) = I - tau * v * v'
where tau is a real scalar, and v is a real vector with
v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
and tau in TAU(i).
===================================================================== */
#define dlA(gpu,a_1,a_2) ( dlA[gpu]+(a_2)*(ldda) + (a_1))
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
#define hwrk_ref(a_1) ( local_work + (a_1))
#define lhwrk ( local_work + (nb)*(m))
double *dwork[4], *panel[4], *local_work;
magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows;
magma_int_t nbmin, nx, ib, nb;
magma_int_t lhwork, lwork;
magma_int_t cdevice;
magma_getdevice(&cdevice);
float ctime, dtime;
int panel_gpunum=-1, i_local, n_local[4], la_gpu, displacement;
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = min(m,n);
if (k == 0)
return *info;
displacement = n * nb;
lwork = max((m+n+64) * nb,n*m+n*nb);
lhwork = lwork - (m)*nb;
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
if (MAGMA_SUCCESS != magma_dmalloc( &(dwork[i]), (n + ldda)*nb )) {
return *info;
}
}
/* Set the number of local n for each GPU */
for(i=0; i<num_gpus; i++){
n_local[i] = ((n/nb)/num_gpus)*nb;
if (i < (n/nb)%num_gpus)
n_local[i] += nb;
else if (i == (n/nb)%num_gpus)
n_local[i] += n%nb;
}
if (MAGMA_SUCCESS != magma_dmalloc_host( &local_work, lwork )) {
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
magma_free( dwork[i] );
}
return *info;
}
static cudaStream_t streaml[4][2];
cudaEvent_t start[4], stop[4][10];
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
magma_queue_create( &streaml[i][0] );
magma_queue_create( &streaml[i][1] );
magma_event_create( &start[i] );
for(j=0; j<10; j++)
magma_event_create( &stop[i][j] );
magma_event_record( start[i], 0 );
}
core_cpu_event_end(num_gpus);
core_log_event(0x666666, num_gpus);
for(j=0; j<num_gpus; j++){
magma_event_record(stop[j][0], 0);
magma_event_record(stop[j][1], 0);
core_gpu_event_start(j, start[j], stop[j][0]);
core_gpu_event_end(j, start[j], stop[j][1]);
core_log_event(0x666666, j);
}
nbmin = 2;
nx = nb;
ldwork = m;
lddwork= n;
// magmablasSetKernelStream(streaml[0][0]);
if (nb >= nbmin && nb < k && nx < k) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nx; i += nb)
{
/* Set the GPU number that holds the current panel */
panel_gpunum = (i/nb)%num_gpus;
/* Set the local index where the current panel is */
i_local = i/(nb*num_gpus)*nb;
ib = min(k-i, nb);
rows = m -i;
/* Send current panel to the CPU */
#ifdef MultiGPUs
magma_setdevice(panel_gpunum);
#endif
dlA(panel_gpunum, i, i_local), ldda,
hwrk_ref(i), ldwork, streaml[panel_gpunum][1] );
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead
application to the trailing matrix */
la_gpu = panel_gpunum;
/* only the GPU that has next panel is done look-ahead */
#ifdef MultiGPUs
magma_setdevice(la_gpu);
#endif
magma_event_record(stop[la_gpu][0], 0);
m-old_i, n_local[la_gpu]-i_local-old_ib, old_ib,
panel[la_gpu], ldda, dwork[la_gpu], lddwork,
dlA(la_gpu, old_i, i_local+old_ib), ldda,
dwork[la_gpu]+old_ib, lddwork);
magma_event_record(stop[la_gpu][1], 0);
la_gpu = ((i-nb)/nb)%num_gpus;
#ifdef MultiGPUs
magma_setdevice(la_gpu);
#endif
magma_dsetmatrix_async( old_ib, old_ib,
hwrk_ref(old_i), ldwork,
panel[la_gpu], ldda, streaml[la_gpu][0] );
}
#ifdef MultiGPUs
magma_setdevice(panel_gpunum);
#endif
magma_queue_sync( streaml[panel_gpunum][1] );
lapackf77_dgeqrf(&rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &lhwork, info);
// Form the triangular factor of the block reflector
// H = H(i) H(i+1) . . . H(i+ib-1)
&rows, &ib,
hwrk_ref(i), &ldwork, tau+i, lhwrk, &ib);
dpanel_to_q( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib );
core_cpu_event_end(num_gpus);
core_log_event(0x006680, num_gpus);
// Send the current panel back to the GPUs
// Has to be done with asynchronous copies
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
if (j == panel_gpunum)
panel[j] = dlA(j, i, i_local);
else
panel[j] = dwork[j]+displacement;
hwrk_ref(i), ldwork,
panel[j], ldda, streaml[j][0] );
}
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
magma_queue_sync( streaml[j][0] );
}
core_cpu_event_end(num_gpus);
core_log_event(0xDD0000, num_gpus);
//=================== take the values of all counters
/* Restore the panel */
dq_to_panel( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib );
core_cpu_event_end(num_gpus);
core_log_event(0x006680, num_gpus);
if (i>0){
for(j=0; j<num_gpus; j++){
core_gpu_event_start(j, start[j], stop[j][2]);
core_gpu_event_end(j, start[j], stop[j][3]);
core_log_event(0x880000, j);
core_gpu_event_start(j, start[j], stop[j][3]);
core_gpu_event_end(j, start[j], stop[j][0]);
core_log_event(0xDD0000, j);
}
}
magma_setdevice(panel_gpunum);
core_gpu_event_start(panel_gpunum, start[panel_gpunum], stop[panel_gpunum][0]);
core_gpu_event_end(panel_gpunum, start[panel_gpunum], stop[panel_gpunum][1]);
core_log_event(0x660000, panel_gpunum);
if (i + ib < n)
{
/* Send the T matrix to the GPU.
Has to be done with asynchronous copies */
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
lhwrk, ib,
dwork[j], lddwork, streaml[j][0] );
}
if (i+nb < k-nx)
{
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left;
This is update for the next panel; part of the look-ahead */
la_gpu = (panel_gpunum+1)%num_gpus;
int i_loc = (i+nb)/(nb*num_gpus)*nb;
for(j=0; j<num_gpus; j++){
#ifdef MultiGPUs
#endif
//magma_queue_sync( streaml[j][0] );
magma_event_record(stop[j][2], 0);
if (j==la_gpu)
rows, ib, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_loc), ldda, dwork[j]+ib, lddwork);
else if (j<=panel_gpunum)
rows, n_local[j]-i_local-ib, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_local+ib), ldda, dwork[j]+ib, lddwork);
else
rows, n_local[j]-i_local, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_local), ldda, dwork[j]+ib, lddwork);
magma_event_record(stop[j][3], 0);
}
}
else {
/* do the entire update as we exit and there would be no lookahead */
la_gpu = (panel_gpunum+1)%num_gpus;
int i_loc = (i+nb)/(nb*num_gpus)*nb;
#ifdef MultiGPUs
magma_setdevice(la_gpu);
#endif
rows, n_local[la_gpu]-i_loc, ib,
panel[la_gpu], ldda, dwork[la_gpu], lddwork,
dlA(la_gpu, i, i_loc), ldda, dwork[la_gpu]+ib, lddwork);
#ifdef MultiGPUs
magma_setdevice(panel_gpunum);
#endif
hwrk_ref(i), ldwork,
dlA(panel_gpunum, i, i_local), ldda );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
for(j=0; j<num_gpus; j++){
#ifdef MultiGPUs
#endif
magma_free( dwork[j] );
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
lhwork = lwork - rows*ib;
panel_gpunum = (panel_gpunum+1)%num_gpus;
int i_loc = (i)/(nb*num_gpus)*nb;
#ifdef MultiGPUs
magma_setdevice(panel_gpunum);
#endif
if (i == 0) {
magmablas_dgetmatrix_1D_bcyclic(m, n, dlA, ldda, lhwrk, m, num_gpus, nb);
} else {
magma_dgetmatrix( rows, ib,
dlA(panel_gpunum, i, i_loc), ldda,
lhwrk, rows );
}
lhwork = lwork - rows*ib;
lapackf77_dgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info);
if (i == 0) {
magmablas_dsetmatrix_1D_bcyclic(m, n, lhwrk, m, dlA, ldda, num_gpus, nb);
} else {
magma_dsetmatrix( rows, ib,
lhwrk, rows,
dlA(panel_gpunum, i, i_loc), ldda );
}
}
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
magma_queue_destroy( streaml[i][0] );
magma_queue_destroy( streaml[i][1] );
}
magma_setdevice(cdevice);
dump_trace(num_gpus+1);
return *info;
} /* magma_dgeqrf2_mgpu */

Here is the call graph for this function:

Here is the caller graph for this function:


Variable Documentation

int log_events = 1

Definition at line 37 of file dgeqrf_mgpu-trace.cpp.