PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
compute_d.h File Reference
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros

#define plasma_ddesc_alloc(descA, mb, nb, lm, ln, i, j, m, n, free)
#define plasma_dooplap2tile(descA, A, mb, nb, lm, ln, i, j, m, n, free)
#define plasma_diplap2tile(descA, A, mb, nb, lm, ln, i, j, m, n)
#define plasma_dooptile2lap(descA, A, mb, nb, lm, ln)
#define plasma_diptile2lap(descA, A, mb, nb, lm, ln)

Functions

void plasma_pdgeadd (plasma_context_t *plasma)
void plasma_pdgelqf (plasma_context_t *plasma)
void plasma_pdgemm (plasma_context_t *plasma)
void plasma_pdgeqrf (plasma_context_t *plasma)
void plasma_pdgerbb (plasma_context_t *plasma)
void plasma_pdgetmi2 (plasma_context_t *plasma)
void plasma_pdgetrf_incpiv (plasma_context_t *plasma)
void plasma_pdlacpy (plasma_context_t *plasma)
void plasma_pdlag2s (plasma_context_t *plasma)
void plasma_pdlange (plasma_context_t *plasma)
void plasma_pdlansy (plasma_context_t *plasma)
void plasma_pdpack (plasma_context_t *plasma)
void plasma_pdplgsy (plasma_context_t *plasma)
void plasma_pdplrnt (plasma_context_t *plasma)
void plasma_pdpotrf (plasma_context_t *plasma)
void plasma_pdshift (plasma_context_t *plasma)
void plasma_pdsymm (plasma_context_t *plasma)
void plasma_pdsyrk (plasma_context_t *plasma)
void plasma_pdsyr2k (plasma_context_t *plasma)
void plasma_pdtrmm (plasma_context_t *plasma)
void plasma_pdtrsm (plasma_context_t *plasma)
void plasma_pdtrsmpl (plasma_context_t *plasma)
void plasma_pdtrsmrv (plasma_context_t *plasma)
void plasma_pdorglq (plasma_context_t *plasma)
void plasma_pdorgqr (plasma_context_t *plasma)
void plasma_pdorgqrrh (plasma_context_t *plasma)
void plasma_pdormlq (plasma_context_t *plasma)
void plasma_pdormqr (plasma_context_t *plasma)
void plasma_pdunpack (plasma_context_t *plasma)
int plasma_dshift (plasma_context_t *plasma, int m, int n, double *A, int nprob, int me, int ne, int L, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgeadd_quark (double alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdbarrier_tl2pnl_quark (PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdbarrier_pnl2tl_quark (PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdbarrier_tl2row_quark (PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdbarrier_row2tl_quark (PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgelqf_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgelqfrh_quark (PLASMA_desc A, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgemm_quark (PLASMA_enum transA, PLASMA_enum transB, double alpha, PLASMA_desc A, PLASMA_desc B, double beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgeqrf_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgeqrfrh_quark (PLASMA_desc A, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgerbh_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgerbb_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgerbbrh_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgetmi2_quark (PLASMA_enum idep, PLASMA_enum odep, PLASMA_enum storev, int m, int n, int mb, int nb, double *A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgetrf_incpiv_quark (PLASMA_desc A, PLASMA_desc L, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgetrf_reclap_quark (PLASMA_desc A, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgetrf_rectil_quark (PLASMA_desc A, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdsygst_quark (PLASMA_enum itype, PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdsyrbt_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdgbrdb_quark (PLASMA_enum uplo, PLASMA_desc A, double *D, double *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdsbrdt_quark (PLASMA_enum uplo, PLASMA_desc A, double *D, double *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlacpy_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlag2s_quark (PLASMA_desc A, PLASMA_desc SB, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlange_quark (PLASMA_enum norm, PLASMA_desc A, double *work, double *result, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlansy_quark (PLASMA_enum norm, PLASMA_enum uplo, PLASMA_desc A, double *work, double *result, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlaset_quark (PLASMA_enum uplo, double alpha, double beta, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlaset2_quark (PLASMA_enum uplo, double alpha, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlaswp_quark (PLASMA_desc B, int *IPIV, int inc, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlaswpc_quark (PLASMA_desc B, int *IPIV, int inc, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdlauum_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdplgsy_quark (double bump, PLASMA_desc A, unsigned long long int seed, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdplrnt_quark (PLASMA_desc A, unsigned long long int seed, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdpotrf_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdshift_quark (int, int, int, double *, int *, int, int, PLASMA_sequence *, PLASMA_request *)
void plasma_pdsymm_quark (PLASMA_enum side, PLASMA_enum uplo, double alpha, PLASMA_desc A, PLASMA_desc B, double beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdsyrk_quark (PLASMA_enum uplo, PLASMA_enum trans, double alpha, PLASMA_desc A, double beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdsyr2k_quark (PLASMA_enum uplo, PLASMA_enum trans, double alpha, PLASMA_desc A, PLASMA_desc B, double beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdtrmm_quark (PLASMA_enum side, PLASMA_enum uplo, PLASMA_enum transA, PLASMA_enum diag, double alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdtrsm_quark (PLASMA_enum side, PLASMA_enum uplo, PLASMA_enum transA, PLASMA_enum diag, double alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdtrsmpl_quark (PLASMA_desc A, PLASMA_desc B, PLASMA_desc L, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdtrsmrv_quark (PLASMA_enum side, PLASMA_enum uplo, PLASMA_enum transA, PLASMA_enum diag, double alpha, PLASMA_desc A, PLASMA_desc W, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdtrtri_quark (PLASMA_enum uplo, PLASMA_enum diag, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdorgbr_quark (PLASMA_enum side, PLASMA_desc A, PLASMA_desc O, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdorgbrrh_quark (PLASMA_enum side, PLASMA_desc A, PLASMA_desc O, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdorgqr_quark (PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdorgqrrh_quark (PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdorglq_quark (PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdorglqrh_quark (PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdorgtr_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdormqr_quark (PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdormqrrh_quark (PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdormlq_quark (PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdormlqrh_quark (PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)

Detailed Description

PLASMA auxiliary routines. PLASMA is a software package provided by Univ. of Tennessee, Univ. of California Berkeley and Univ. of Colorado Denver.

Version:
2.4.5
Author:
Jakub Kurzak
Mathieu Faverge
Date:
2010-11-15 (generated: precision d, Tue Nov 22 14:35:45 2011)

Definition in file compute_d.h.


Macro Definition Documentation

#define plasma_ddesc_alloc (   descA,
  mb,
  nb,
  lm,
  ln,
  i,
  j,
  m,
  n,
  free 
)
Value:
/* Initialise descA as an (m)-by-(n) double-precision tile descriptor with */ \
/* (mb)-by-(nb) tiles and allocate its storage. On allocation failure the  */ \
/* caller-supplied `free` statement runs and the enclosing function        */ \
/* returns PLASMA_ERR_OUT_OF_RESOURCES, so the descriptor is never used    */ \
/* unallocated.                                                            */ \
descA = plasma_desc_init( \
    PlasmaRealDouble, (mb), (nb), ((mb)*(nb)), \
    (m), (n), (i), (j), (m), (n)); \
if ( plasma_desc_mat_alloc( &(descA) ) ) { \
    plasma_error( __func__, "plasma_shared_alloc() failed"); \
    {free;}; \
    return PLASMA_ERR_OUT_OF_RESOURCES; \
}

Macro for matrix conversion / Lapack interface

Definition at line 20 of file compute_d.h.

#define plasma_diplap2tile (   descA,
  A,
  mb,
  nb,
  lm,
  ln,
  i,
  j,
  m,
  n 
)
Value:
/* In-place variant: descA is initialised over the caller's buffer A      */ \
/* (descA.mat = A, no allocation) and PLASMA_dgecfi_Async is issued with  */ \
/* PlasmaCM then PlasmaCCRB as its layout arguments. `sequence` and       */ \
/* `request` must be in scope at the expansion site.                      */ \
descA = plasma_desc_init( \
PlasmaRealDouble, (mb), (nb), ((mb)*(nb)), \
(lm), (ln), (i), (j), (m), (n)); \
descA.mat = A; \
PLASMA_dgecfi_Async((lm), (ln), (A), PlasmaCM, (mb), (nb), \
PlasmaCCRB, (mb), (nb), sequence, &request);

Definition at line 47 of file compute_d.h.

#define plasma_diptile2lap (   descA,
  A,
  mb,
  nb,
  lm,
  ln 
)
Value:
/* In-place variant: issues PLASMA_dgecfi_Async on A with PlasmaCCRB then */ \
/* PlasmaCM as its layout arguments (the reverse order of                 */ \
/* plasma_diplap2tile). descA is not referenced; `sequence` and `request` */ \
/* must be in scope at the expansion site.                                */ \
PLASMA_dgecfi_Async((lm), (ln), (A), PlasmaCCRB, (mb), (nb), \
PlasmaCM, (mb), (nb), sequence, &request);

Definition at line 65 of file compute_d.h.

#define plasma_dooplap2tile (   descA,
  A,
  mb,
  nb,
  lm,
  ln,
  i,
  j,
  m,
  n,
  free 
)
Value:
descA = plasma_desc_init( \
PlasmaRealDouble, (mb), (nb), ((mb)*(nb)), \
(lm), (ln), (i), (j), (m), (n)); \
if ( plasma_desc_mat_alloc( &(descA) ) ) { \
plasma_error( __func__, "plasma_shared_alloc() failed"); \
{free;}; \
} \
plasma_parallel_call_5( \
double*, (A), \
int, (lm), \
PLASMA_desc, (descA), \
PLASMA_sequence*, sequence, \
PLASMA_request*, &request);

Definition at line 30 of file compute_d.h.

#define plasma_dooptile2lap (   descA,
  A,
  mb,
  nb,
  lm,
  ln 
)
Value:
PLASMA_desc, (descA), \
double*, (A), \
int, (lm), \
PLASMA_sequence*, sequence, \
PLASMA_request*, &request);

Definition at line 57 of file compute_d.h.


Function Documentation

int plasma_dshift ( plasma_context_t *  plasma,
int  m,
int  n,
double *  A,
int  nprob,
int  me,
int  ne,
int  L,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Declarations of internal sequential functions


plasma_dgetmi2: implementation of in-place transposition based on the GKK algorithm by Gustavson, Karlsson and Kågström. This algorithm shifts some cycles to transpose the matrix.

Parameters:
[in] m: Number of rows of matrix A
[in] n: Number of columns of matrix A
[in,out] A: Matrix of size L*m*n
[in] nprob: Number of parallel and independent problems
[in] me: Number of rows of the problem
[in] ne: Number of columns in the problem
[in] L: Size of chunk to use for transformation

Definition at line 60 of file pdshift.c.

References GKK_BalanceLoad(), GKK_getLeaderNbr(), L, minloc(), plasma_dynamic_call_9, PLASMA_ERR_ILLEGAL_VALUE, plasma_error(), PLASMA_GRPSIZE, plasma_pdshift(), plasma_request_fail(), PLASMA_SCHEDULING, plasma_shared_alloc(), plasma_shared_free(), PLASMA_SIZE, plasma_static_call_9, PLASMA_STATIC_SCHEDULING, PLASMA_SUCCESS, and PlasmaInteger.

{
    /*
     * Validates the problem description, computes the cycle leaders with
     * GKK_getLeaderNbr() and dispatches plasma_pdshift either through the
     * static scheduler (cycles are assigned to threads here, then balanced
     * with GKK_BalanceLoad()) or through the dynamic scheduler.
     *
     * Returns PLASMA_SUCCESS on completion or on the quick-return path, and
     * the value of plasma_request_fail() when an argument check fails.
     */
    int *leaders = NULL;
    int ngrp, thrdbypb, thrdtot, nleaders;

    /* Check Plasma context */
    thrdtot  = PLASMA_SIZE;
    thrdbypb = PLASMA_GRPSIZE;
    ngrp     = thrdtot/thrdbypb;

    /* check input */
    if( (nprob * me * ne * L) != (m * n) ) {
        plasma_error(__func__, "problem size does not match matrix size");
        /*printf("m=%d, n=%d, nprob=%d, me=%d, ne=%d, L=%d\n", m, n, nprob, me, ne, L);*/
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( thrdbypb > thrdtot ) {
        plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( (thrdtot % thrdbypb) != 0 ) {
        plasma_error(__func__, "number of thread per problem must divide the total number of thread");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* quick return: nothing to shift, and `leaders` has not been allocated yet */
    if( (me < 2) || (ne < 2) || (nprob < 1) ) {
        return PLASMA_SUCCESS;
    }

    GKK_getLeaderNbr(me, ne, &nleaders, &leaders);
    /* leaders[] is walked in strides of 3 below; entry i+1 is used as the
     * cycle weight and entry i+2 receives the owning thread */
    nleaders *= 3;

    /* Static scheduling */
    if (PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING) {
        int *Tp = NULL;
        int i, ipb;
        int owner;

        /* Tp[t] accumulates the load assigned to thread t */
        Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger);
        for (i=0; i<thrdtot; i++)
            Tp[i] = 0;

        ipb = 0;

        /* First part with coarse parallelism */
        if (nprob > ngrp) {
            ipb = (nprob / ngrp)*ngrp;

            /* loop over leader */
            if (thrdbypb > 1) {
                for (i=0; i<nleaders; i+=3) {
                    /* assign this cycle to a thread */
                    owner = minloc(thrdbypb, Tp);

                    /* assign it to owner */
                    Tp[owner] = Tp[owner] + leaders[i+1] * L;
                    leaders[i+2] = owner;
                }

                GKK_BalanceLoad(thrdbypb, Tp, leaders, nleaders, L);
            }
            else {
                for (i=0; i<nleaders; i+=3) {
                    Tp[0] = Tp[0] + leaders[i+1] * L;
                    leaders[i+2] = 0;
                }
            }

            /* shift in parallel */
            for (i=0; i< (nprob/ngrp); i++) {
                plasma_static_call_9(plasma_pdshift,
                                     int, me,
                                     int, ne,
                                     int, L,
                                     double*, &(A[i*ngrp*me*ne*L]),
                                     int *, leaders,
                                     int, nleaders,
                                     int, thrdbypb,
                                     PLASMA_sequence*, sequence,
                                     PLASMA_request*, request);
            }
        }

        /* Second part with fine parallelism */
        if (ipb < nprob) {
            for (i=0; i<thrdtot; i++)
                Tp[i] = 0;

            if (thrdtot > 1) {
                /* loop over leader */
                for (i=0; i<nleaders; i+=3) {
                    /* assign this cycle to a thread */
                    owner = minloc(thrdtot, Tp);

                    /* assign it to owner */
                    Tp[owner] = Tp[owner] + leaders[i+1] * L;
                    leaders[i+2] = owner;
                }
                GKK_BalanceLoad(thrdtot, Tp, leaders, nleaders, L);
            }
            else {
                for (i=0; i<nleaders; i+=3) {
                    Tp[0] = Tp[0] + leaders[i+1] * L;
                    leaders[i+2] = 0;
                }
            }

            /* shift in parallel */
            for (i=ipb; i<nprob; i++) {
                plasma_static_call_9(plasma_pdshift,
                                     int, me,
                                     int, ne,
                                     int, L,
                                     double*, &(A[i*me*ne*L]),
                                     int *, leaders,
                                     int, nleaders,
                                     int, thrdtot,
                                     PLASMA_sequence*, sequence,
                                     PLASMA_request*, request);
            }
        }

        plasma_shared_free(plasma, Tp);
    }
    /* Dynamic scheduling */
    else {
        plasma_dynamic_call_9(plasma_pdshift,
                              int, me,
                              int, ne,
                              int, L,
                              double*, A,
                              int *, leaders,
                              int, nleaders,
                              int, nprob,
                              PLASMA_sequence*, sequence,
                              PLASMA_request*, request);
    }

    free(leaders);

    return PLASMA_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdbarrier_pnl2tl_quark ( PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Barrier from panels to tiles

Definition at line 61 of file pdbarrier.c.

References CORE_foo2_quark(), CORE_foo_quark(), INOUT, INPUT, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
    /*
     * For every tile column n, inserts one CORE_foo_quark task on tile
     * A(0, n) as INOUT, then one CORE_foo2_quark task per tile row m that
     * takes A(0, n) as INPUT and A(m, n) as INOUT. Nothing is inserted when
     * the sequence status is not PLASMA_SUCCESS. `request` is not used.
     * A(m, n) is a tile-address macro defined outside this listing.
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int m, n;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (n = 0; n < A.nt; n++)
    {
        /* Protection from previous GATHERV */
        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
                          sizeof(double)*A.mb*A.nb, A(0, n), INOUT,
                          0);

        for (m = 0; m < A.mt; m++)
        {
            QUARK_Insert_Task(plasma->quark, CORE_foo2_quark, &task_flags,
                              sizeof(double)*A.mb*A.nb, A(0, n), INPUT,
                              sizeof(double)*A.mb*A.nb, A(m, n), INOUT,
                              0);
        }
    }
}

Here is the call graph for this function:

void plasma_pdbarrier_row2tl_quark ( PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Barrier from rows to tiles

Definition at line 128 of file pdbarrier.c.

References CORE_foo2_quark(), CORE_foo_quark(), INOUT, INPUT, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
    /*
     * For every tile row m, inserts one CORE_foo_quark task on tile
     * A(m, 0) as INOUT, then one CORE_foo2_quark task per tile column n
     * that takes A(m, 0) as INPUT and A(m, n) as INOUT. Nothing is inserted
     * when the sequence status is not PLASMA_SUCCESS. `request` is not used.
     * A(m, n) is a tile-address macro defined outside this listing.
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int m, n;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++)
    {
        /* Protection from previous GATHERV */
        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
                          sizeof(double)*A.mb*A.nb, A(m, 0), INOUT,
                          0);

        for (n = 0; n < A.nt; n++)
        {
            QUARK_Insert_Task(plasma->quark, CORE_foo2_quark, &task_flags,
                              sizeof(double)*A.mb*A.nb, A(m, 0), INPUT,
                              sizeof(double)*A.mb*A.nb, A(m, n), INOUT,
                              0);
        }
    }
}

Here is the call graph for this function:

void plasma_pdbarrier_tl2pnl_quark ( PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Barrier from tiles to panels

Definition at line 25 of file pdbarrier.c.

References CORE_foo2_quark(), CORE_foo_quark(), GATHERV, INOUT, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
    /*
     * For every tile column n, inserts a CORE_foo_quark task on A(0, n)
     * (INOUT), then one CORE_foo2_quark task per tile row m that takes
     * A(0, n) as INOUT | GATHERV and A(m, n) as INOUT, and finally a second
     * CORE_foo_quark task on A(0, n) (INOUT). Nothing is inserted when the
     * sequence status is not PLASMA_SUCCESS. `request` is not used.
     * A(m, n) is a tile-address macro defined outside this listing.
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int m, n;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (n = 0; n < A.nt; n++)
    {
        /* Protection from previous GATHERV */
        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
                          sizeof(double)*A.mb*A.nb, A(0, n), INOUT,
                          0);

        for (m = 0; m < A.mt; m++)
        {
            QUARK_Insert_Task(plasma->quark, CORE_foo2_quark, &task_flags,
                              sizeof(double)*A.mb*A.nb, A(0, n), INOUT | GATHERV,
                              sizeof(double)*A.mb*A.nb, A(m, n), INOUT,
                              0);
        }

        /* Protection to next GATHERV */
        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
                          sizeof(double)*A.mb*A.nb, A(0, n), INOUT,
                          0);
    }
}

Here is the call graph for this function:

void plasma_pdbarrier_tl2row_quark ( PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Barrier from tiles to rows

Definition at line 92 of file pdbarrier.c.

References CORE_foo2_quark(), CORE_foo_quark(), GATHERV, INOUT, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
    /*
     * For every tile row m, inserts a CORE_foo_quark task on A(m, 0)
     * (INOUT), then one CORE_foo2_quark task per tile column n that takes
     * A(m, 0) as INOUT | GATHERV and A(m, n) as INOUT, and finally a second
     * CORE_foo_quark task on A(m, 0) (INOUT). Nothing is inserted when the
     * sequence status is not PLASMA_SUCCESS. `request` is not used.
     * A(m, n) is a tile-address macro defined outside this listing.
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int m, n;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++)
    {
        /* Protection from previous GATHERV */
        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
                          sizeof(double)*A.mb*A.nb, A(m, 0), INOUT,
                          0);

        for (n = 0; n < A.nt; n++)
        {
            QUARK_Insert_Task(plasma->quark, CORE_foo2_quark, &task_flags,
                              sizeof(double)*A.mb*A.nb, A(m, 0), INOUT | GATHERV,
                              sizeof(double)*A.mb*A.nb, A(m, n), INOUT,
                              0);
        }

        /* Protection to next GATHERV */
        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
                          sizeof(double)*A.mb*A.nb, A(m, 0), INOUT,
                          0);
    }
}

Here is the call graph for this function:

void plasma_pdgbrdb_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
double *  D,
double *  E,
PLASMA_desc  T,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Parallel Reduction from BAND Bidiagonal to the final condensed form - dynamic scheduler

Definition at line 26 of file pdgbrdb.c.

References A, C, DEP, plasma_desc_t::dtyp, plasma_desc_t::lm, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::n, plasma_context_self(), plasma_element_size(), plasma_sequence_flush(), plasma_shared_alloc(), plasma_shared_free(), PLASMA_SUCCESS, PlasmaInteger, PlasmaLower, PlasmaRealDouble, plasma_context_struct::quark, QUARK_Barrier(), QUARK_CORE_dbrdalg(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, TASK_SEQUENCE, TAU, and V.

{
    /*
     * Reduces the band matrix described by A (bandwidth NB = A.mb) to
     * bidiagonal form and stores the diagonal in D and the off-diagonal in E.
     *
     *   - MINMN == 0 : returns immediately.
     *   - NB == 0    : D receives the diagonal of A, E is zeroed.
     *   - NB == 1    : no bulge chasing; a lower bidiagonal is first turned
     *                  into an upper one, then D and E are copied out.
     *   - otherwise  : QUARK_CORE_dbrdalg tasks are inserted for the bulge
     *                  chasing, followed by a barrier and the copy to D/E.
     *
     * A(m, n) and DEP(m) are macros defined outside this listing. T is not
     * referenced in this body.
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static double dzero = (double) 0.0;
    double absztmp;
#endif
    static double zone  = (double) 1.0;
    static double zzero = (double) 0.0;
    double *C, *S;
    double ztmp, V, TAU;
    int M, N, NB, MINMN, INgrsiz, INthgrsiz, BAND;
    int myid, grsiz, shift=3, stt, st, ed, stind, edind;
    int blklastind, colpt, PCOL, ACOL, MCOL;
    int stepercol,mylastid,grnb,grid;
    int *DEP,*MAXID;
    int i, j, m;
    int thgrsiz, thgrnb, thgrid, thed;
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    M     = A.m;
    N     = A.n;
    NB    = A.mb;
    MINMN = min(M,N);

    /* Quick return */
    if ( MINMN == 0 ){
        return;
    }
    if ( NB == 0 ) {
        memset(D, 0, MINMN *sizeof(double));
        memset(E, 0, (MINMN-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<MINMN; i++)
            D[i] = fabs(*A(i,i));
#else
        for (i=0; i<MINMN; i++)
            D[i] = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge have to wait until
     * the reduction to band has been finish.
     * otherwise, I can remove this BARRIER when I integrate
     * the function dependencies link inside the reduction to
     * band. Keep in mind the case when NB=1, where no bulge-chasing.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
     * Make diagonal and superdiagonal elements real, storing them in
     * D and E. if PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real. For Q, PT: ZSCAL should be done
     * in case of WANTQ.
     */
    if ( NB == 1 ) {
        memset(D, 0, MINMN*sizeof(double));
        memset(E, 0, (MINMN-1)*sizeof(double));

        if (uplo == PlasmaLower) {
            for (i=0; i<(MINMN-1); i++)
            {
                /* generate Householder to annihilate a(i+1,i) and create a(i,i+1) */
                V = *A((i+1), i);
                *A((i+1), i) = zzero;
                LAPACKE_dlarfg_work( 2, A(i, i), &V, 1, &TAU);
                /* apply Left*/
                TAU = (TAU);
                ztmp = TAU*V;
                V = (V);
                *A(i, i+1) = - V * TAU * (*A(i+1, i+1));
                *A(i+1, i+1) = *(A(i+1, i+1)) * (zone - V * ztmp);
            }
        }
        /* PlasmaLower or PlasmaUpper, both are now upper */
        /* Make diagonal and superdiagonal elements real,
         * storing them in D and E
         */
#ifdef COMPLEX
        ztmp = zone;
        for (i=0; i<MINMN; i++)
        {
            ztmp = *A(i, i) * (ztmp);
            absztmp = fabs(ztmp);
            D[i] = absztmp; /* diag value */
            if(absztmp != dzero)
                ztmp = (double) (ztmp / absztmp);
            else
                ztmp = zone;
            if(i<(MINMN-1)) {
                ztmp = *A(i, (i+1)) * (ztmp);
                absztmp = fabs(ztmp);
                E[i] = absztmp; /* upper off-diag value */
                if(absztmp != dzero)
                    ztmp = (double) (ztmp / absztmp);
                else
                    ztmp = zone;
            }
        }
#else
        for (i=0; i < MINMN-1; i++) {
            D[i] = *A(i, i );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
#endif
        return;
    }

    /*
     * Case MINMN<NB ==> matrix is very small and better to call lapack ZGETRD.
     *
     * Use fact that one row of block is stored the same way than in LAPACK
     * Doesn't work if M > NB because of tile storage
     */
    /* NOTE(review): the test below is MINMN <= 0, not MINMN < NB as the
     * comment above says; MINMN == 0 has already returned. Confirm intent. */
    if ( MINMN <= 0 )
    {
        double *work, *taup, *tauq;
        int info, ldwork = N*N;
        work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble);
        taup = (double *) plasma_shared_alloc(plasma, MINMN, PlasmaRealDouble);
        tauq = (double *) plasma_shared_alloc(plasma, MINMN, PlasmaRealDouble);

        info = LAPACKE_dgebrd_work(LAPACK_COL_MAJOR, M, N,
                                   A(0,0), A.lm, D, E, taup, tauq, work, ldwork);

        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) taup);
        plasma_shared_free(plasma, (void*) tauq);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    DEP   = (int *) plasma_shared_alloc(plasma, MINMN+1, PlasmaInteger );
    MAXID = (int *) plasma_shared_alloc(plasma, MINMN+1, PlasmaInteger );
    C     = (double *) plasma_shared_alloc(plasma, MINMN, PlasmaRealDouble);
    S     = (double *) plasma_shared_alloc(plasma, MINMN, PlasmaRealDouble);
    memset(MAXID,0,(MINMN+1)*sizeof(int));

    /***************************************************************************
     * START BULGE CHASING CODE
     **************************************************************************/
    /*
     * Initialisation of local parameter. those parameter should be
     * input or tuned parameter.
     */
    INgrsiz = 1;
    if( NB > 160 ) {
        INgrsiz = 2;
    }
    else if( NB > 100 ) {
        if( MINMN < 5000 )
            INgrsiz = 2;
        else
            INgrsiz = 4;
    } else {
        INgrsiz = 6;
    }
    INthgrsiz = MINMN;
    BAND      = 0;

    grsiz   = INgrsiz;
    thgrsiz = INthgrsiz;
    if( grsiz == 0 ) grsiz = 6;
    if( thgrsiz == 0 ) thgrsiz = MINMN;

    /* ceiling divisions: steps per column and number of thread groups */
    i = shift/grsiz;
    stepercol = i*grsiz == shift ? i:i+1;

    i = (MINMN-2)/thgrsiz;
    thgrnb = i*thgrsiz == (MINMN-2) ? i:i+1;

    for (thgrid = 1; thgrid<=thgrnb; thgrid++){
        stt  = (thgrid-1)*thgrsiz+1;
        thed = min( (stt + thgrsiz -1), (MINMN-2));
        for (i = stt; i <= MINMN-2; i++){
            ed=min(i,thed);
            if(stt>ed)break;
            for (m = 1; m <=stepercol; m++){
                st=stt;
                for (j = st; j <=ed; j++){
                    /* PCOL: dependency on the ID of the master of the group of the previous column. (Previous Column:PCOL). */
                    /* ACOL: dependency on the ID of the master of the previous group of my column. (Acctual Column:ACOL). (it is 0(NULL) for myid=1) */
                    /* MCOL: OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
                    myid = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
                    mylastid = myid+grsiz-1;
                    PCOL = mylastid+shift-1; /* to know the dependent ID of the previous column. need to know the master of its group*/
                    MAXID[j] = myid;
                    PCOL = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
                    grnb = PCOL/grsiz;
                    grid = grnb*grsiz == PCOL ? grnb:grnb+1;
                    PCOL = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column.*/
                    ACOL = myid-grsiz;
                    if(myid==1)ACOL=0;
                    MCOL = myid;

                    QUARK_CORE_dbrdalg(
                        plasma->quark, &task_flags,
                        uplo, MINMN, NB,
                        &A, C, S, i, j, m, grsiz, BAND,
                        DEP(PCOL), DEP(ACOL), DEP(MCOL) );

                    if(mylastid%2 ==0){
                        blklastind = (mylastid/2)*NB+1+j-1;
                    }else{
                        colpt = ((mylastid+1)/2)*NB + 1 +j -1 ;
                        stind = colpt-NB+1;
                        edind = min(colpt,MINMN);
                        if( (stind>=edind-1) && (edind==MINMN) )
                            blklastind=MINMN;
                        else
                            blklastind=0;
                    }
                    if(blklastind >= (MINMN-1)) stt=stt+1;
                } /* END for j=st:ed */
            } /* END for m=1:stepercol */
        } /* END for i=1:MINMN-2 */
    } /* END for thgrid=1:thgrnb */

    /*
     * Barrier used only for now, to be sure that everything
     * is done before copying the D and E and free workspace.
     * this will be removed later when D and E are directly filled
     * during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    plasma_shared_free(plasma, (void*) DEP);
    plasma_shared_free(plasma, (void*) MAXID);
    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0, MINMN*sizeof(double));
    memset(E, 0, (MINMN-1)*sizeof(double));

    /*
     * If PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real.
     * For Q, PT: ZSCAL should be done in case of WANTQ.
     */
    if (uplo == PlasmaLower) {
        for (i=0; i<(MINMN-1); i++)
        {
            /* generate Householder to annihilate a(i+1,i) and create a(i,i+1)*/
            V = *A((i+1), i);
            *A((i+1), i) = zzero;
            LAPACKE_dlarfg_work( 2, A(i, i), &V, 1, &TAU);
            /* apply Left */
            TAU = (TAU);
            ztmp = TAU*V;
            V = (V);
            *A(i, (i+1)) = - V * TAU * (*A((i+1), (i+1)));
            *A((i+1), (i+1)) = (*A((i+1), (i+1))) * (zone - V * ztmp);
        }
    }
    /* PlasmaLower or PlasmaUpper, both are upper, now*/
    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In complex case, the element off diagonal element are
     * not necessary real and we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the ZLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     * */
#ifdef COMPLEX
    ztmp =zone;
    for (i=0; i < MINMN-1; i++) {
        D[i] = ( *A(i, i) );
        /*
         * Alternative for Householder case, all diag/superdiag
         * are real except the last diag and superdiag, where we
         * have to take the abs
         */
        if(i<(MINMN-2))
            E[i] = (*A(i, i+1));
        else
            E[i] = fabs( *A(i, i+1)); /* last upper value is complex */
    }
    D[i] = fabs( *A(i, i) );
#else
    for (i=0; i < MINMN-1; i++) {
        D[i] = *A(i, i );
        E[i] = *A(i, i+1);
    }
    D[i] = *A(i, i);
#endif
} /* END FUNCTION */

Here is the call graph for this function:

void plasma_pdgeadd ( plasma_context_t *  plasma)

Declarations of parallel functions (static scheduling) - alphabetical order

Definition at line 23 of file pdgeadd.c.

References A, B, BLKLDD, CORE_dgeadd(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_5, and plasma_sequence_t::status.

{
    /*
     * Static-scheduling worker: unpacks (alpha, A, B, sequence, request)
     * from the context and calls CORE_dgeadd on the tiles owned by this
     * thread. Tiles are enumerated with m running fastest; the thread
     * starts at linear index PLASMA_RANK and advances by PLASMA_SIZE.
     * Returns without doing anything when the sequence status is not
     * PLASMA_SUCCESS.
     * A(m, n) / B(m, n) are tile-address macros defined outside this listing.
     */
    double alpha;
    PLASMA_desc A;
    PLASMA_desc B;
    PLASMA_sequence *sequence;
    PLASMA_request *request;

    int X, Y;
    int m, n;
    int next_m;
    int next_n;
    int ldam, ldbm;

    plasma_unpack_args_5(alpha, A, B, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* locate this thread's first tile */
    n = 0;
    m = PLASMA_RANK;
    while (m >= A.mt && n < A.nt) {
        n++;
        m = m-A.mt;
    }

    while (n < A.nt) {
        /* compute the next owned tile before processing the current one */
        next_m = m;
        next_n = n;

        next_m += PLASMA_SIZE;
        while (next_m >= A.mt && next_n < A.nt) {
            next_n++;
            next_m = next_m-A.mt;
        }

        /* X is a row extent, so a full tile has A.mb rows (as in
         * plasma_pdgeadd_quark); A.nb is only correct for square tiles */
        X = m == A.mt-1 ? A.m-A.mb*m : A.mb;
        Y = n == A.nt-1 ? A.n-A.nb*n : A.nb;

        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(B, m);

        CORE_dgeadd(X, Y, alpha, A(m, n), ldam, B(m, n), ldbm);

        m = next_m;
        n = next_n;
    }
}

Here is the call graph for this function:

void plasma_pdgeadd_quark ( double  alpha,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Declarations of parallel functions (dynamic scheduling) - alphabetical order

Definition at line 72 of file pdgeadd.c.

References B, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_dgeadd(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
    /*
     * Dynamic-scheduling variant: inserts one QUARK_CORE_dgeadd task per
     * tile (m, n) of A, passing the tile extents X-by-Y, alpha, and the
     * A(m, n) / B(m, n) tile addresses with their leading dimensions.
     * Nothing is inserted when the sequence status is not PLASMA_SUCCESS.
     * `request` is not used. A(m, n) / B(m, n) are tile-address macros
     * defined outside this listing.
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int X, Y;
    int m, n;
    int ldam, ldbm;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < A.mt; m++) {
        /* the last tile row may be partial */
        X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(B, m);
        for (n = 0; n < A.nt; n++) {
            /* the last tile column may be partial */
            Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            QUARK_CORE_dgeadd(
                plasma->quark, &task_flags,
                X, Y, A.mb,
                alpha, A(m, n), ldam,
                B(m, n), ldbm);
        }
    }
}

Here is the call graph for this function:

void plasma_pdgelqf ( plasma_context_t plasma)

Parallel tile LQ factorization - static scheduling

Definition at line 24 of file pdgelqf.c.

References A, BLKLDD, CORE_dgelqt(), CORE_dormlq(), CORE_dtslqt(), CORE_dtsmlq(), plasma_desc_t::dtyp, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_IB, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_4, PlasmaRight, PlasmaTrans, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, plasma_sequence_t::status, and T.

{
/*
 * Body of plasma_pdgelqf (parallel tile LQ factorization, static scheduling).
 *
 * Each pass of the main loop handles one tile position (m, n) at step k.
 * The following position (next_m, next_n, next_k) is computed first: n is
 * advanced by one, and when a row of tiles is exhausted m is advanced by
 * PLASMA_SIZE, wrapping into the next step k when it runs past A.mt.
 *
 * NOTE(review): this listing looks truncated by the page extraction. There
 * is no declaration of A or T, no initial value for m, and four argument
 * lists have no callee. Check against pdgelqf.c before relying on it.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
int next_k;
int next_m;
int next_n;
int ldak, ldam;
int tempkm, tempkn, tempmm, tempnn;
int ib = PLASMA_IB;
double *work, *tau;
plasma_unpack_args_4(A, T, sequence, request);
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Scratch buffers; both are released at the end of the body. */
work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
tau = (double*)plasma_private_alloc(plasma, A.nb, A.dtyp);
ss_init(A.mt, A.nt, -1);
k = 0;
/* NOTE(review): m is read here but no assignment to it is visible above. */
while (m >= A.mt) {
k++;
m = m-A.mt+k;
}
n = k;
while (k < min(A.mt, A.nt) && m < A.mt) {
/* Work out the position handled by the next iteration. */
next_m = m;
next_n = n;
next_k = k;
next_n++;
if (next_n == A.nt) {
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_k < min(A.nt, A.mt)) {
next_k++;
next_m = next_m-A.mt+next_k;
}
next_n = next_k;
}
/* Tile sizes: the last tile row/column may be smaller than mb/nb. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldam = BLKLDD(A, m);
if (m == k) {
if (n == k) {
/* Diagonal tile (k, k). */
ss_cond_wait(k, k, k-1);
/* NOTE(review): callee not shown in this listing (presumably CORE_dgelqt) - confirm. */
tempkm, tempkn, ib,
A(k, k), ldak,
T(k, k), T.mb,
tau, work);
ss_cond_set(k, k, k);
}
else {
/* Tile (k, n) to the right of the diagonal in row k. */
ss_cond_wait(k, n, k-1);
/* NOTE(review): callee not shown in this listing (presumably CORE_dtslqt) - confirm. */
tempkm, tempnn, ib,
A(k, k), ldak,
A(k, n), ldak,
T(k, n), T.mb,
tau, work);
ss_cond_set(k, n, k);
}
}
else {
if (n == k) {
/* Tile (m, k) below the diagonal in column k. */
ss_cond_wait(k, k, k);
ss_cond_wait(m, k, k-1);
/* NOTE(review): callee and any leading arguments not shown (presumably CORE_dormlq) - confirm. */
tempmm, tempkn, tempkn, ib,
A(k, k), ldak,
T(k, k), T.mb,
A(m, k), ldam,
work, T.nb);
}
else {
/* Remaining tile (m, n), updated together with (m, k) and (k, n). */
ss_cond_wait(k, n, k);
ss_cond_wait(m, n, k-1);
/* NOTE(review): callee and any leading arguments not shown (presumably CORE_dtsmlq) - confirm. */
tempmm, A.nb, tempmm, tempnn, A.nb, ib,
A(m, k), ldam,
A(m, n), ldam,
A(k, n), ldak,
T(k, n), T.mb,
work, T.nb);
ss_cond_set(m, n, k);
}
}
m = next_m;
n = next_n;
k = next_k;
}
plasma_private_free(plasma, work);
plasma_private_free(plasma, tau);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdgelqf_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile LQ factorization - dynamic scheduling

Definition at line 137 of file pdgelqf.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaRight, PlasmaTrans, plasma_context_struct::quark, QUARK_CORE_dgelqt(), QUARK_CORE_dormlq(), QUARK_CORE_dtslqt(), QUARK_CORE_dtsmlq(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

{
/*
 * Body of plasma_pdgelqf_quark (parallel tile LQ factorization, dynamic
 * scheduling). For each step k it issues work on the diagonal tile (k, k),
 * then on the tiles (m, k) below it, then for every column n > k on tile
 * (k, n) followed by the tiles (m, n) below that.
 *
 * NOTE(review): this listing looks truncated by the page extraction.
 * `plasma` and `task_flags` are used without a visible declaration and the
 * four argument lists starting with `plasma->quark, &task_flags,` have no
 * callee. Check against pdgelqf.c.
 */
int k, m, n;
int ldak, ldam;
int tempkm, tempkn, tempmm, tempnn;
int ib;
plasma = plasma_context_self();
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
/* Tile sizes: the last tile row/column may be smaller than mb/nb. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
/* Diagonal tile (k, k). NOTE(review): callee not shown (presumably QUARK_CORE_dgelqt) - confirm. */
plasma->quark, &task_flags,
tempkm, tempkn, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
/* Tile (m, k). NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dormlq) - confirm. */
plasma->quark, &task_flags,
tempmm, tempkn, tempkn, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
A(m, k), ldam);
}
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Tile (k, n). NOTE(review): callee not shown (presumably QUARK_CORE_dtslqt) - confirm. */
plasma->quark, &task_flags,
tempkm, tempnn, ib, T.nb,
A(k, k), ldak,
A(k, n), ldak,
T(k, n), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
/* Tile (m, n). NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dtsmlq) - confirm. */
plasma->quark, &task_flags,
tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
A(m, k), ldam,
A(m, n), ldam,
A(k, n), ldak,
T(k, n), T.mb);
}
}
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdgelqfrh_quark ( PLASMA_desc  A,
PLASMA_desc  T,
int  BS,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile LQ factorization (reduction Householder) - dynamic scheduling

Definition at line 25 of file pdgelqfrh.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaRight, PlasmaTrans, plasma_context_struct::quark, QUARK_CORE_dgelqt(), QUARK_CORE_dormlq(), QUARK_CORE_dtslqt(), QUARK_CORE_dtsmlq(), QUARK_CORE_dttlqt(), QUARK_CORE_dttmlq(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, T2, and TASK_SEQUENCE.

{
/*
 * Body of plasma_pdgelqfrh_quark (parallel tile LQ factorization with
 * reduction Householder, dynamic scheduling).
 *
 * For each step k the tile columns are first processed in groups of BS
 * columns starting at N = k, k+BS, ... Then the group leaders are combined
 * pairwise at distance RD = BS, 2*BS, 4*BS, ... using the T2 descriptor.
 *
 * NOTE(review): this listing looks truncated by the page extraction.
 * `plasma` and `task_flags` are used without a visible declaration and the
 * six argument lists starting with `plasma->quark, &task_flags,` have no
 * callee. Check against pdgelqfrh.c.
 */
int k, m, n;
int N, RD;
int ldak, ldam;
int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn;
int ib;
plasma = plasma_context_self();
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
/* Phase 1: independent groups of BS tile columns, each led by column N. */
for (N = k; N < A.nt; N += BS) {
tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
tempkmin = min(tempkm, tempNn);
/* Leading tile (k, N). NOTE(review): callee not shown (presumably QUARK_CORE_dgelqt) - confirm. */
plasma->quark, &task_flags,
tempkm, tempNn, ib, T.nb,
A(k, N), ldak,
T(k, N), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
/* Tile (m, N). NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dormlq) - confirm. */
plasma->quark, &task_flags,
tempmm, tempNn, tempkmin, ib, T.nb,
A(k, N), ldak,
T(k, N), T.mb,
A(m, N), ldam);
}
for (n = N+1; n < min(N+BS, A.nt); n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Tile (k, n) inside the group. NOTE(review): callee not shown (presumably QUARK_CORE_dtslqt) - confirm. */
plasma->quark, &task_flags,
tempkm, tempnn, ib, T.nb,
A(k, N), ldak,
A(k, n), ldak,
T(k, n), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
/* Tile (m, n). NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dtsmlq) - confirm. */
plasma->quark, &task_flags,
tempmm, A.nb, tempmm, tempnn, tempkm, ib, T.nb,
A(m, N), ldam,
A(m, n), ldam,
A(k, n), ldak,
T(k, n), T.mb);
}
}
}
/* Phase 2: combine columns N and N+RD, doubling RD on each pass. */
for (RD = BS; RD < A.nt-k; RD *= 2) {
for (N = k; N+RD < A.nt; N += 2*RD) {
tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
/* NOTE(review): callee not shown (presumably QUARK_CORE_dttlqt) - confirm. */
plasma->quark, &task_flags,
tempkm, tempNRDn, ib, T.nb,
A (k, N ), ldak,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m );
/* NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dttmlq) - confirm. */
plasma->quark, &task_flags,
tempmm, A.nb, tempmm, tempNRDn, tempkm, ib, T.nb,
A (m, N ), ldam,
A (m, N+RD), ldam,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
}
}
}
}
}

Here is the call graph for this function:

void plasma_pdgemm ( plasma_context_t plasma)

Parallel tile matrix-matrix multiplication - static scheduling

Definition at line 24 of file pdgemm.c.

References A, B, BLKLDD, C, CORE_dgemm(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_9, PlasmaNoTrans, and plasma_sequence_t::status.

{
/*
 * Body of plasma_pdgemm (parallel tile matrix-matrix multiplication,
 * static scheduling).
 *
 * Each pass of the outer loop handles one tile C(m, n). The position is
 * advanced by PLASMA_SIZE along m, wrapping into the next tile column n
 * when it runs past C.mt. For that tile the inner loop runs over k and
 * uses `beta` for the first k and 1.0 (zone) for the following ones.
 *
 * NOTE(review): this listing looks truncated by the page extraction.
 * There is no declaration of A, B or C, no initial value for m, and the
 * four argument lists starting with `transA, transB,` have no callee
 * (presumably CORE_dgemm). Check against pdgemm.c.
 */
PLASMA_enum transA;
PLASMA_enum transB;
double alpha;
double beta;
PLASMA_sequence *sequence;
PLASMA_request *request;
int K, X, Y;
int k, m, n;
int next_m;
int next_n;
int ldam, ldak, ldbn, ldbk, ldcm;
double zbeta;
double zone = (double)1.0;
plasma_unpack_args_9(transA, transB, alpha, A, B, beta, C, sequence, request);
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
n = 0;
/* NOTE(review): m is read here but no assignment to it is visible above. */
while (m >= C.mt && n < C.nt) {
n++;
m = m-C.mt;
}
while (n < C.nt) {
/* Work out the tile handled by the next iteration. */
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= C.mt && next_n < C.nt) {
next_n++;
next_m = next_m - C.mt;
}
/* Size of C(m, n): the last tile row/column may be smaller than mb/nb. */
X = m == C.mt-1 ? C.m - m*C.mb : C.mb;
Y = n == C.nt-1 ? C.n - n*C.nb : C.nb;
ldcm = BLKLDD(C, m);
/*
* A: PlasmaNoTrans / B: PlasmaNoTrans
*/
if (transA == PlasmaNoTrans) {
ldam = BLKLDD(A, m);
if (transB == PlasmaNoTrans) {
for (k = 0; k < A.nt; k++) {
K = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldbk = BLKLDD(B, k);
/* beta is applied only on the first k; later passes use 1.0. */
zbeta = k == 0 ? beta : zone;
transA, transB,
X, Y, K,
alpha, A(m, k), ldam,
B(k, n), ldbk,
zbeta, C(m, n), ldcm);
}
}
/*
* A: PlasmaNoTrans / B: Plasma[Conj]Trans
*/
else {
ldbn = BLKLDD(B, n);
for (k = 0; k < A.nt; k++) {
K = k == A.nt-1 ? A.n-k*A.nb : A.nb;
zbeta = k == 0 ? beta : zone;
transA, transB,
X, Y, K,
alpha, A(m, k), ldam,
B(n, k), ldbn,
zbeta, C(m, n), ldcm);
}
}
}
/*
* A: Plasma[Conj]Trans / B: PlasmaNoTrans
*/
else {
if (transB == PlasmaNoTrans) {
for (k = 0; k < A.mt; k++) {
K = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
zbeta = k == 0 ? beta : zone;
transA, transB,
X, Y, K,
alpha, A(k, m), ldak,
B(k, n), ldbk,
zbeta, C(m, n), ldcm);
}
}
/*
* A: Plasma[Conj]Trans / B: Plasma[Conj]Trans
*/
else {
ldbn = BLKLDD(B, n);
for (k = 0; k < A.mt; k++) {
K = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
zbeta = k == 0 ? beta : zone;
transA, transB,
X, Y, K,
alpha, A(k, m), ldak,
B(n, k), ldbn,
zbeta, C(m, n), ldcm);
}
}
}
m = next_m;
n = next_n;
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdgemm_quark ( PLASMA_enum  transA,
PLASMA_enum  transB,
double  alpha,
PLASMA_desc  A,
PLASMA_desc  B,
double  beta,
PLASMA_desc  C,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile matrix-matrix multiplication - dynamic scheduling

Definition at line 149 of file pdgemm.c.

References B, BLKLDD, C, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaNoTrans, plasma_context_struct::quark, QUARK_CORE_dgemm(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
    /*
     * Body of plasma_pdgemm_quark (parallel tile matrix-matrix
     * multiplication, dynamic scheduling).
     *
     * For every tile C(m, n) it submits one QUARK_CORE_dgemm task per inner
     * index k. Which tiles of A and B are paired depends on transA/transB.
     * `beta` is passed for the first k only; later tasks on the same C tile
     * receive 1.0 (zone). Nothing is submitted when the sequence status is
     * not PLASMA_SUCCESS.
     *
     * The extracted listing had lost the declarations of `plasma` and
     * `task_flags` and the name of the routine receiving the four argument
     * lists. They are restored here from the identifiers this function is
     * documented as referencing (QUARK_CORE_dgemm(),
     * Quark_Task_Flags_Initializer).
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

    int m, n, k;
    int ldam, ldak, ldbn, ldbk, ldcm;
    int tempmm, tempnn, tempkn, tempkm;

    double zbeta;
    double zone = (double)1.0;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < C.mt; m++) {
        /* The last tile row may hold fewer than C.mb rows. */
        tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
        ldcm = BLKLDD(C, m);
        for (n = 0; n < C.nt; n++) {
            /* The last tile column may hold fewer than C.nb columns. */
            tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;
            /*
             *  A: PlasmaNoTrans / B: PlasmaNoTrans
             */
            if (transA == PlasmaNoTrans) {
                ldam = BLKLDD(A, m);
                if (transB == PlasmaNoTrans) {
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_dgemm(
                            plasma->quark, &task_flags,
                            transA, transB,
                            tempmm, tempnn, tempkn, A.mb,
                            alpha, A(m, k), ldam,  /* lda * Z */
                                   B(k, n), ldbk,  /* ldb * Y */
                            zbeta, C(m, n), ldcm); /* ldc * Y */
                    }
                }
                /*
                 *  A: PlasmaNoTrans / B: Plasma[Conj]Trans
                 */
                else {
                    ldbn = BLKLDD(B, n);
                    for (k = 0; k < A.nt; k++) {
                        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_dgemm(
                            plasma->quark, &task_flags,
                            transA, transB,
                            tempmm, tempnn, tempkn, A.mb,
                            alpha, A(m, k), ldam,  /* lda * Z */
                                   B(n, k), ldbn,  /* ldb * Z */
                            zbeta, C(m, n), ldcm); /* ldc * Y */
                    }
                }
            }
            /*
             *  A: Plasma[Conj]Trans / B: PlasmaNoTrans
             */
            else {
                if (transB == PlasmaNoTrans) {
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        ldbk = BLKLDD(B, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_dgemm(
                            plasma->quark, &task_flags,
                            transA, transB,
                            tempmm, tempnn, tempkm, A.mb,
                            alpha, A(k, m), ldak,  /* lda * X */
                                   B(k, n), ldbk,  /* ldb * Y */
                            zbeta, C(m, n), ldcm); /* ldc * Y */
                    }
                }
                /*
                 *  A: Plasma[Conj]Trans / B: Plasma[Conj]Trans
                 */
                else {
                    ldbn = BLKLDD(B, n);
                    for (k = 0; k < A.mt; k++) {
                        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
                        ldak = BLKLDD(A, k);
                        zbeta = k == 0 ? beta : zone;
                        QUARK_CORE_dgemm(
                            plasma->quark, &task_flags,
                            transA, transB,
                            tempmm, tempnn, tempkm, A.mb,
                            alpha, A(k, m), ldak,  /* lda * X */
                                   B(n, k), ldbn,  /* ldb * Z */
                            zbeta, C(m, n), ldcm); /* ldc * Y */
                    }
                }
            }
        }
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdgeqrf ( plasma_context_t plasma)

Parallel tile QR factorization - static scheduling

Definition at line 24 of file pdgeqrf.c.

References A, BLKLDD, CORE_dgeqrt(), CORE_dormqr(), CORE_dtsmqr(), CORE_dtsqrt(), plasma_desc_t::dtyp, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_IB, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_4, PlasmaLeft, PlasmaTrans, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, plasma_sequence_t::status, and T.

{
/*
 * Body of plasma_pdgeqrf (parallel tile QR factorization, static scheduling).
 *
 * Each pass of the main loop handles one tile position (m, n) at step k.
 * The following position (next_m, next_n, next_k) is computed first: m is
 * advanced by one, and when a column of tiles is exhausted n is advanced
 * by PLASMA_SIZE, wrapping into the next step k when it runs past A.nt.
 *
 * NOTE(review): this listing looks truncated by the page extraction. There
 * is no declaration of A or T, no initial value for n, and four argument
 * lists have no callee. Check against pdgeqrf.c before relying on it.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
int next_k;
int next_m;
int next_n;
int ldak, ldam;
int tempkm, tempkn, tempnn, tempmm;
int ib = PLASMA_IB;
double *work, *tau;
plasma_unpack_args_4(A, T, sequence, request);
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Scratch buffers; both are released at the end of the body. */
work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
tau = (double*)plasma_private_alloc(plasma, A.nb, A.dtyp);
ss_init(A.mt, A.nt, -1);
k = 0;
/* NOTE(review): n is read here but no assignment to it is visible above. */
while (n >= A.nt) {
k++;
n = n-A.nt+k;
}
m = k;
while (k < min(A.mt, A.nt) && n < A.nt) {
/* Work out the position handled by the next iteration. */
next_n = n;
next_m = m;
next_k = k;
next_m++;
if (next_m == A.mt) {
next_n += PLASMA_SIZE;
while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
next_k++;
next_n = next_n-A.nt+next_k;
}
next_m = next_k;
}
/* Tile sizes: the last tile row/column may be smaller than mb/nb. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldak = BLKLDD(A, k);
ldam = BLKLDD(A, m);
if (n == k) {
if (m == k) {
/* Diagonal tile (k, k). */
ss_cond_wait(k, k, k-1);
/* NOTE(review): callee not shown in this listing (presumably CORE_dgeqrt) - confirm. */
tempkm, tempkn, ib,
A(k, k), ldak,
T(k, k), T.mb,
tau, work);
ss_cond_set(k, k, k);
}
else {
/* Tile (m, k) below the diagonal in column k. */
ss_cond_wait(m, k, k-1);
/* NOTE(review): callee not shown in this listing (presumably CORE_dtsqrt) - confirm. */
tempmm, tempkn, ib,
A(k, k), ldak,
A(m, k), ldam,
T(m, k), T.mb,
tau, work);
ss_cond_set(m, k, k);
}
}
else {
if (m == k) {
/* Tile (k, n) to the right of the diagonal in row k. */
ss_cond_wait(k, k, k);
ss_cond_wait(k, n, k-1);
/* NOTE(review): callee and any leading arguments not shown (presumably CORE_dormqr) - confirm. */
tempkm, tempnn, tempkm, ib,
A(k, k), ldak,
T(k, k), T.mb,
A(k, n), ldak,
work, T.nb);
}
else {
/* Remaining tile (m, n), updated together with (k, n) and (m, k). */
ss_cond_wait(m, k, k);
ss_cond_wait(m, n, k-1);
/* NOTE(review): callee and any leading arguments not shown (presumably CORE_dtsmqr) - confirm. */
A.nb, tempnn, tempmm, tempnn, A.nb, ib,
A(k, n), ldak,
A(m, n), ldam,
A(m, k), ldam,
T(m, k), T.mb,
work, ib);
ss_cond_set(m, n, k);
}
}
n = next_n;
m = next_m;
k = next_k;
}
plasma_private_free(plasma, work);
plasma_private_free(plasma, tau);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdgeqrf_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile QR factorization - dynamic scheduling

Definition at line 137 of file pdgeqrf.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaLeft, PlasmaTrans, plasma_context_struct::quark, QUARK_CORE_dgeqrt(), QUARK_CORE_dormqr(), QUARK_CORE_dtsmqr(), QUARK_CORE_dtsqrt(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

{
/*
 * Body of plasma_pdgeqrf_quark (parallel tile QR factorization, dynamic
 * scheduling). For each step k it issues work on the diagonal tile (k, k),
 * then on the tiles (k, n) to its right, then for every row m > k on tile
 * (m, k) followed by the tiles (m, n) to the right of that.
 *
 * NOTE(review): this listing looks truncated by the page extraction.
 * `plasma` and `task_flags` are used without a visible declaration, the
 * four argument lists starting with `plasma->quark, &task_flags,` have no
 * callee, and the braces do not balance. Check against pdgeqrf.c.
 */
int k, m, n;
int ldak, ldam;
int tempkm, tempkn, tempnn, tempmm;
int ib;
plasma = plasma_context_self();
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
/* Tile sizes: the last tile row/column may be smaller than mb/nb. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
/* Diagonal tile (k, k). NOTE(review): callee not shown (presumably QUARK_CORE_dgeqrt) - confirm. */
plasma->quark, &task_flags,
tempkm, tempkn, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Tile (k, n). NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dormqr) - confirm. */
plasma->quark, &task_flags,
tempkm, tempnn, tempkm, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
A(k, n), ldak);
}
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
/* Tile (m, k). NOTE(review): callee not shown (presumably QUARK_CORE_dtsqrt) - confirm. */
plasma->quark, &task_flags,
tempmm, tempkn, ib, T.nb,
A(k, k), ldak,
A(m, k), ldam,
T(m, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Tile (m, n). NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dtsmqr) - confirm. */
plasma->quark, &task_flags,
A.mb, tempnn, tempmm, tempnn, A.nb, ib, T.nb,
A(k, n), ldak,
A(m, n), ldam,
A(m, k), ldam,
T(m, k), T.mb);
}
}
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdgeqrfrh_quark ( PLASMA_desc  A,
PLASMA_desc  T,
int  BS,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile QR factorization (reduction Householder) - dynamic scheduling

Definition at line 25 of file pdgeqrfrh.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaLeft, PlasmaTrans, plasma_context_struct::quark, QUARK_CORE_dgeqrt(), QUARK_CORE_dormqr(), QUARK_CORE_dtsmqr(), QUARK_CORE_dtsqrt(), QUARK_CORE_dttmqr(), QUARK_CORE_dttqrt(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, T2, and TASK_SEQUENCE.

{
/*
 * Body of plasma_pdgeqrfrh_quark (parallel tile QR factorization with
 * reduction Householder, dynamic scheduling).
 *
 * For each step k the tile rows are first processed in groups of BS rows
 * starting at M = k, k+BS, ... Then the group leaders are combined
 * pairwise at distance RD = BS, 2*BS, 4*BS, ... using the T2 descriptor.
 *
 * NOTE(review): this listing looks truncated by the page extraction.
 * `plasma` and `task_flags` are used without a visible declaration and the
 * six argument lists starting with `plasma->quark, &task_flags,` have no
 * callee. Check against pdgeqrfrh.c.
 */
int k, m, n;
int M, RD;
int ldaM, ldam, ldaMRD;
int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm;
int ib;
plasma = plasma_context_self();
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* Phase 1: independent groups of BS tile rows, each led by row M. */
for (M = k; M < A.mt; M += BS) {
tempMm = M == A.mt-1 ? A.m-M*A.mb : A.mb;
tempkmin = min(tempMm, tempkn);
ldaM = BLKLDD(A, M);
/* Leading tile (M, k). NOTE(review): callee not shown (presumably QUARK_CORE_dgeqrt) - confirm. */
plasma->quark, &task_flags,
tempMm, tempkn, ib, T.nb,
A(M, k), ldaM,
T(M, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Tile (M, n). NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dormqr) - confirm. */
plasma->quark, &task_flags,
tempMm, tempnn, tempkmin, ib, T.nb,
A(M, k), ldaM,
T(M, k), T.mb,
A(M, n), ldaM);
}
for (m = M+1; m < min(M+BS, A.mt); m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
/* Tile (m, k) inside the group. NOTE(review): callee not shown (presumably QUARK_CORE_dtsqrt) - confirm. */
plasma->quark, &task_flags,
tempmm, tempkn, ib, T.nb,
A(M, k), ldaM,
A(m, k), ldam,
T(m, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Tile (m, n). NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dtsmqr) - confirm. */
plasma->quark, &task_flags,
A.nb, tempnn, tempmm, tempnn, A.nb, ib, T.nb,
A(M, n), ldaM,
A(m, n), ldam,
A(m, k), ldam,
T(m, k), T.mb);
}
}
}
/* Phase 2: combine rows M and M+RD, doubling RD on each pass. */
for (RD = BS; RD < A.mt-k; RD *= 2) {
for (M = k; M+RD < A.mt; M += 2*RD) {
tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
ldaM = BLKLDD(A, M );
ldaMRD = BLKLDD(A, M+RD);
/* NOTE(review): callee not shown (presumably QUARK_CORE_dttqrt) - confirm. */
plasma->quark, &task_flags,
tempMRDm, tempkn, ib, T.nb,
A (M , k), ldaM,
A (M+RD, k), ldaMRD,
T2(M+RD, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* NOTE(review): callee and any leading arguments not shown (presumably QUARK_CORE_dttmqr) - confirm. */
plasma->quark, &task_flags,
A.nb, tempnn, tempMRDm, tempnn, A.nb, ib, T.nb,
A (M, n), ldaM,
A (M+RD, n), ldaMRD,
A (M+RD, k), ldaMRD,
T2(M+RD, k), T.mb);
}
}
}
}
}

Here is the call graph for this function:

void plasma_pdgerbb ( plasma_context_t plasma)

Parallel tile BAND Bidiagonal Reduction - static scheduling. Could be optimized by using the algorithms from Trefethen's book.

WARNING: never call this function, because ormqr and ormlq do not implement all the cases required by the static scheduler.

Definition at line 26 of file pdgerbb.c.

References A, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_desc_submatrix(), plasma_pdgelqf(), plasma_pdgeqrf(), plasma_pdormlq(), plasma_pdormqr(), plasma_static_call_4, plasma_static_call_7, PLASMA_SUCCESS, plasma_unpack_args_4, PlasmaLeft, PlasmaRight, PlasmaTrans, plasma_sequence_t::status, and T.

{
/*
 * Body of plasma_pdgerbb (parallel tile band bidiagonal reduction).
 *
 * When A.m >= A.n the loop runs over the A.nt tile columns, otherwise over
 * the A.mt tile rows. In both branches every step k builds sub-descriptors
 * of A and T with plasma_desc_submatrix() and passes them on in groups of
 * two (A, T) or three (A, A, T) descriptors plus sequence and request.
 *
 * NOTE(review): this listing looks truncated by the page extraction. There
 * is no declaration of A or T, and every argument group starting with
 * `PLASMA_desc, plasma_desc_submatrix(` has lost its opening lines. The
 * page lists plasma_static_call_4/_7 and plasma_pdgeqrf, plasma_pdormqr,
 * plasma_pdgelqf and plasma_pdormlq as referenced - confirm the pairing
 * against pdgerbb.c.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int k;
int tempkm, tempkn;
plasma_unpack_args_4(A, T, sequence, request);
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
if (A.m >= A.n){
for (k = 0; k < A.nt; k++) {
/* Tile sizes: the last tile row/column may be smaller than mb/nb. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* Column panel k of A (rows from k*A.mb down, tempkn columns wide) with the matching part of T. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
/* Same panel, plus the part of A to its right. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, A.m-k*A.mb, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
if (k+1 < A.nt){
tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
/* Row panel k of A starting at tile column k+1 (tempkm rows high) with the matching part of T. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, (k+1)*T.nb, tempkm, T.n-(k+1)*T.nb),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
/* Same row panel, plus the part of A below it. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, (k+1)*T.nb, tempkm, T.n-(k+1)*T.nb),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
}
}
}
else{
for (k = 0; k < A.mt; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* Row panel k of A (columns from k*A.nb on, tempkm rows high) with the matching part of T. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, k*T.nb, tempkm, T.n-k*T.nb),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
/* Same row panel, plus the part of A below it. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, A.n-k*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, k*T.nb, tempkm, T.n-k*T.nb),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
if (k+1 < A.mt){
tempkm = k+1 == A.mt-1 ? A.m-(k+1)*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* Column panel k of A starting at tile row k+1 (tempkn columns wide) with the matching part of T. */
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
PLASMA_desc, plasma_desc_submatrix(T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
/* Same column panel, plus the part of A to its right. */
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
}
}
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdgerbb_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile BAND Bidiagonal Reduction - dynamic scheduler. Could be optimized by using the algorithms from Trefethen's book.

Definition at line 127 of file pdgerbb.c.

References plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_desc_submatrix(), plasma_pdgelqf_quark(), plasma_pdgeqrf_quark(), plasma_pdormlq_quark(), plasma_pdormqr_quark(), PlasmaLeft, PlasmaRight, and PlasmaTrans.

{
/*
 * Body of plasma_pdgerbb_quark (parallel tile band bidiagonal reduction,
 * dynamic scheduler).
 *
 * When A.m >= A.n the loop runs over the A.nt tile columns, otherwise over
 * the A.mt tile rows. In both branches every step k builds sub-descriptors
 * of A and T with plasma_desc_submatrix() and passes them on in groups of
 * two (A, T) or three (A, A, T) descriptors plus sequence and request.
 *
 * NOTE(review): this listing looks truncated by the page extraction. Every
 * argument group starting with `plasma_desc_submatrix(` has lost its
 * callee and any leading arguments. The page lists plasma_pdgeqrf_quark,
 * plasma_pdormqr_quark, plasma_pdgelqf_quark and plasma_pdormlq_quark as
 * referenced - confirm the pairing against pdgerbb.c.
 */
int k;
int tempkm, tempkn;
if (A.m >= A.n){
for (k = 0; k < A.nt; k++) {
/* Tile sizes: the last tile row/column may be smaller than mb/nb. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* Column panel k of A (rows from k*A.mb down, tempkn columns wide) with the matching part of T. */
plasma_desc_submatrix(A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
plasma_desc_submatrix(T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
sequence, request);
/* Same panel, plus the part of A to its right. */
plasma_desc_submatrix(A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, A.m-k*A.mb, A.n-(k+1)*A.nb),
plasma_desc_submatrix(T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
sequence, request);
if (k+1 < A.nt){
tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
/* Row panel k of A starting at tile column k+1 (tempkm rows high) with the matching part of T. */
plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
plasma_desc_submatrix(T, k*T.mb, (k+1)*T.nb, tempkm, T.n-(k+1)*T.nb),
sequence, request);
/* Same row panel, plus the part of A below it. */
plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
plasma_desc_submatrix(A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
plasma_desc_submatrix(T, k*T.mb, (k+1)*T.nb, tempkm, T.n-(k+1)*T.nb),
sequence, request);
}
}
}
else{
for (k = 0; k < A.mt; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* Row panel k of A (columns from k*A.nb on, tempkm rows high) with the matching part of T. */
plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
plasma_desc_submatrix(T, k*T.mb, k*T.nb, tempkm, T.n-k*T.nb),
sequence, request);
/* Same row panel, plus the part of A below it. */
plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, A.n-k*A.nb),
plasma_desc_submatrix(T, k*T.mb, k*T.nb, tempkm, T.n-k*T.nb),
sequence, request);
if (k+1 < A.mt){
tempkm = k+1 == A.mt-1 ? A.m-(k+1)*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* Column panel k of A starting at tile row k+1 (tempkn columns wide) with the matching part of T. */
plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
plasma_desc_submatrix(T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
sequence, request);
/* Same column panel, plus the part of A to its right. */
plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
plasma_desc_submatrix(A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
plasma_desc_submatrix(T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
sequence, request);
}
}
}
}

Here is the call graph for this function:

void plasma_pdgerbbrh_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence sequence,
PLASMA_request request 
)
void plasma_pdgerbh_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence sequence,
PLASMA_request request 
)
void plasma_pdgetmi2 ( plasma_context_t plasma)

plasma_pdgetmi2 - performs nprob independent transpositions. Each subproblem is a tile of mb-by-nb elements. This function uses an extra space of PLASMA_SIZE*(mb*nb).

Parameters:
[in] plasma: PLASMA context to which this call belongs.
See also:
plasma_pdgetmi2_quark

Definition at line 40 of file pdgetmi2.c.

References A, CORE_dgetrip(), plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_10, PlasmaRealDouble, plasma_sequence_t::status, and storev.

{
    /*
     * Body of plasma_pdgetmi2 (static scheduling).
     *
     * Unpacks the ten call arguments, then lets this thread process every
     * PLASMA_SIZE-th tile of A, starting at tile PLASMA_RANK, by calling
     * CORE_dgetrip on it. The tiles are consecutive runs of mb*nb doubles
     * in A, and one private mb*nb scratch buffer is shared by all of this
     * thread's calls.
     */
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    PLASMA_enum idep, odep, storev;
    double *A;
    double *scratch;
    int m, n, mb, nb;
    int stride, tile_len, tile_count, t;

    plasma_unpack_args_10(idep, odep, storev, m, n, mb, nb, A, sequence, request);

    /* Skip the work when the sequence status is not PLASMA_SUCCESS. */
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Quick return when a tile has fewer than two rows or columns. */
    if (mb < 2 || nb < 2)
        return;

    stride     = PLASMA_SIZE;
    tile_len   = mb*nb;
    tile_count = (m / mb) * (n / nb);

    scratch = (double*)plasma_private_alloc(plasma, mb*nb, PlasmaRealDouble);

    t = PLASMA_RANK;
    while (t < tile_count) {
        CORE_dgetrip(mb, nb, A + t * tile_len, scratch);
        t += stride;
    }

    plasma_private_free(plasma, scratch);
}

Here is the call graph for this function:

void plasma_pdgetmi2_quark ( PLASMA_enum  idep,
PLASMA_enum  odep,
PLASMA_enum  storev,
int  m,
int  n,
int  mb,
int  nb,
double *  A,
PLASMA_sequence sequence,
PLASMA_request request 
)

plasma_pdgetmi2_quark - performs nprob independent transpositions. Each subproblem is a tile of mb-by-nb elements. This function uses an extra space of PLASMA_SIZE*(mb*nb). This is a maximum in the case of dynamic scheduling.

Parameters:
[in] idep: PlasmaIPT_Nodep: No fake dependencies are added. PlasmaIPT_Panel: A gatherv is added on each panel and panel size is m*nb. PlasmaIPT_All: A gatherv is added on the whole matrix.
[in] odep: PlasmaIPT_Nodep: No fake dependencies are added. PlasmaIPT_Panel: A gatherv is added on each panel and panel size is m*nb. PlasmaIPT_All: A gatherv is added on the whole matrix.
[in] storev: PlasmaColumnWise: Data stored in column major. PlasmaRowWise: Data stored in row major.
[in] m: Number of rows of A if tiles are sorted in column major format, number of columns otherwise.
[in] n: Number of columns of A if tiles are sorted in column major format, number of rows otherwise.
[in] mb: Number of rows in each individual subproblem if storev == PlasmaColumnWise, number of columns otherwise. m%mb must be 0.
[in] nb: Number of columns in each individual subproblem if storev == PlasmaColumnWise, number of rows otherwise. n%nb must be 0.
[in,out] A: Matrix of size m*n.
[in] sequence: Identifies the sequence of function calls that this call belongs to (for completion checks and exception handling purposes).
[out] request: Identifies this function call (for exception handling purposes).
See also:
plasma_pdgetmi2

Definition at line 128 of file pdgetmi2.c.

References GATHERV, INOUT, INPUT, plasma_context_self(), PLASMA_SUCCESS, PlasmaColumnwise, PlasmaIPT_All, PlasmaIPT_NoDep, PlasmaIPT_Panel, plasma_context_struct::quark, QUARK_CORE_dgetrip(), QUARK_CORE_dgetrip_f1(), QUARK_CORE_dgetrip_f2(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Body of plasma_pdgetmi2_quark (dynamic scheduling).
 *
 * After the early returns it computes the tile size (bsiz), panel size
 * (psiz), tile counts (mt, nt) and matrix size (size), then selects on
 * idep and odep how the per-tile tasks are submitted: with no extra
 * dependency, with one extra dependency (the _f1 form) or with two (the
 * _f2 form), on either the enclosing panel Ap or the whole matrix A.
 *
 * NOTE(review): this listing looks badly truncated by the page extraction.
 * `plasma` and `task_flags` have no visible declaration, the `if` that
 * opens the psiz/mt/nt branch is missing, all `case` labels of the three
 * nested switches are missing, and several argument lists have no callee.
 * Check against pdgetmi2.c before relying on the control flow shown here.
 */
double *Al, *Ap;
int i, j, nprob, mt, nt;
int bsiz, psiz, size;
plasma = plasma_context_self();
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* quick return */
if( (mb < 2) || (nb < 2) ) {
return ;
}
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
bsiz = mb*nb;
/* NOTE(review): the condition opening this branch is not shown (presumably a test on storev) - confirm. */
psiz = m*nb;
mt = ( m / mb );
nt = ( n / nb );
} else {
psiz = n*mb;
mt = ( n / nb );
nt = ( m / mb );
}
size = m*n;
switch ( idep ) {
/*
* Dependencies on each panel as input
*/
/* NOTE(review): the case labels of this inner switch are not shown in the listing. */
switch ( odep ) {
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
/* NOTE(review): callee not shown (presumably QUARK_CORE_dgetrip_f1) - confirm. */
plasma->quark, &task_flags,
mb, nb, Al, bsiz,
Ap, psiz, INOUT|GATHERV);
}
}
break;
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
QUARK_CORE_dgetrip_f2(plasma->quark, &task_flags,
mb, nb, Al, bsiz,
Ap, size, INPUT,
A, size, INOUT|GATHERV);
}
}
break;
default:
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
/* NOTE(review): callee not shown (presumably QUARK_CORE_dgetrip_f1) - confirm. */
plasma->quark, &task_flags,
mb, nb, Al, bsiz,
Ap, psiz, INPUT);
}
}
}
break;
/*
* Dependency on all the matrix as input
*/
/* NOTE(review): the case labels of this inner switch are not shown in the listing. */
switch ( odep ) {
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
/* NOTE(review): callee not shown (presumably QUARK_CORE_dgetrip_f2) - confirm. */
plasma->quark, &task_flags,
mb, nb, Al, bsiz,
A, size, INPUT,
Ap, psiz, INOUT|GATHERV);
}
}
break;
nprob = mt*nt;
for (i=0; i<nprob; i++) {
QUARK_CORE_dgetrip_f1(plasma->quark, &task_flags,
mb, nb, &(A[ i*bsiz ]), bsiz,
A, size, INOUT|GATHERV);
}
break;
default:
nprob = mt*nt;
for (i=0; i<nprob; i++) {
QUARK_CORE_dgetrip_f1(plasma->quark, &task_flags,
mb, nb, &(A[ i*bsiz ]), bsiz,
A, size, INPUT);
}
}
break;
/*
* No Dependencies as input
*/
default:
/* NOTE(review): the case labels of this inner switch are not shown in the listing. */
switch ( odep ) {
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
/* NOTE(review): callee not shown (presumably QUARK_CORE_dgetrip_f1) - confirm. */
plasma->quark, &task_flags,
mb, nb, Al, bsiz,
Ap, psiz, INOUT|GATHERV);
}
}
break;
nprob = mt*nt;
for (i=0; i<nprob; i++) {
QUARK_CORE_dgetrip_f1(plasma->quark, &task_flags,
mb, nb, &(A[ i*bsiz ]), bsiz,
A, size, INOUT|GATHERV);
}
break;
default:
nprob = mt*nt;
for (i=0; i<nprob; i++) {
QUARK_CORE_dgetrip(plasma->quark, &task_flags,
mb, nb, &(A[ i*bsiz ]), bsiz);
}
}
}
}

Here is the call graph for this function:

void plasma_pdgetrf_incpiv ( plasma_context_t plasma)

Parallel tile LU factorization - static scheduling

Definition at line 25 of file pdgetrf_incpiv.c.

References A, BLKLDD, CORE_dgessm(), CORE_dgetrf_incpiv(), CORE_dssssm(), CORE_dtstrf(), plasma_desc_t::dtyp, IPIV, L, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_IB, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, plasma_request_fail(), PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_5, ss_abort, ss_aborted, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, and plasma_sequence_t::status.

{
/*
 * Body of plasma_pdgetrf_incpiv (parallel tile LU factorization, static
 * scheduling).
 *
 * Each pass of the main loop handles one tile position (m, n) at step k.
 * The following position (next_m, next_n, next_k) is computed first: m is
 * advanced by one, and when a column of tiles is exhausted n is advanced
 * by PLASMA_SIZE, wrapping into the next step k when it runs past A.nt.
 * The loop also stops once ss_aborted() reports true.
 *
 * NOTE(review): this listing looks truncated by the page extraction. There
 * is no declaration of A or L, no initial value for n, and four argument
 * lists have no callee. Check against pdgetrf_incpiv.c.
 */
int *IPIV;
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
int next_k;
int next_m;
int next_n;
int ldak, ldam;
int info;
int tempkn, tempkm, tempmm, tempnn;
int ib = PLASMA_IB;
double *work;
plasma_unpack_args_5(A, L, IPIV, sequence, request);
/* Nothing to do when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Scratch buffer; released at the end of the body. */
work = (double*)plasma_private_alloc(plasma, ib*L.nb, L.dtyp);
ss_init(A.mt, A.nt, -1);
k = 0;
/* NOTE(review): n is read here but no assignment to it is visible above. */
while (n >= A.nt) {
k++;
n = n-A.nt+k;
}
m = k;
while (k < min(A.mt, A.nt) && n < A.nt && !ss_aborted()) {
/* Work out the position handled by the next iteration. */
next_n = n;
next_m = m;
next_k = k;
next_m++;
if (next_m == A.mt) {
next_n += PLASMA_SIZE;
while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
next_k++;
next_n = next_n-A.nt+next_k;
}
next_m = next_k;
}
/* Tile sizes: the last tile row/column may be smaller than mb/nb. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldam = BLKLDD(A, m);
if (n == k) {
if (m == k) {
/* Diagonal tile (k, k). */
ss_cond_wait(k, k, k-1);
/* NOTE(review): callee not shown in this listing (presumably CORE_dgetrf_incpiv) - confirm. */
tempkm, tempkn, ib,
A(k, k), ldak,
IPIV(k, k), &info);
/* A nonzero info is reported only when this is the last tile row; the value is offset by A.nb*k. */
if (info != 0 && m == A.mt-1) {
plasma_request_fail(sequence, request, info + A.nb*k);
}
ss_cond_set(k, k, k);
}
else {
/* Tile (m, k) below the diagonal in column k. */
ss_cond_wait(m, k, k-1);
/* NOTE(review): callee not shown in this listing (presumably CORE_dtstrf) - confirm. */
tempmm, tempkn, ib, A.nb,
A(k, k), ldak,
A(m, k), ldam,
L(m, k), L.mb,
IPIV(m, k),
work, L.nb, &info);
/* Same reporting rule as for the diagonal tile. */
if (info != 0 && m == A.mt-1) {
plasma_request_fail(sequence, request, info + A.nb*k);
}
ss_cond_set(m, k, k);
}
}
else {
if (m == k) {
/* Tile (k, n) to the right of the diagonal in row k. */
ss_cond_wait(k, k, k);
ss_cond_wait(k, n, k-1);
/* NOTE(review): callee not shown in this listing (presumably CORE_dgessm) - confirm. */
tempkm, tempnn, tempkm, ib,
IPIV(k, k),
A(k, k), ldak,
A(k, n), ldak);
}
else {
/* Remaining tile (m, n), updated together with (k, n) using L(m, k), A(m, k) and IPIV(m, k). */
ss_cond_wait(m, k, k);
ss_cond_wait(m, n, k-1);
/* NOTE(review): callee not shown in this listing (presumably CORE_dssssm) - confirm. */
A.nb, tempnn, tempmm, tempnn, A.nb, ib,
A(k, n), ldak,
A(m, n), ldam,
L(m, k), L.mb,
A(m, k), ldam,
IPIV(m, k));
ss_cond_set(m, n, k);
}
}
n = next_n;
m = next_m;
k = next_k;
}
plasma_private_free(plasma, work);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdgetrf_incpiv_quark ( PLASMA_desc  A,
PLASMA_desc  L,
int *  IPIV,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Parallel tile LU factorization - dynamic scheduling

Definition at line 143 of file pdgetrf_incpiv.c.

References A, BLKLDD, IPIV, L, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_dgessm(), QUARK_CORE_dgetrf_incpiv(), QUARK_CORE_dssssm(), QUARK_CORE_dtstrf(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling tile LU body: for every step k it submits one task for
 * the diagonal tile, one per tile in row k, one per tile in column k, and one
 * per trailing tile (m, n). All tasks share task_flags, which is tagged with
 * the caller's sequence.
 *
 * NOTE(review): the listing appears to have lost lines. plasma and
 * task_flags are used without a visible declaration, and every task
 * submission appears as a bare argument list. The References entry names
 * QUARK_CORE_dgetrf_incpiv, QUARK_CORE_dgessm, QUARK_CORE_dtstrf and
 * QUARK_CORE_dssssm -- confirm the mapping against pdgetrf_incpiv.c.
 */
int k, m, n;
int ldak, ldam;
int tempkm, tempkn, tempmm, tempnn;
int ib;
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
/* Extents of tile (k, k); edge tiles take the remainder of A.m / A.n. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
/* Diagonal tile. NOTE(review): call head missing; presumably QUARK_CORE_dgetrf_incpiv( -- confirm. */
plasma->quark, &task_flags,
tempkm, tempkn, ib, L.nb,
A(k, k), ldak, IPIV(k, k),
sequence, request,
k == A.mt-1, A.nb*k);
/* Tiles to the right of the diagonal in row k. */
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dgessm( -- confirm. */
plasma->quark, &task_flags,
tempkm, tempnn, tempkm, ib, L.nb,
IPIV(k, k),
A(k, k), ldak,
A(k, n), ldak);
}
/* Tiles below the diagonal in column k, each followed by its trailing row. */
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
/* NOTE(review): call head missing; presumably QUARK_CORE_dtstrf( -- confirm. */
plasma->quark, &task_flags,
tempmm, tempkn, ib, L.nb,
A(k, k), ldak,
A(m, k), ldam,
L(m, k), L.mb,
IPIV(m, k),
sequence, request,
m == A.mt-1, A.nb*k);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dssssm( -- confirm. */
plasma->quark, &task_flags,
A.nb, tempnn, tempmm, tempnn, A.nb, ib, L.nb,
A(k, n), ldak,
A(m, n), ldam,
L(m, k), L.mb,
A(m, k), ldam,
IPIV(m, k));
}
}
}
}

Here is the call graph for this function:

void plasma_pdgetrf_reclap_quark ( PLASMA_desc  A,
int *  IPIV,
PLASMA_sequence sequence,
PLASMA_request request 
)
void plasma_pdgetrf_rectil_quark ( PLASMA_desc  A,
int *  IPIV,
PLASMA_sequence sequence,
PLASMA_request request 
)
void plasma_pdlacpy ( plasma_context_t *  plasma)

Definition at line 23 of file pdlacpy.c.

References A, B, BLKLDD, CORE_dlacpy(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_5, PlasmaLower, PlasmaUpper, PlasmaUpperLower, plasma_sequence_t::status, and uplo.

{
/*
 * Static-scheduling tile copy body, dispatched on uplo. Each of the three
 * sections walks tile coordinates (m, n) in strides of PLASMA_SIZE and
 * handles one tile of A/B per iteration.
 *
 * NOTE(review): the listing appears to have lost lines. uplo, A and B are
 * used without a visible declaration, only the PlasmaUpperLower case label
 * is shown, the start value of n (first section) / m (other sections) is
 * not visible, and each kernel invocation appears as a bare argument list.
 * The References entry names CORE_dlacpy and PLASMA_RANK -- confirm against
 * pdlacpy.c.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int X, Y;
int m, n;
int next_m;
int next_n;
int ldam, ldbm;
plasma_unpack_args_5(uplo, A, B, sequence, request);
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
switch (uplo) {
/*
* PlasmaUpper
*/
/* NOTE(review): the case label for this section is not shown in the listing. */
m = 0;
/* Fold n below A.nt; each wrap moves to the next tile row and restarts at column m. */
while (n >= A.nt) {
m++;
n = n - A.nt + m;
}
while (m < A.mt) {
/* Coordinates of the tile handled after this one. */
next_m = m;
next_n = n;
next_n += PLASMA_SIZE;
while (next_n >= A.nt && next_m < A.mt) {
next_m++;
next_n = next_n - A.nt + next_m;
}
/* Tile extents; edge tiles take the remainder of A.m / A.n. */
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* Diagonal tiles pass uplo, all others PlasmaUpperLower. */
/* NOTE(review): call head missing from the listing; presumably CORE_dlacpy( -- confirm. */
m == n ? uplo : PlasmaUpperLower,
X, Y,
A(m, n), ldam,
B(m, n), ldbm);
n = next_n;
m = next_m;
}
break;
/*
* PlasmaLower
*/
/* NOTE(review): the case label for this section is not shown in the listing. */
n = 0;
/* Fold m below A.mt; each wrap moves to the next tile column and restarts at row n. */
while (m >= A.mt) {
n++;
m = m - A.mt + n;
}
while (n < A.nt) {
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m - A.mt + next_n;
}
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* NOTE(review): call head missing from the listing; presumably CORE_dlacpy( -- confirm. */
m == n ? uplo : PlasmaUpperLower,
X, Y,
A(m, n), ldam,
B(m, n), ldbm);
n = next_n;
m = next_m;
}
break;
/*
* PlasmaUpperLower
*/
case PlasmaUpperLower:
default:
n = 0;
/* Full matrix: a wrap moves to the next tile column with no diagonal offset. */
while (m >= A.mt) {
n++;
m = m - A.mt;
}
while (n < A.nt) {
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m - A.mt;
}
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* NOTE(review): call head missing from the listing; presumably CORE_dlacpy( -- confirm. */
PlasmaUpperLower,
X, Y,
A(m, n), ldam,
B(m, n), ldbm);
n = next_n;
m = next_m;
}
break;
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdlacpy_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Definition at line 153 of file pdlacpy.c.

References B, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaLower, PlasmaUpper, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_dlacpy(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling tile copy body, dispatched on uplo. Each section loops
 * over tile rows m and submits one task per tile of the selected part of
 * A/B; all tasks share task_flags, tagged with the caller's sequence.
 *
 * NOTE(review): the listing appears to have lost lines. plasma, task_flags
 * and A are used without a visible declaration, only the default label of
 * the switch is shown, and each task submission appears as a bare argument
 * list (the uplo argument of each call is not visible either). The
 * References entry names QUARK_CORE_dlacpy -- confirm against pdlacpy.c.
 */
int X, Y;
int m, n;
int ldam, ldbm;
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
switch (uplo) {
/*
* PlasmaUpper
*/
/* NOTE(review): the case label for this section is not shown in the listing. */
for (m = 0; m < A.mt; m++) {
/* Row extent of tile row m; the last row takes the remainder of A.m. */
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* Diagonal tile, present only while m is a valid tile column. */
if (m < A.nt) {
Y = m == A.nt-1 ? A.n-m*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlacpy( -- confirm. */
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, m), ldam,
B(m, m), ldbm);
}
/* Tiles strictly to the right of the diagonal. */
for (n = m+1; n < A.nt; n++) {
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlacpy( -- confirm. */
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, n), ldam,
B(m, n), ldbm);
}
}
break;
/*
* PlasmaLower
*/
/* NOTE(review): the case label for this section is not shown in the listing. */
for (m = 0; m < A.mt; m++) {
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* Diagonal tile, present only while m is a valid tile column. */
if (m < A.nt) {
Y = m == A.nt-1 ? A.n-m*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlacpy( -- confirm. */
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, m), ldam,
B(m, m), ldbm);
}
/* Tiles strictly to the left of the diagonal, clipped to A.nt columns. */
for (n = 0; n < min(m, A.nt); n++) {
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlacpy( -- confirm. */
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, n), ldam,
B(m, n), ldbm);
}
}
break;
/*
* PlasmaUpperLower
*/
default:
/* Every tile of the matrix. */
for (m = 0; m < A.mt; m++) {
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
for (n = 0; n < A.nt; n++) {
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlacpy( -- confirm. */
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, n), ldam,
B(m, n), ldbm);
}
}
}
}

Here is the call graph for this function:

void plasma_pdlag2s ( plasma_context_t *  plasma)

Definition at line 25 of file pdlag2s.c.

References A, BLKLDD, CORE_dlag2s(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, plasma_request_fail(), PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_4, SB, and plasma_sequence_t::status.

{
    /*
     * Static-scheduling body of plasma_pdlag2s: applies CORE_dlag2s to every
     * tile pair A(m, n) -> SB(m, n). Tiles are visited in column-major tile
     * order in strides of PLASMA_SIZE starting from this thread's
     * PLASMA_RANK. Any nonzero info returned by the kernel is recorded on
     * the request via plasma_request_fail().
     *
     * Changes relative to the listing:
     *  - the row extent X of an interior tile now uses A.mb (the tile row
     *    size) instead of A.nb, matching plasma_pdlag2s_quark and the other
     *    routines on this page; the two only coincide for square tiles;
     *  - the declarations of A and SB and the initial "m = PLASMA_RANK"
     *    were absent from the listing and are restored (A and SB are
     *    unpacked below, and PLASMA_RANK is named in the References entry).
     */
    PLASMA_desc A;
    PLASMA_desc SB;
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    int X, Y;
    int m, n;
    int next_m;
    int next_n;
    int ldam, ldbm;
    int info = PLASMA_SUCCESS;

    plasma_unpack_args_4(A, SB, sequence, request);
    /* Nothing to do if the sequence has already recorded a failure. */
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* First tile of this thread: linear index PLASMA_RANK, folded to (m, n). */
    n = 0;
    m = PLASMA_RANK;
    while (m >= A.mt && n < A.nt) {
        n++;
        m = m-A.mt;
    }

    while (n < A.nt) {
        /* Coordinates of the tile handled after this one. */
        next_m = m;
        next_n = n;
        next_m += PLASMA_SIZE;
        while (next_m >= A.mt && next_n < A.nt) {
            next_n++;
            next_m = next_m-A.mt;
        }

        /* Tile extents; edge tiles take the remainder of A.m / A.n. */
        X = m == A.mt-1 ? A.m-A.mb*m : A.mb;
        Y = n == A.nt-1 ? A.n-A.nb*n : A.nb;
        ldam = BLKLDD(A, m);
        ldbm = BLKLDD(SB, m);

        CORE_dlag2s(X, Y, A(m, n), ldam, SB(m, n), ldbm, &info);
        if (info != 0)
            plasma_request_fail(sequence, request, info);

        m = next_m;
        n = next_n;
    }
}

Here is the call graph for this function:

void plasma_pdlag2s_quark ( PLASMA_desc  A,
PLASMA_desc  SB,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Definition at line 77 of file pdlag2s.c.

References BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_dlag2s(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, SB, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling body: submits one task per tile pair A(m, n), SB(m, n),
 * passing the sequence and request along with each task. All tasks share
 * task_flags, tagged with the caller's sequence.
 *
 * NOTE(review): the listing appears to have lost lines. plasma and
 * task_flags are used without a visible declaration and the task submission
 * appears as a bare argument list. The References entry names
 * QUARK_CORE_dlag2s -- confirm against pdlag2s.c.
 */
int X, Y;
int m, n;
int ldam, ldbm;
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for(m = 0; m < A.mt; m++) {
/* Row extent of tile row m; the last row takes the remainder of A.m. */
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(SB, m);
for(n = 0; n < A.nt; n++) {
/* Column extent of tile column n; the last column takes the remainder of A.n. */
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlag2s( -- confirm. */
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, n), ldam,
SB(m, n), ldbm,
sequence, request);
}
}
}

Here is the call graph for this function:

void plasma_pdlange ( plasma_context_t *  plasma)

Definition at line 24 of file pdlange.c.

References A, BLKLDD, CORE_dasum(), CORE_dlange(), plasma_desc_t::i, plasma_desc_t::j, plasma_desc_t::m, max, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, norm, plasma_desc_t::nt, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, plasma_unpack_args_6, PlasmaColumnwise, PlasmaFrobeniusNorm, PlasmaInfNorm, PlasmaMaxNorm, PlasmaOneNorm, PlasmaRealDouble, PlasmaRowwise, PlasmaUpperLower, ss_cond_set, ss_cond_wait, ss_finalize, and ss_init.

{
/*
 * Static-scheduling matrix-norm body, dispatched on norm. Each section
 * leaves a partial result in work[PLASMA_RANK]; the tail of the function
 * then combines the per-rank values with max() and rank 0 stores work[0]
 * in *result.
 *
 * NOTE(review): the listing appears to have lost lines. norm and A are used
 * without a visible declaration, no case label other than default is shown,
 * the start values of m / n and the per-iteration increments of the second
 * and third sections are not visible, and two kernel invocations appear as
 * bare argument lists. The References entry names CORE_dasum, PLASMA_RANK,
 * PlasmaColumnwise, PlasmaRowwise, ss_init, ss_cond_set and ss_finalize --
 * confirm against pdlange.c.
 */
double *work;
double *result;
PLASMA_sequence *sequence;
PLASMA_request *request;
int m, n;
int next_m;
int next_n;
int ldam;
int step, lrank;
int X, X1, X2, Y, Y1, Y2;
double* lwork;
double normtmp, normtmp2;
plasma_unpack_args_6(norm, A, work, result, sequence, request);
*result = 0.0;
/* Rank 0 clears the PLASMA_SIZE per-rank slots of work. */
if (PLASMA_RANK == 0)
memset(work, 0, PLASMA_SIZE*sizeof(double));
switch (norm) {
/*
* PlasmaMaxNorm
*/
/* NOTE(review): the case label and the initial value of m are not shown in the listing. */
n = 0;
while (m >= A.mt && n < A.nt) {
n++;
m = m-A.mt;
}
while (n < A.nt) {
/* Coordinates of the tile handled after this one (stride PLASMA_SIZE). */
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m-A.mt;
}
/*
 * Row range [X1, X2) and column range [Y1, Y2) inside the tile: the first
 * tile starts at the A.i / A.j offset, the last one ends where the matrix
 * ends.
 */
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
ldam = BLKLDD(A, m);
CORE_dlange(PlasmaMaxNorm, X, Y, A(m, n, X1, Y1, ldam), ldam, NULL, &normtmp);
/* Keep the running maximum for this rank. */
if (normtmp > work[PLASMA_RANK])
work[PLASMA_RANK] = normtmp;
m = next_m;
n = next_n;
}
break;
/*
* PlasmaOneNorm
*/
/*
 * NOTE(review): the case label, the initial value of n and the increment of
 * n inside the loop are not shown in the listing.
 */
normtmp2 = 0.0;
lwork = (double*)plasma_private_alloc(plasma, A.nb, PlasmaRealDouble);
while (n < A.nt) {
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* lwork holds A.nb accumulators for tile column n, reset per column. */
memset(lwork, 0, A.nb*sizeof(double));
for (m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* NOTE(review): call head (and possibly leading arguments) missing; presumably CORE_dasum( -- confirm. */
X, Y,
A(m, n, X1, Y1, ldam), ldam,
lwork);
}
/* Largest of the Y accumulated values for this tile column. */
CORE_dlange(PlasmaMaxNorm, Y, 1, lwork, 1, NULL, &normtmp);
if (normtmp > normtmp2)
normtmp2 = normtmp;
}
work[PLASMA_RANK] = normtmp2;
plasma_private_free(plasma, lwork);
break;
/*
* PlasmaInfNorm
*/
/*
 * NOTE(review): the case label, the initial value of m and the increment of
 * m inside the loop are not shown in the listing.
 */
normtmp2 = 0.0;
lwork = (double*)plasma_private_alloc(plasma, A.mb, PlasmaRealDouble);
while (m < A.mt) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* lwork holds A.mb accumulators for tile row m, reset per row. */
memset(lwork, 0, A.mb*sizeof(double));
for (n = 0; n < A.nt; n++) {
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* NOTE(review): call head (and possibly leading arguments) missing; presumably CORE_dasum( -- confirm. */
X, Y,
A(m, n, X1, Y1, ldam), ldam,
lwork);
}
/* Largest of the X accumulated values for this tile row. */
CORE_dlange(PlasmaMaxNorm, X, 1, lwork, 1, NULL, &normtmp);
if (normtmp > normtmp2)
normtmp2 = normtmp;
}
work[PLASMA_RANK] = normtmp2;
plasma_private_free(plasma, lwork);
break;
/*
* PlasmaFrobeniusNorm - not implemented
*/
default:;
}
if (norm != PlasmaFrobeniusNorm) {
/*
 * Combine the per-rank partial results: while this rank is even at the
 * current level, wait for the partner at distance step and fold its value
 * in with max(); step doubles at each level.
 */
step = 1;
lrank = PLASMA_RANK;
while ( (lrank%2 == 0) && (PLASMA_RANK+step < PLASMA_SIZE) ) {
ss_cond_wait(PLASMA_RANK+step, 0, step);
work[PLASMA_RANK] = max(work[PLASMA_RANK], work[PLASMA_RANK+step]);
lrank = lrank >> 1;
step = step << 1;
}
/*
 * NOTE(review): as shown, the loop below only updates lrank and step and
 * has no other visible effect; a signalling call (ss_cond_set is named in
 * the References entry) may have been dropped from the listing -- confirm.
 */
if (PLASMA_RANK > 0) {
while( lrank != 0 ) {
if (lrank%2 == 1) {
lrank = 0;
} else {
lrank = lrank >> 1;
step = step << 1;
}
}
}
/* Rank 0 holds the combined value. */
if (PLASMA_RANK == 0)
*result = work[0];
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdlange_quark ( PLASMA_enum  norm,
PLASMA_desc  A,
double *  work,
double *  result,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Definition at line 202 of file pdlange.c.

References BLKLDD, plasma_desc_t::i, plasma_desc_t::j, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_shared_alloc(), PLASMA_SUCCESS, PlasmaColumnwise, PlasmaFrobeniusNorm, PlasmaInfNorm, PlasmaMaxNorm, PlasmaOneNorm, PlasmaRealDouble, PlasmaRowwise, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_dasum_f1(), QUARK_CORE_dlange(), QUARK_CORE_dlange_f1(), QUARK_CORE_free(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling matrix-norm body, dispatched on norm. Each section
 * allocates a zeroed shared buffer lwork, submits one task per tile that
 * writes into it, submits a final task that produces *result from lwork,
 * and finally submits QUARK_CORE_free for the buffer.
 *
 * NOTE(review): the listing appears to have lost lines. plasma, task_flags
 * and A are used without a visible declaration, no case label other than
 * default is shown, and every task submission appears as a bare argument
 * list with some leading arguments possibly missing. The References entry
 * names QUARK_CORE_dlange_f1, QUARK_CORE_dasum_f1 and QUARK_CORE_dlange --
 * confirm against pdlange.c.
 * NOTE(review): the result of plasma_shared_alloc is passed to memset
 * without a visible NULL check.
 */
int X, X1, X2, Y, Y1, Y2;
int ldam;
int m, n;
int szeW;
double* lwork;
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
*result = 0.0;
switch ( norm ) {
/*
* PlasmaMaxNorm
*/
/* NOTE(review): the case label for this section is not shown in the listing. */
/* One slot per tile; tile (m, n) uses lwork[A.mt*n+m]. */
szeW = A.mt*A.nt;
lwork = (double*)plasma_shared_alloc(plasma, szeW, PlasmaRealDouble);
memset(lwork, 0, szeW*sizeof(double));
for(m = 0; m < A.mt; m++) {
/* Row range [X1, X2) inside tile row m (first/last tiles are clipped). */
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
for(n = 0; n < A.nt; n++) {
/* Column range [Y1, Y2) inside tile column n. */
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* NOTE(review): call head and leading arguments missing; presumably QUARK_CORE_dlange_f1( -- confirm. */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
0, &(lwork[A.mt*n+m]),
lwork, szeW);
}
}
/* Final task over lwork writing to result. NOTE(review): call head missing; presumably QUARK_CORE_dlange( -- confirm. */
plasma->quark, &task_flags,
lwork, A.mt, szeW,
0, result);
QUARK_CORE_free(plasma->quark, &task_flags, lwork, szeW*sizeof(double));
break;
/*
* PlasmaOneNorm
*/
/* NOTE(review): the case label for this section is not shown in the listing. */
/* A.n+1 entries; tile column n uses the A.nb entries starting at lwork[n*A.nb+1]. */
lwork = (double*)plasma_shared_alloc(plasma, (A.n+1), PlasmaRealDouble);
memset(lwork, 0, (A.n+1)*sizeof(double));
for(m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
for(n = 0; n < A.nt; n++) {
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* NOTE(review): call head and leading arguments missing; presumably QUARK_CORE_dasum_f1( -- confirm. */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
&(lwork[n*A.nb+1]), A.nb,
lwork, A.n);
}
}
/* Final task: PlasmaMaxNorm over the A.n+1 entries of lwork, written to result. */
/* NOTE(review): call head missing; presumably QUARK_CORE_dlange( -- confirm. */
plasma->quark, &task_flags,
PlasmaMaxNorm, A.n+1, 1,
lwork, 1, A.n+1,
0, result);
QUARK_CORE_free(plasma->quark, &task_flags, lwork, (A.n+1)*sizeof(double));
break;
/*
* PlasmaInfNorm
*/
/* NOTE(review): the case label for this section is not shown in the listing. */
/* A.m+1 entries; tile row m uses the A.mb entries starting at lwork[m*A.mb+1]. */
lwork = (double*)plasma_shared_alloc(plasma, (A.m+1), PlasmaRealDouble);
memset(lwork, 0, (A.m+1)*sizeof(double));
for(m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
for(n = 0; n < A.nt; n++) {
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* NOTE(review): call head and leading arguments missing; presumably QUARK_CORE_dasum_f1( -- confirm. */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
&(lwork[m*A.mb+1]), A.mb,
lwork, A.m);
}
}
/* Final task: PlasmaMaxNorm over the A.m+1 entries of lwork, written to result. */
/* NOTE(review): call head missing; presumably QUARK_CORE_dlange( -- confirm. */
plasma->quark, &task_flags,
PlasmaMaxNorm, A.m+1, 1,
lwork, 1, A.m+1,
0, result);
QUARK_CORE_free(plasma->quark, &task_flags, lwork, (A.m+1)*sizeof(double));
break;
/*
* PlasmaFrobeniusNorm - not implemented
*/
default:;
}
}

Here is the call graph for this function:

void plasma_pdlansy ( plasma_context_t *  plasma)

Definition at line 24 of file pdlansy.c.

References A, BLKLDD, CORE_dasum(), CORE_dlange(), CORE_dlansy(), plasma_desc_t::i, plasma_desc_t::j, plasma_desc_t::m, max, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, norm, plasma_desc_t::nt, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, plasma_unpack_args_7, PlasmaColumnwise, PlasmaFrobeniusNorm, PlasmaInfNorm, PlasmaLower, PlasmaMaxNorm, PlasmaOneNorm, PlasmaRealDouble, PlasmaRowwise, PlasmaUpperLower, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, and uplo.

{
/*
 * Static-scheduling norm body for a matrix of which only the uplo triangle
 * is referenced, dispatched on norm. Each section leaves a partial result in
 * work[PLASMA_RANK]; the tail combines the per-rank values with max() and
 * rank 0 stores work[0] in *result.
 *
 * NOTE(review): the listing appears to have lost lines. norm, uplo and A
 * are used without a visible declaration, no case label other than default
 * is shown, and the start values of m and the loop increment of the second
 * section are not visible. The References entry names PLASMA_RANK, ss_init,
 * ss_cond_set and ss_finalize -- confirm against pdlansy.c.
 */
double *work;
double *result;
PLASMA_sequence *sequence;
PLASMA_request *request;
int m, n;
int next_m;
int next_n;
int ldam, ldan;
int step, lrank;
int X, X1, X2, Y, Y1, Y2;
double* lwork;
double normtmp, normtmp2;
plasma_unpack_args_7(norm, uplo, A, work, result, sequence, request);
*result = 0.0;
/* Rank 0 clears the PLASMA_SIZE per-rank slots of work. */
if (PLASMA_RANK == 0)
memset(work, 0, PLASMA_SIZE*sizeof(double));
switch (norm) {
/*
* PlasmaMaxNorm
*/
/* NOTE(review): the case label and the initial value of m are not shown in the listing. */
n = 0;
/* Fold m below A.mt; each wrap moves to the next tile column and restarts at row n. */
while (m >= A.mt && n < A.nt) {
n++;
m = m-A.mt+n;
}
while (n < A.nt) {
/* Coordinates of the tile handled after this one (stride PLASMA_SIZE). */
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m-A.mt+next_n;
}
/* Diagonal tile: handled by CORE_dlansy with the caller's uplo. */
if (m == n) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
CORE_dlansy(PlasmaMaxNorm, uplo, X, A(m, n, X1, X1, ldam), ldam, NULL, &normtmp);
}
else {
/*
* PlasmaLower
*/
/* Off-diagonal tile read at (m, n). */
if (uplo == PlasmaLower) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
ldam = BLKLDD(A, m);
CORE_dlange(PlasmaMaxNorm, X, Y, A(m, n, X1, Y1, ldam), ldam, NULL, &normtmp);
}
/*
* PlasmaUpper
*/
/* Same traversal with the indices swapped: the tile is read at (n, m). */
else {
X1 = n == 0 ? A.i %A.mb : 0;
X2 = n == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
Y1 = m == 0 ? A.j %A.nb : 0;
Y2 = m == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
ldan = BLKLDD(A, n);
CORE_dlange(PlasmaMaxNorm, X, Y, A(n, m, X1, Y1, ldan), ldan, NULL, &normtmp);
}
}
/* Keep the running maximum for this rank. */
if (normtmp > work[PLASMA_RANK])
work[PLASMA_RANK] = normtmp;
m = next_m;
n = next_n;
}
break;
/*
* PlasmaOneNorm / PlasmaInfNorm
*/
/*
 * NOTE(review): the case label(s), the initial value of m and the increment
 * of m inside the loop are not shown in the listing.
 */
normtmp2 = 0.0;
lwork = (double*)plasma_private_alloc(plasma, A.mb, PlasmaRealDouble);
while (m < A.mt) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* lwork holds A.mb accumulators for tile row m, reset per row. */
memset(lwork, 0, A.mb*sizeof(double));
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
/* Stored tiles left of the diagonal, accumulated row-wise. */
for (n = 0; n < m; n++) {
Y1 = n == 0 ? A.j%A.nb : 0;
Y = A.nb - Y1;
CORE_dasum(PlasmaRowwise, PlasmaUpperLower, X, Y, A(m, n, X1, Y1, ldam), ldam, lwork);
}
/* Diagonal tile with the caller's uplo. */
CORE_dasum(PlasmaRowwise, uplo, X, X, A(m, m, X1, X1, ldam), ldam, lwork);
/* Tiles right of the diagonal are read from (n, m) and accumulated column-wise. */
for (n = m+1; n < A.mt; n++) {
Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
ldan = BLKLDD(A, n);
CORE_dasum(PlasmaColumnwise, PlasmaUpperLower, Y, X, A(n, m, 0, X1, ldan), ldan, lwork);
}
}
/*
* PlasmaUpper
*/
else {
/* Tiles left of the diagonal are read from (n, m) and accumulated column-wise. */
for (n = 0; n < m; n++) {
Y1 = n == 0 ? A.j%A.nb : 0;
Y = A.nb - Y1;
CORE_dasum(PlasmaColumnwise, PlasmaUpperLower, Y, X, A(n, m, Y1, X1, A.nb), A.nb, lwork);
}
/* Diagonal tile with the caller's uplo. */
CORE_dasum(PlasmaRowwise, uplo, X, X, A(m, m, X1, X1, ldam), ldam, lwork);
/* Stored tiles right of the diagonal, accumulated row-wise. */
for ( n =m+1; n < A.mt; n++) {
Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
CORE_dasum(PlasmaRowwise, PlasmaUpperLower, X, Y, A(m, n, X1, 0, ldam), ldam, lwork);
}
}
/* Largest of the X accumulated values for this tile row. */
CORE_dlange(PlasmaMaxNorm, X, 1, lwork, 1, NULL, &normtmp);
if (normtmp > normtmp2)
normtmp2 = normtmp;
}
work[PLASMA_RANK] = normtmp2;
plasma_private_free(plasma, lwork);
break;
/*
* PlasmaFrobeniusNorm
*/
default:;
}
if (norm != PlasmaFrobeniusNorm) {
/*
 * Combine the per-rank partial results: while this rank is even at the
 * current level, wait for the partner at distance step and fold its value
 * in with max(); step doubles at each level.
 */
step = 1;
lrank = PLASMA_RANK;
while ( (lrank%2 == 0) && (PLASMA_RANK+step < PLASMA_SIZE) ) {
ss_cond_wait(PLASMA_RANK+step, 0, step);
work[PLASMA_RANK] = max(work[PLASMA_RANK], work[PLASMA_RANK+step]);
lrank = lrank >> 1;
step = step << 1;
}
/*
 * NOTE(review): as shown, the loop below only updates lrank and step and
 * has no other visible effect; a signalling call (ss_cond_set is named in
 * the References entry) may have been dropped from the listing -- confirm.
 */
if (PLASMA_RANK > 0) {
while( lrank != 0 ) {
if (lrank%2 == 1) {
lrank = 0;
} else {
lrank = lrank >> 1;
step = step << 1;
}
}
}
/* Rank 0 holds the combined value. */
if (PLASMA_RANK == 0)
*result = work[0];
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdlansy_quark ( PLASMA_enum  norm,
PLASMA_enum  uplo,
PLASMA_desc  A,
double *  work,
double *  result,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Definition at line 219 of file pdlansy.c.

References BLKLDD, plasma_desc_t::i, plasma_desc_t::j, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_shared_alloc(), PLASMA_SUCCESS, PlasmaColumnwise, PlasmaFrobeniusNorm, PlasmaInfNorm, PlasmaLower, PlasmaMaxNorm, PlasmaOneNorm, PlasmaRealDouble, PlasmaRowwise, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_dasum_f1(), QUARK_CORE_dlange(), QUARK_CORE_dlange_f1(), QUARK_CORE_dlansy_f1(), QUARK_CORE_free(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling norm body for a matrix of which only the uplo triangle
 * is referenced, dispatched on norm. Each section allocates a zeroed shared
 * buffer lwork, submits per-tile tasks that write into it, submits a final
 * PlasmaMaxNorm task over lwork that writes *result, and finally submits
 * QUARK_CORE_free for the buffer.
 *
 * NOTE(review): the listing appears to have lost lines. plasma, task_flags
 * and A are used without a visible declaration, no case label other than
 * default is shown, and every task submission appears as a bare argument
 * list with some leading arguments possibly missing. The References entry
 * names QUARK_CORE_dlansy_f1, QUARK_CORE_dlange_f1, QUARK_CORE_dasum_f1 and
 * QUARK_CORE_dlange -- confirm against pdlansy.c.
 * NOTE(review): the result of plasma_shared_alloc is passed to memset
 * without a visible NULL check.
 */
int X, X1, X2, Y, Y1;
int ldam;
int m, n;
int szeW, pos;
double* lwork;
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
*result = 0.0;
switch ( norm ) {
/*
* PlasmaMaxNorm
*/
/* NOTE(review): the case label for this section is not shown in the listing. */
/* One slot per tile of a triangle with A.mt tile rows; pos is the next free slot. */
szeW = A.mt*(A.mt+1)/2;
pos = 0;
lwork = (double*)plasma_shared_alloc(plasma, szeW, PlasmaRealDouble);
memset(lwork, 0, szeW*sizeof(double));
for(m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* Diagonal tile. NOTE(review): call head and leading arguments missing; presumably QUARK_CORE_dlansy_f1( -- confirm. */
plasma->quark, &task_flags,
A(m, m, X1, X1, ldam), ldam, ldam*X,
0, &(lwork[pos]),
lwork, szeW);
pos++;
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
/* Tiles left of the diagonal in tile row m. */
for(n=0; n<m; n++) {
Y1 = n == 0 ? A.j%A.nb : 0;
Y = A.nb - Y1;
/* NOTE(review): call head and leading arguments missing; presumably QUARK_CORE_dlange_f1( -- confirm. */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
0, &(lwork[pos]),
lwork, szeW);
pos++;
}
}
/*
* PlasmaUpper
*/
else {
/* Tiles right of the diagonal in tile row m. */
for(n = m+1; n < A.mt; n++) {
Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
/* NOTE(review): call head and leading arguments missing; presumably QUARK_CORE_dlange_f1( -- confirm. */
plasma->quark, &task_flags,
A(m, n, X1, 0, ldam), ldam, ldam*Y,
0, &(lwork[pos]),
lwork, szeW);
pos++;
}
}
}
/* Final task: PlasmaMaxNorm over the szeW slots, written to result. */
/* NOTE(review): call head missing; presumably QUARK_CORE_dlange( -- confirm. */
plasma->quark, &task_flags,
PlasmaMaxNorm, szeW, 1,
lwork, 1, szeW,
0, result);
QUARK_CORE_free(plasma->quark, &task_flags, lwork, szeW*sizeof(double));
break;
/*
* PlasmaOneNorm / PlasmaInfNorm
*/
/* NOTE(review): the case label(s) for this section are not shown in the listing. */
/* A.m+1 entries; tile row m uses the A.mb entries starting at lwork[m*A.mb+1]. */
lwork = (double *)plasma_shared_alloc(plasma, A.m+1, PlasmaRealDouble);
memset(lwork, 0, (A.m+1)*sizeof(double));
for(m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* Diagonal tile. NOTE(review): call head and leading arguments missing; presumably QUARK_CORE_dasum_f1( -- confirm. */
plasma->quark, &task_flags,
A(m, m, X1, X1, ldam), ldam, ldam*X,
&(lwork[m*A.mb+1]), A.mb,
lwork, A.m);
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
for(n = 0; n < m; n++) {
Y1 = n == 0 ? A.j%A.nb : 0;
Y = A.nb - Y1;
/*
 * Each off-diagonal tile is submitted twice: once targeting the block of
 * tile row m and once targeting the block of tile row n.
 * NOTE(review): call heads and leading arguments missing; presumably
 * QUARK_CORE_dasum_f1( -- confirm.
 */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
&(lwork[m*A.mb+1]), A.mb,
lwork, A.m);
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
&(lwork[n*A.mb+1]), A.mb,
lwork, A.m);
}
}
/*
* PlasmaUpper
*/
else {
for(n = m+1; n < A.mt; n++) {
Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
/*
 * Each off-diagonal tile is submitted twice: once targeting the block of
 * tile row m and once targeting the block of tile row n.
 * NOTE(review): call heads and leading arguments missing; presumably
 * QUARK_CORE_dasum_f1( -- confirm.
 */
plasma->quark, &task_flags,
A(m, n, X1, 0, ldam), ldam, ldam*Y,
&(lwork[m*A.mb+1]), A.mb,
lwork, A.m);
plasma->quark, &task_flags,
A(m, n, X1, 0, ldam), ldam, ldam*Y,
&(lwork[n*A.mb+1]), A.mb,
lwork, A.m);
}
}
}
/* Final task: PlasmaMaxNorm over the A.m+1 entries of lwork, written to result. */
/* NOTE(review): call head missing; presumably QUARK_CORE_dlange( -- confirm. */
plasma->quark, &task_flags,
PlasmaMaxNorm, A.m+1, 1,
lwork, 1, A.m+1,
0, result);
QUARK_CORE_free(plasma->quark, &task_flags, lwork, (A.m+1)*sizeof(double));
break;
/*
* PlasmaFrobeniusNorm - not implemented
*/
default:;
}
}

Here is the call graph for this function:

void plasma_pdlaset2_quark ( PLASMA_enum  uplo,
double  alpha,
PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Parallel initialization of a 2-D array A to ALPHA on the off-diagonals.

Definition at line 22 of file pdlaset2.c.

References BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaLower, PlasmaUpper, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_dlaset2(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling body: submits one task per tile of the part of A
 * selected by uplo, passing alpha to each. Diagonal tiles are submitted with
 * the triangular uplo, off-diagonal tiles with PlasmaUpperLower.
 *
 * NOTE(review): the listing appears to have lost lines. plasma and
 * task_flags are used without a visible declaration and every task
 * submission appears as a bare argument list. The References entry names
 * QUARK_CORE_dlaset2 -- confirm against pdlaset2.c.
 */
int i, j;
int ldai, ldaj;
int tempim;
int tempjm, tempjn;
int minmn = min(A.mt, A.nt);
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if (uplo == PlasmaLower) {
/* Column by column: the diagonal tile, then every tile below it. */
for (j = 0; j < minmn; j++){
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset2( -- confirm. */
plasma->quark, &task_flags,
PlasmaLower, tempjm, tempjn, alpha,
A(j, j), ldaj);
for (i = j+1; i < A.mt; i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset2( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha,
A(i, j), ldai);
}
}
}
else if (uplo == PlasmaUpper) {
/* First every tile strictly above the diagonal ... */
for (j = 1; j < A.nt; j++){
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
for (i = 0; i < min(j, A.mt); i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset2( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha,
A(i, j), ldai);
}
}
/* ... then the diagonal tiles. */
for (j = 0; j < minmn; j++){
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset2( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpper, tempjm, tempjn, alpha,
A(j, j), ldaj);
}
}
else {
/* Any other uplo: every tile of the matrix. */
for (i = 0; i < A.mt; i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
for (j = 0; j < A.nt; j++){
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset2( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha,
A(i, j), ldai);
}
}
}
}

Here is the call graph for this function:

void plasma_pdlaset_quark ( PLASMA_enum  uplo,
double  alpha,
double  beta,
PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Parallel initialization of a 2-D array A to BETA on the diagonal and ALPHA on the off-diagonals.

Definition at line 22 of file pdlaset.c.

References BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaLower, PlasmaUpper, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_dlaset(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling body: submits one task per tile of the part of A
 * selected by uplo. Off-diagonal tiles are submitted with (alpha, alpha),
 * diagonal tiles with (alpha, beta).
 *
 * NOTE(review): the listing appears to have lost lines. plasma and
 * task_flags are used without a visible declaration and every task
 * submission appears as a bare argument list. The References entry names
 * QUARK_CORE_dlaset -- confirm against pdlaset.c.
 */
int i, j;
int ldai, ldaj;
int tempim;
int tempjm, tempjn;
int minmn = min(A.mt, A.nt);
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if (uplo == PlasmaLower) {
/* Column by column: the diagonal tile, then every tile below it. */
for (j = 0; j < minmn; j++){
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset( -- confirm. */
plasma->quark, &task_flags,
PlasmaLower, tempjm, tempjn, alpha, beta,
A(j, j), ldaj);
for (i = j+1; i < A.mt; i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha, alpha,
A(i, j), ldai);
}
}
}
else if (uplo == PlasmaUpper) {
/* First every tile strictly above the diagonal ... */
for (j = 1; j < A.nt; j++){
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
for (i = 0; i < min(j, A.mt); i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha, alpha,
A(i, j), ldai);
}
}
/* ... then the diagonal tiles. */
for (j = 0; j < minmn; j++){
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpper, tempjm, tempjn, alpha, beta,
A(j, j), ldaj);
}
}
else {
/*
 * Any other uplo: every tile is first submitted with (alpha, alpha), then
 * the diagonal tiles are submitted a second time with (alpha, beta).
 */
for (i = 0; i < A.mt; i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
for (j = 0; j < A.nt; j++){
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha, alpha,
A(i, j), ldai);
}
}
for (j = 0; j < minmn; j++){
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaset( -- confirm. */
plasma->quark, &task_flags,
PlasmaUpperLower, tempjm, tempjn, alpha, beta,
A(j, j), ldaj);
}
}
}

Here is the call graph for this function:

void plasma_pdlaswp_quark ( PLASMA_desc  B,
int *  IPIV,
int  inc,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Parallel tile row interchanges - dynamic scheduling

Definition at line 23 of file pdlaswp.c.

References B, IPIV, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_desc_submatrix(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_dlaswp_ontile(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling row-interchange body: submits one task per tile
 * (m, n) of B. Tile rows are visited top-down when inc > 0 and bottom-up
 * otherwise; each task receives the sub-matrix of B starting at row m*B.mb,
 * the pivots IPIV(m), and a reference tile from the far end of the sweep
 * (the last tile row when inc > 0, the first otherwise).
 *
 * NOTE(review): the listing appears to have lost lines. plasma and
 * task_flags are used without a visible declaration and each task
 * submission appears as a bare argument list. The References entry names
 * QUARK_CORE_dlaswp_ontile -- confirm against pdlaswp.c.
 */
int m, n;
int tempi, tempm, tempmm, tempnn;
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if ( inc > 0 )
{
for (m = 0; m < B.mt; m++) {
/* tempi: first matrix row of tile row m; tempm: rows from there to the end of B. */
tempi = m * B.mb;
tempm = B.m - tempi;
tempmm = m == B.mt-1 ? tempm : B.mb;
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n - n * B.nb : B.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaswp_ontile( -- confirm. */
plasma->quark, &task_flags,
plasma_desc_submatrix(B, tempi, n*B.nb, tempm, tempnn),
B(m, n), 1, tempmm, IPIV(m), inc, B(B.mt-1, n) );
}
}
}
else
{
/* Same submissions with the tile rows visited in reverse order. */
for (m = B.mt-1; m > -1; m--) {
tempi = m * B.mb;
tempm = B.m - tempi;
tempmm = m == B.mt-1 ? tempm : B.mb;
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n - n * B.nb : B.nb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaswp_ontile( -- confirm. */
plasma->quark, &task_flags,
plasma_desc_submatrix(B, tempi, n*B.nb, tempm, tempnn),
B(m, n), 1, tempmm, IPIV(m), inc, B(0, n) );
}
}
}
}

Here is the call graph for this function:

void plasma_pdlaswpc_quark ( PLASMA_desc  B,
int *  IPIV,
int  inc,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Parallel tile column interchanges - dynamic scheduling

Definition at line 23 of file pdlaswpc.c.

References B, IPIV, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_desc_submatrix(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_dlaswpc_ontile(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling column-interchange body: submits one task per tile
 * (m, n) of B. Tile columns are visited left-to-right when inc > 0 and
 * right-to-left otherwise; each task receives the sub-matrix of B starting
 * at column n*B.nb, the pivots IPIV(n), and a reference tile from the far
 * end of the sweep (the last tile column when inc > 0, the first otherwise).
 *
 * NOTE(review): the listing appears to have lost lines. plasma and
 * task_flags are used without a visible declaration and each task
 * submission appears as a bare argument list. The References entry names
 * QUARK_CORE_dlaswpc_ontile -- confirm against pdlaswpc.c.
 */
int m, n;
int tempj, tempn, tempmm, tempnn;
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if ( inc > 0 )
{
for (n = 0; n < B.nt; n++) {
/* tempj: first matrix column of tile column n; tempn: columns from there to the end of B. */
tempj = n * B.nb;
tempn = B.n - tempj;
tempnn = n == B.nt-1 ? tempn : B.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m - m * B.mb : B.mb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaswpc_ontile( -- confirm. */
plasma->quark, &task_flags,
plasma_desc_submatrix(B, m*B.mb, tempj, tempmm, tempn),
B(m, n), 1, tempnn, IPIV(n), inc, B(m, B.nt-1) );
}
}
}
else
{
/* Same submissions with the tile columns visited in reverse order. */
for (n = B.nt-1; n > -1; n--) {
tempj = n * B.nb;
tempn = B.n - tempj;
tempnn = n == B.nt-1 ? tempn : B.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m - m * B.mb : B.mb;
/* NOTE(review): call head missing; presumably QUARK_CORE_dlaswpc_ontile( -- confirm. */
plasma->quark, &task_flags,
plasma_desc_submatrix(B, m*B.mb, tempj, tempmm, tempn),
B(m, n), 1, tempnn, IPIV(n), inc, B(m, 0) );
}
}
}
}

Here is the call graph for this function:

void plasma_pdlauum_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Parallel UU' or L'L operation - dynamic scheduling

Definition at line 23 of file pdlauum.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaLeft, PlasmaLower, PlasmaNonUnit, PlasmaNoTrans, PlasmaRight, PlasmaTrans, plasma_context_struct::quark, QUARK_CORE_dgemm(), QUARK_CORE_dlauum(), QUARK_CORE_dsyrk(), QUARK_CORE_dtrmm(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling body of the tile UU' / L'L product (per the page
 * description). For every tile row m it submits, in order: updates of the
 * already-visited tiles (n, n) and (k, n) / (n, k) using the tiles of row
 * (lower) or column (upper) m, then an update of each off-diagonal tile of
 * that row/column with the diagonal tile A(m, m), then a task on A(m, m)
 * itself.
 *
 * NOTE(review): the listing appears to have lost lines. plasma and
 * task_flags are used without a visible declaration, and every task
 * submission appears as a bare argument list whose leading option arguments
 * are not visible. The References entry names QUARK_CORE_dsyrk,
 * QUARK_CORE_dgemm, QUARK_CORE_dtrmm and QUARK_CORE_dlauum -- confirm the
 * mapping against pdlauum.c.
 */
int k, m, n;
int ldam;
int tempkm, tempmm, tempnn;
double zone = (double)1.0;
plasma = plasma_context_self();
/* Nothing to do if the sequence has already recorded a failure. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
for (m = 0; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
for(n = 0; n < m; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Update of diagonal tile A(n, n) from A(m, n). NOTE(review): call head missing; presumably QUARK_CORE_dsyrk( -- confirm. */
plasma->quark, &task_flags,
tempnn, tempmm, A.mb,
1.0, A(m, n), ldam,
1.0, A(n, n), A.mb);
for(k = n+1; k < m; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
/* Update of A(k, n) from A(m, k) and A(m, n). NOTE(review): call head missing; presumably QUARK_CORE_dgemm( -- confirm. */
plasma->quark, &task_flags,
tempkm, tempnn, tempmm, A.mb,
zone, A(m, k), ldam,
A(m, n), ldam,
zone, A(k, n), A.mb);
}
}
for (n = 0; n < m; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Update of A(m, n) with the diagonal tile A(m, m). NOTE(review): call head missing; presumably QUARK_CORE_dtrmm( -- confirm. */
plasma->quark, &task_flags,
tempmm, tempnn, A.mb,
zone, A(m, m), ldam,
A(m, n), ldam);
}
/* Task on the diagonal tile A(m, m). NOTE(review): call head missing; presumably QUARK_CORE_dlauum( -- confirm. */
plasma->quark, &task_flags,
tempmm,
A.mb, A(m, m), ldam);
}
}
/*
* PlasmaUpper
*/
else {
/* Mirror image of the lower case: tiles are addressed as (n, m) / (n, k). */
for (m = 0; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
for (n = 0; n < m; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Update of diagonal tile A(n, n) from A(n, m). NOTE(review): call head missing; presumably QUARK_CORE_dsyrk( -- confirm. */
plasma->quark, &task_flags,
tempnn, tempmm, A.mb,
1.0, A(n, m), A.mb,
1.0, A(n, n), A.mb);
for (k = n+1; k < m; k++){
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
/* Update of A(n, k) from A(n, m) and A(k, m). NOTE(review): call head missing; presumably QUARK_CORE_dgemm( -- confirm. */
plasma->quark, &task_flags,
tempnn, tempkm, tempmm, A.mb,
zone, A(n, m), A.mb,
A(k, m), A.mb,
zone, A(n, k), A.mb);
}
}
for (n = 0; n < m; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Update of A(n, m) with the diagonal tile A(m, m). NOTE(review): call head missing; presumably QUARK_CORE_dtrmm( -- confirm. */
plasma->quark, &task_flags,
tempnn, tempmm, A.mb,
zone, A(m, m), ldam,
A(n, m), A.mb);
}
/* Task on the diagonal tile A(m, m). NOTE(review): call head missing; presumably QUARK_CORE_dlauum( -- confirm. */
plasma->quark, &task_flags,
tempmm,
A.mb, A(m, m), ldam);
}
}
}

Here is the call graph for this function:

void plasma_pdorgbr_quark ( PLASMA_enum  side,
PLASMA_desc  A,
PLASMA_desc  O,
PLASMA_desc  T,
PLASMA_sequence *sequence,
PLASMA_request *request
)
void plasma_pdorgbrrh_quark ( PLASMA_enum  side,
PLASMA_desc  A,
PLASMA_desc  O,
PLASMA_desc  T,
PLASMA_sequence *sequence,
PLASMA_request *request
)
void plasma_pdorglq ( plasma_context_t *plasma)

Here is the caller graph for this function:

void plasma_pdorglq_quark ( PLASMA_desc  A,
PLASMA_desc  Q,
PLASMA_desc  T,
PLASMA_sequence *sequence,
PLASMA_request *request
)

Parallel construction of Q using tile V (application to identity) - dynamic scheduling

Definition at line 25 of file pdorglq.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaNoTrans, PlasmaRight, Q, plasma_context_struct::quark, QUARK_CORE_dormlq(), QUARK_CORE_dtsmlq(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

/*
 * Body of plasma_pdorglq_quark as rendered in this listing.
 * Iterates k downward from min(A.mt, A.nt)-1 to 0; for each k it first visits
 * tile columns n of Q from Q.nt-1 down to k+1 (all tile rows m), then visits
 * column k itself for all tile rows m.
 * NOTE(review): the declarations of `plasma` and `task_flags` and the callee
 * names opening each argument list are absent from this listing. The
 * References list names QUARK_CORE_dtsmlq and QUARK_CORE_dormlq; confirm the
 * mapping against pdorglq.c.
 */
{
int k, m, n;
int ldak, ldqm;
int tempnn, tempmm, tempkmin, tempkn;
int tempAkm, tempAkn;
int ib;
plasma = plasma_context_self();
/* Nothing is submitted when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Store the sequence's quark_sequence under TASK_SEQUENCE in task_flags. */
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = min(A.mt, A.nt)-1; k >= 0; k--) {
/* Extents of tile (k, k) of A: remainder on the last tile row/column,
   full tile size otherwise; tempkmin is the smaller of the two. */
tempAkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempAkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempkmin = min( tempAkn, tempAkm );
/* Same remainder rule applied to tile column k of Q. */
tempkn = k == Q.nt-1 ? Q.n-k*Q.nb : Q.nb;
ldak = BLKLDD(A, k);
for (n = Q.nt-1; n > k; n--) {
tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
for (m = 0; m < Q.mt; m++) {
tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
ldqm = BLKLDD(Q, m);
/* NOTE(review): callee name missing; arguments pair Q(m, k) with Q(m, n)
   and use A(k, n), T(k, n) - presumably QUARK_CORE_dtsmlq, confirm. */
plasma->quark, &task_flags,
tempmm, Q.nb, tempmm, tempnn, tempAkm, ib, T.nb,
Q(m, k), ldqm,
Q(m, n), ldqm,
A(k, n), ldak,
T(k, n), T.mb);
}
}
for (m = 0; m < Q.mt; m++) {
tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
ldqm = BLKLDD(Q, m);
/* NOTE(review): callee name missing; arguments use the diagonal tiles
   A(k, k), T(k, k) and Q(m, k) - presumably QUARK_CORE_dormlq, confirm. */
plasma->quark, &task_flags,
tempmm, tempkn, tempkmin, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
Q(m, k), ldqm);
}
}
}

Here is the call graph for this function:

void plasma_pdorglqrh_quark ( PLASMA_desc  A,
PLASMA_desc  Q,
PLASMA_desc  T,
int  BS,
PLASMA_sequence *sequence,
PLASMA_request *request
)

Parallel construction of Q using tile V (application to identity; reduction Householder) - dynamic scheduling

Definition at line 25 of file pdorglqrh.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaNoTrans, PlasmaRight, Q, plasma_context_struct::quark, QUARK_CORE_dormlq(), QUARK_CORE_dtsmlq(), QUARK_CORE_dttmlq(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, T2, and TASK_SEQUENCE.

/*
 * Body of plasma_pdorglqrh_quark as rendered in this listing.
 * For each k from min(A.mt, A.nt)-1 down to 0 it runs two phases:
 *   1. a pass over distances RD, from the largest value reached by doubling
 *      BS while RD < A.nt-k down to BS by halving, pairing tile columns N and
 *      N+RD;
 *   2. a pass over groups of BS tile columns starting at N = k, handling the
 *      columns n inside each group in descending order and then column N.
 * NOTE(review): the declarations of `plasma` and `task_flags` and the callee
 * names opening each argument list are absent from this listing. The
 * References list names QUARK_CORE_dttmlq, QUARK_CORE_dtsmlq and
 * QUARK_CORE_dormlq; confirm the mapping against pdorglqrh.c.
 */
{
int k, m, n;
int K, N, RD, lastRD;
int ldak;
int ldqm;
int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn;
int ib;
plasma = plasma_context_self();
/* Nothing is submitted when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Store the sequence's quark_sequence under TASK_SEQUENCE in task_flags. */
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
K = min(A.mt, A.nt);
for (k = K-1; k >= 0; k--) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
/* lastRD ends up as the last RD visited by the doubling loop, or 0 when
   BS >= A.nt-k (in which case the halving loop below does not run for
   BS > 0). */
lastRD = 0;
for (RD = BS; RD < A.nt-k; RD *= 2)
lastRD = RD;
for (RD = lastRD; RD >= BS; RD /= 2) {
for (N = k; N+RD < A.nt; N += 2*RD) {
tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
for (m = 0; m < Q.mt; m++) {
tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
ldqm = BLKLDD(Q, m );
/* NOTE(review): callee name missing; this is the only argument list
   using T2 - presumably QUARK_CORE_dttmlq, confirm. */
plasma->quark, &task_flags,
tempmm, Q.nb, tempmm, tempNRDn,
tempkm, ib, T.nb,
Q (m, N ), ldqm,
Q (m, N+RD), ldqm,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
}
}
}
for (N = k; N < A.nt; N += BS) {
tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
tempkmin = min(tempkm, tempNn);
/* Columns of the current group other than N, highest index first. */
for (n = min(N+BS, A.nt)-1; n > N; n--) {
tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
for (m = 0; m < Q.mt; m++) {
tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
ldqm = BLKLDD(Q, m);
/* NOTE(review): callee name missing; pairs Q(m, N) with Q(m, n) using
   A(k, n), T(k, n) - presumably QUARK_CORE_dtsmlq, confirm. */
plasma->quark, &task_flags,
tempmm, Q.nb, tempmm, tempnn,
tempkm, ib, T.nb,
Q(m, N), ldqm,
Q(m, n), ldqm,
A(k, n), ldak,
T(k, n), T.mb);
}
}
for (m = 0; m < Q.mt; m++) {
tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
ldqm = BLKLDD(Q, m);
/* NOTE(review): callee name missing; uses A(k, N), T(k, N) and Q(m, N)
   - presumably QUARK_CORE_dormlq, confirm. */
plasma->quark, &task_flags,
tempmm, tempNn,
tempkmin, ib, T.nb,
A(k, N), ldak,
T(k, N), T.mb,
Q(m, N), ldqm);
}
}
}
}

Here is the call graph for this function:

void plasma_pdorgqr ( plasma_context_t *plasma)

Here is the caller graph for this function:

void plasma_pdorgqr_quark ( PLASMA_desc  A,
PLASMA_desc  Q,
PLASMA_desc  T,
PLASMA_sequence *sequence,
PLASMA_request *request
)

Parallel construction of Q using tile V (application to identity) - dynamic scheduling

Definition at line 25 of file pdorgqr.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaLeft, PlasmaNoTrans, Q, plasma_context_struct::quark, QUARK_CORE_dormqr(), QUARK_CORE_dtsmqr(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

/*
 * Body of plasma_pdorgqr_quark as rendered in this listing.
 * Iterates k downward from min(A.mt, A.nt)-1 to 0; for each k it first visits
 * tile rows m of Q from Q.mt-1 down to k+1 (all tile columns n), then visits
 * tile row k itself for all tile columns n.
 * NOTE(review): the declarations of `plasma` and `task_flags` and the callee
 * names opening each argument list are absent from this listing. The
 * References list names QUARK_CORE_dtsmqr and QUARK_CORE_dormqr; confirm the
 * mapping against pdorgqr.c.
 */
{
int k, m, n;
int ldak, ldqk, ldam, ldqm;
int tempmm, tempnn, tempkmin, tempkm;
int tempAkm, tempAkn;
int ib;
plasma = plasma_context_self();
/* Nothing is submitted when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Store the sequence's quark_sequence under TASK_SEQUENCE in task_flags. */
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = min(A.mt, A.nt)-1; k >= 0; k--) {
/* Extents of tile (k, k) of A: remainder on the last tile row/column,
   full tile size otherwise; tempkmin is the smaller of the two. */
tempAkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempAkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempkmin = min( tempAkn, tempAkm );
/* Same remainder rule applied to tile row k of Q. */
tempkm = k == Q.mt-1 ? Q.m-k*Q.mb : Q.mb;
ldak = BLKLDD(A, k);
ldqk = BLKLDD(Q, k);
for (m = Q.mt - 1; m > k; m--) {
tempmm = m == Q.mt-1 ? Q.m-m*Q.mb : Q.mb;
ldam = BLKLDD(A, m);
ldqm = BLKLDD(Q, m);
for (n = 0; n < Q.nt; n++) {
tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
/* NOTE(review): callee name missing; pairs Q(k, n) with Q(m, n) using
   A(m, k), T(m, k) - presumably QUARK_CORE_dtsmqr, confirm. */
plasma->quark, &task_flags,
Q.mb, tempnn, tempmm, tempnn, tempAkn, ib, T.nb,
Q(k, n), ldqk,
Q(m, n), ldqm,
A(m, k), ldam,
T(m, k), T.mb);
}
}
for (n = 0; n < Q.nt; n++) {
tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
/* NOTE(review): callee name missing; uses the diagonal tiles A(k, k),
   T(k, k) and Q(k, n) - presumably QUARK_CORE_dormqr, confirm. */
plasma->quark, &task_flags,
tempkm, tempnn, tempkmin, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
Q(k, n), ldqk);
}
}
}

Here is the call graph for this function:

void plasma_pdorgqrrh ( plasma_context_t *plasma)

Here is the caller graph for this function:

void plasma_pdorgqrrh_quark ( PLASMA_desc  A,
PLASMA_desc  Q,
PLASMA_desc  T,
int  BS,
PLASMA_sequence *sequence,
PLASMA_request *request
)

Parallel construction of Q using tile V (application to identity; reduction Householder) - dynamic scheduling

Definition at line 27 of file pdorgqrrh.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaLeft, PlasmaNoTrans, Q, plasma_context_struct::quark, QUARK_CORE_dormqr(), QUARK_CORE_dtsmqr(), QUARK_CORE_dttmqr(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, T2, and TASK_SEQUENCE.

/*
 * Body of plasma_pdorgqrrh_quark as rendered in this listing.
 * For each k from min(A.mt, A.nt)-1 down to 0 it runs two phases:
 *   1. a pass over distances RD, from the largest value reached by doubling
 *      BS while RD < A.mt-k down to BS by halving, pairing tile rows M and
 *      M+RD;
 *   2. a pass over groups of BS tile rows starting at M = k, handling the
 *      rows m inside each group in descending order and then row M.
 * NOTE(review): the declarations of `plasma` and `task_flags` and the callee
 * names opening each argument list are absent from this listing. The
 * References list names QUARK_CORE_dttmqr, QUARK_CORE_dtsmqr and
 * QUARK_CORE_dormqr; confirm the mapping against pdorgqrrh.c.
 */
{
int k, m, n;
int K, M, RD, lastRD;
int ldaM, ldam, ldaMRD;
/* The ldb* variables are filled from BLKLDD(Q, ...) below. */
int ldbM, ldbm, ldbMRD;
int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin;
int ib;
plasma = plasma_context_self();
/* Nothing is submitted when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Store the sequence's quark_sequence under TASK_SEQUENCE in task_flags. */
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
K = min(A.mt, A.nt);
for (k = K-1; k >= 0; k--) {
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* lastRD ends up as the last RD visited by the doubling loop, or 0 when
   BS >= A.mt-k. */
lastRD = 0;
for (RD = BS; RD < A.mt-k; RD *= 2)
lastRD = RD;
for (RD = lastRD; RD >= BS; RD /= 2) {
for (M = k; M+RD < A.mt; M += 2*RD) {
tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
ldbM = BLKLDD(Q, M );
ldbMRD = BLKLDD(Q, M+RD);
ldaMRD = BLKLDD(A, M+RD);
for (n = 0; n < Q.nt; n++) {
tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
/* NOTE(review): callee name missing; this is the only argument list
   using T2 - presumably QUARK_CORE_dttmqr, confirm. */
plasma->quark, &task_flags,
A.nb, tempnn, tempMRDm, tempnn,
tempkn, ib, T.nb,
Q (M, n), ldbM,
Q (M+RD, n), ldbMRD,
A (M+RD, k), ldaMRD,
T2(M+RD, k), T.mb);
}
}
}
for (M = k; M < A.mt; M += BS) {
tempMm = M == A.mt-1 ? A.m-M*A.mb : A.mb;
tempkmin = min(tempMm, tempkn);
ldaM = BLKLDD(A, M);
ldbM = BLKLDD(Q, M);
/* Rows of the current group other than M, highest index first. */
for (m = min(M+BS, A.mt)-1; m > M; m--) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldbm = BLKLDD(Q, m);
ldam = BLKLDD(A, m);
for (n = 0; n < Q.nt; n++) {
tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
/* NOTE(review): callee name missing; pairs Q(M, n) with Q(m, n) using
   A(m, k), T(m, k) - presumably QUARK_CORE_dtsmqr, confirm. */
plasma->quark, &task_flags,
A.nb, tempnn, tempmm, tempnn,
tempkn, ib, T.nb,
Q(M, n), ldbM,
Q(m, n), ldbm,
A(m, k), ldam,
T(m, k), T.mb);
}
}
for (n = 0; n < Q.nt; n++) {
tempnn = n == Q.nt-1 ? Q.n-n*Q.nb : Q.nb;
/* NOTE(review): callee name missing; uses A(M, k), T(M, k) and Q(M, n)
   - presumably QUARK_CORE_dormqr, confirm. */
plasma->quark, &task_flags,
tempMm, tempnn,
tempkmin, ib, T.nb,
A(M, k), ldaM,
T(M, k), T.mb,
Q(M, n), ldbM);
}
}
}
}

Here is the call graph for this function:

void plasma_pdorgtr_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
PLASMA_desc  Q,
PLASMA_desc  T,
PLASMA_sequence *sequence,
PLASMA_request *request
)
void plasma_pdormlq ( plasma_context_t *plasma)

Parallel application of Q using tile V - LQ factorization - static scheduling

Definition at line 26 of file pdormlq.c.

References A, B, BLKLDD, CORE_dormlq(), CORE_dtsmlq(), plasma_desc_t::dtyp, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_ERR_NOT_SUPPORTED, PLASMA_IB, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, plasma_request_fail(), PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_7, PlasmaLeft, PlasmaTrans, side, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, plasma_sequence_t::status, T, and trans.

/*
 * Body of plasma_pdormlq (static scheduling) as rendered in this listing.
 * Unpacks its seven arguments, returns early unless side is PlasmaLeft and
 * trans is PlasmaTrans, allocates a work buffer, then walks (k, n, m) triples:
 * m runs from B.mt-1 down to k for a fixed (k, n); n then advances by
 * PLASMA_SIZE, and k decreases each time n wraps past B.nt.
 * NOTE(review): several lines appear to be missing from this listing - the
 * declarations of side, trans, A, B and T, the initial assignment of `n`
 * (read below before any visible write; the References list mentions
 * PLASMA_RANK), whatever precedes the two bare `return`s (References lists
 * plasma_request_fail and PLASMA_ERR_NOT_SUPPORTED), the callee names of the
 * two argument lists starting with `side, trans,` (References lists
 * CORE_dormlq and CORE_dtsmlq) and a call to ss_finalize. Confirm against
 * pdormlq.c.
 */
{
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
/* (next_k, next_n, next_m): the triple to process on the following pass. */
int next_k;
int next_m;
int next_n;
int ldak, ldbk, ldbm;
int tempmm, tempnn, tempkm, tempkmin;
int minMT, minM;
int ib = PLASMA_IB;
double *work;
plasma_unpack_args_7(side, trans, A, B, T, sequence, request);
/* Nothing is done when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Only the PlasmaLeft / PlasmaTrans combination proceeds past this point. */
if (side != PlasmaLeft) {
return;
}
if (trans != PlasmaTrans) {
return;
}
/* Work buffer of ib*T.nb elements of type T.dtyp; released at the end. */
work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
ss_init(B.mt, B.nt, min(A.mt, A.nt));
/* minM is the smaller of A.m and A.n, minMT the matching tile count. */
if (A.m > A.n) {
minM = A.n;
minMT = A.nt;
} else {
minM = A.m;
minMT = A.mt;
}
k = minMT-1;
/* Bring n below B.nt, stepping k down once per B.nt subtracted. */
while (n >= B.nt) {
k--;
n = n-B.nt;
}
m = B.mt-1;
while (k >= 0 && n < B.nt) {
next_n = n;
next_m = m;
next_k = k;
/* Advance to the next m; once m would drop below k, move n on by
   PLASMA_SIZE (wrapping into the next lower k as needed) and restart m
   from the last tile row. */
next_m--;
if (next_m == k-1) {
next_n += PLASMA_SIZE;
while (next_n >= B.nt && next_k >= 0) {
next_k--;
next_n = next_n-B.nt;
}
next_m = B.mt-1;
}
/* Tile extents: remainder on the last tile, full tile size otherwise. */
tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb;
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
ldbm = BLKLDD(B, m);
if (m == k) {
/* Diagonal step: uses A(k, k), T(k, k) and B(k, n), then records progress
   k for (k, n). NOTE(review): callee name missing - presumably
   CORE_dormlq, confirm. */
side, trans,
tempkm, tempnn, tempkmin, ib,
A(k, k), ldak,
T(k, k), T.mb,
B(k, n), ldbk,
work, T.nb);
ss_cond_set(k, n, k);
}
else {
/* Off-diagonal step: waits on (m, n) reaching k+1 first, pairs B(k, n)
   with B(m, n), then records progress k for (m, n). NOTE(review): callee
   name missing - presumably CORE_dtsmlq, confirm. */
ss_cond_wait(m, n, k+1);
side, trans,
A.mb, tempnn, tempmm, tempnn, tempkmin, ib,
B(k, n), ldbk,
B(m, n), ldbm,
A(k, m), ldak,
T(k, m), T.mb,
work, ib);
ss_cond_set(m, n, k);
}
m = next_m;
n = next_n;
k = next_k;
}
plasma_private_free(plasma, work);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdormlq_quark ( PLASMA_enum  side,
PLASMA_enum  trans,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_desc  T,
PLASMA_sequence *sequence,
PLASMA_request *request
)

Parallel application of Q using tile V - LQ factorization - dynamic scheduling

Definition at line 135 of file pdormlq.c.

References A, B, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaLeft, PlasmaNoTrans, plasma_context_struct::quark, QUARK_CORE_dormlq(), QUARK_CORE_dtsmlq(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

/*
 * Body of plasma_pdormlq_quark as rendered in this listing.
 * Dispatches on (side, trans) into four loop nests over k in [0, minMT):
 *   Left/NoTrans  - k ascending, row k of B first, then rows m > k;
 *   Left/Trans    - k descending, rows m > k (descending) first, then row k;
 *   Right/NoTrans - k descending, columns n > k (descending) first, then
 *                   column k;
 *   Right/Trans   - k ascending, column k first, then columns n > k.
 * NOTE(review): the declarations of `plasma` and `task_flags` and the callee
 * names (plus leading side/trans arguments, if any) opening each argument
 * list are absent from this listing. The References list names
 * QUARK_CORE_dormlq and QUARK_CORE_dtsmlq; confirm the mapping against
 * pdormlq.c.
 */
{
int k, m, n;
int ldak, ldbk, ldbm;
int tempmm, tempnn, tempkn, tempkm, tempkmin;
int ib, minMT, minM;
plasma = plasma_context_self();
/* Nothing is submitted when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Store the sequence's quark_sequence under TASK_SEQUENCE in task_flags. */
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
/* minM is the smaller of A.m and A.n, minMT the matching tile count. */
if (A.m > A.n) {
minM = A.n;
minMT = A.nt;
} else {
minM = A.m;
minMT = A.mt;
}
if (side == PlasmaLeft ) {
if (trans == PlasmaNoTrans) {
/*
* PlasmaLeft / PlasmaNoTrans
*/
for (k = 0; k < minMT; k++) {
tempkm = k == B.mt -1 ? B.m -k*B.mb : B.mb;
tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Diagonal step on B(k, n) using A(k, k), T(k, k); callee name missing
   (presumably QUARK_CORE_dormlq, confirm). */
plasma->quark, &task_flags,
tempkm, tempnn, tempkmin, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
B(k, n), ldbk);
}
for (m = k+1; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Pairs B(k, n) with B(m, n) using A(k, m), T(k, m); callee name missing
   (presumably QUARK_CORE_dtsmlq, confirm). */
plasma->quark, &task_flags,
B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb,
B(k, n), ldbk,
B(m, n), ldbm,
A(k, m), ldak,
T(k, m), T.mb);
}
}
}
}
else {
/*
* PlasmaLeft / PlasmaTrans
*/
for (k = minMT-1; k >= 0; k--) {
tempkm = k == B.mt -1 ? B.m -k*B.mb : B.mb;
tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
for (m = B.mt-1; m > k; m--) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Same argument list as the NoTrans pairing step above; callee name
   missing. */
plasma->quark, &task_flags,
B.mb, tempnn, tempmm, tempnn, tempkmin, ib, T.nb,
B(k, n), ldbk,
B(m, n), ldbm,
A(k, m), ldak,
T(k, m), T.mb);
}
}
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Same argument list as the NoTrans diagonal step above; callee name
   missing. */
plasma->quark, &task_flags,
tempkm, tempnn, tempkmin, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
B(k, n), ldbk);
}
}
}
}
else {
if (trans == PlasmaNoTrans) {
/*
* PlasmaRight / PlasmaNoTrans
*/
for (k = minMT-1; k >= 0; k--) {
tempkn = k == B.nt -1 ? B.n -k*B.nb : B.nb;
tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
for (n = B.nt-1; n > k; n--) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
/* Pairs B(m, k) with B(m, n) using A(k, n), T(k, n); callee name
   missing. */
plasma->quark, &task_flags,
tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb,
B(m, k), ldbm,
B(m, n), ldbm,
A(k, n), ldak,
T(k, n), T.mb);
}
}
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
/* Diagonal step on B(m, k) using A(k, k), T(k, k); callee name missing. */
plasma->quark, &task_flags,
tempmm, tempkn, tempkmin, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
B(m, k), ldbm);
}
}
}
else {
/*
* PlasmaRight / PlasmaTrans
*/
for (k = 0; k < minMT; k++) {
tempkn = k == B.nt -1 ? B.n -k*B.nb : B.nb;
/* NOTE(review): this branch computes tempkmin from A.mb whereas the other
   three branches use A.nb - confirm whether the difference is intended. */
tempkmin = k == minMT-1 ? minM-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
/* Diagonal step on B(m, k) using A(k, k), T(k, k); callee name missing. */
plasma->quark, &task_flags,
tempmm, tempkn, tempkmin, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
B(m, k), ldbm);
}
for (n = k+1; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
/* Pairs B(m, k) with B(m, n) using A(k, n), T(k, n); callee name
   missing. */
plasma->quark, &task_flags,
tempmm, B.nb, tempmm, tempnn, tempkmin, ib, T.nb,
B(m, k), ldbm,
B(m, n), ldbm,
A(k, n), ldak,
T(k, n), T.mb);
}
}
}
}
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdormlqrh_quark ( PLASMA_enum  side,
PLASMA_enum  trans,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_desc  T,
int  BS,
PLASMA_sequence *sequence,
PLASMA_request *request
)

Parallel application of Q using tile V - LQ factorization (reduction Householder) - dynamic scheduling

Definition at line 27 of file pdormlqrh.c.

References A, B, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaLeft, PlasmaNoTrans, plasma_context_struct::quark, QUARK_CORE_dormlq(), QUARK_CORE_dtsmlq(), QUARK_CORE_dttmlq(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, T2, and TASK_SEQUENCE.

/*
 * Body of plasma_pdormlqrh_quark as rendered in this listing.
 * Dispatches on (side, trans) into four loop nests over k in [0, K) with
 * K = min(A.mt, A.nt). Each nest combines two phases:
 *   - a "group" phase over blocks of BS tile indices starting at N = k,
 *     using T(k, .);
 *   - a "distance" phase over RD = BS, 2*BS, ... (while RD < A.nt-k) pairing
 *     indices N and N+RD, using T2(k, N+RD).
 * The ascending-k branches (Left/NoTrans, Right/Trans) run the group phase
 * first with RD increasing; the descending-k branches (Left/Trans,
 * Right/NoTrans) run the distance phase first with RD decreasing from the
 * largest value reached, then the group phase.
 * NOTE(review): the declarations of `plasma` and `task_flags` and the callee
 * names (plus leading side/trans arguments, if any) opening each argument
 * list are absent from this listing. The References list names
 * QUARK_CORE_dormlq, QUARK_CORE_dtsmlq and QUARK_CORE_dttmlq; confirm the
 * mapping against pdormlqrh.c.
 */
{
int k, m, n;
int K, N, RD, lastRD;
int ldaN, ldak;
int ldbN, ldbm, ldbNRD;
int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin;
int ib;
plasma = plasma_context_self();
/* Nothing is submitted when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Store the sequence's quark_sequence under TASK_SEQUENCE in task_flags. */
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
K = min(A.mt, A.nt);
if (side == PlasmaLeft ) {
if (trans == PlasmaNoTrans) {
/*
* PlasmaLeft / PlasmaNoTrans
*/
for (k = 0; k < K; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
for (N = k; N < A.nt; N += BS) {
tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
tempkmin = min(tempkm,tempNn);
/* ldaN is assigned here but the argument lists of this branch pass ldak. */
ldaN = BLKLDD(A, N);
ldbN = BLKLDD(B, N);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Head-of-group step on B(N, n) using A(k, N), T(k, N); callee name
   missing (presumably QUARK_CORE_dormlq, confirm). */
plasma->quark, &task_flags,
tempNn, tempnn,
tempkmin, ib, T.nb,
A(k, N), ldak,
T(k, N), T.mb,
B(N, n), ldbN);
}
for (m = N+1; m < min(N+BS, A.nt); m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Pairs B(N, n) with B(m, n) using A(k, m), T(k, m); callee name missing
   (presumably QUARK_CORE_dtsmlq, confirm). */
plasma->quark, &task_flags,
B.nb, tempnn, tempmm, tempnn,
tempkm, ib, T.nb,
B(N, n), ldbN,
B(m, n), ldbm,
A(k, m), ldak,
T(k, m), T.mb);
}
}
}
for (RD = BS; RD < A.nt-k; RD *= 2) {
for (N = k; N+RD < A.nt; N += 2*RD) {
tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
ldbN = BLKLDD(B, N );
ldbNRD = BLKLDD(B, N+RD);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Pairs B(N, n) with B(N+RD, n) using T2; callee name missing
   (presumably QUARK_CORE_dttmlq, confirm).
   NOTE(review): first argument is B.mb here but B.nb in the matching
   Left/Trans list below - confirm. */
plasma->quark, &task_flags,
B.mb, tempnn, tempNRDn, tempnn,
tempkm, ib, T.nb,
B (N, n), ldbN,
B (N+RD, n), ldbNRD,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
}
}
}
}
} else {
/*
* PlasmaLeft / PlasmaTrans
*/
for (k = K-1; k >= 0; k--) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
/* lastRD: last RD visited by the doubling loop, or 0 if it never runs. */
lastRD = 0;
for (RD = BS; RD < A.nt-k; RD *= 2)
lastRD = RD;
for (RD = lastRD; RD >= BS; RD /= 2) {
for (N = k; N+RD < A.nt; N += 2*RD) {
tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
ldbN = BLKLDD(B, N );
ldbNRD = BLKLDD(B, N+RD);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Pairs B(N, n) with B(N+RD, n) using T2; callee name missing. */
plasma->quark, &task_flags,
B.nb, tempnn, tempNRDn, tempnn,
tempkm, ib, T.nb,
B (N, n), ldbN,
B (N+RD, n), ldbNRD,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
}
}
}
for (N = k; N < A.nt; N += BS) {
tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
tempkmin = min(tempkm,tempNn);
ldaN = BLKLDD(A, N);
ldbN = BLKLDD(B, N);
for (m = min(N+BS, A.nt)-1; m > N; m--) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Pairs B(N, n) with B(m, n) using A(k, m), T(k, m); callee name
   missing. */
plasma->quark, &task_flags,
B.mb, tempnn, tempmm, tempnn,
tempkm, ib, T.nb,
B(N, n), ldbN,
B(m, n), ldbm,
A(k, m), ldak,
T(k, m), T.mb);
}
}
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
/* Head-of-group step on B(N, n) using A(k, N), T(k, N); callee name
   missing. */
plasma->quark, &task_flags,
tempNn, tempnn,
tempkmin, ib, T.nb,
A(k, N), ldak,
T(k, N), T.mb,
B(N, n), ldbN);
}
}
}
}
} else {
if (trans == PlasmaNoTrans) {
/*
* PlasmaRight / PlasmaNoTrans
*/
for (k = K-1; k >= 0; k--) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
/* lastRD: last RD visited by the doubling loop, or 0 if it never runs. */
lastRD = 0;
for (RD = BS; RD < A.nt-k; RD *= 2)
lastRD = RD;
for (RD = lastRD; RD >= BS; RD /= 2) {
for (N = k; N+RD < A.nt; N += 2*RD) {
tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
for (m = 0; m < B.mt; m++) {
ldbm = BLKLDD(B, m);
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
/* Pairs B(m, N) with B(m, N+RD) using T2; callee name missing. */
plasma->quark, &task_flags,
tempmm, B.nb, tempmm, tempNRDn,
tempkm, ib, T.nb,
B (m, N ), ldbm,
B (m, N+RD), ldbm,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
}
}
}
for (N = k; N < A.nt; N += BS) {
tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
tempkmin = min(tempkm,tempNn);
for (n = min(N+BS, A.nt)-1; n > N; n--) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
/* Pairs B(m, N) with B(m, n) using A(k, n), T(k, n); callee name
   missing. */
plasma->quark, &task_flags,
tempmm, B.nb, tempmm, tempnn,
tempkm, ib, T.nb,
B(m, N), ldbm,
B(m, n), ldbm,
A(k, n), ldak,
T(k, n), T.mb);
}
}
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
/* Head-of-group step on B(m, N) using A(k, N), T(k, N); callee name
   missing. */
plasma->quark, &task_flags,
tempmm, tempNn,
tempkmin, ib, T.nb,
A(k, N), ldak,
T(k, N), T.mb,
B(m, N), ldbm);
}
}
}
} else {
/*
* PlasmaRight / PlasmaTrans
*/
for (k = 0; k < K; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
for (N = k; N < A.nt; N += BS) {
tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
tempkmin = min(tempkm,tempNn);
ldaN = BLKLDD(A, N);
for (m = 0; m < B.mt; m++) {
ldbm = BLKLDD(B, m);
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
/* Head-of-group step on B(m, N) using A(k, N), T(k, N); callee name
   missing.
   NOTE(review): A(k, N) is passed with ldaN here, whereas the other three
   branches pass ldak with the same tile - confirm which is intended. */
plasma->quark, &task_flags,
tempmm, tempNn,
tempkmin, ib, T.nb,
A(k, N), ldaN,
T(k, N), T.mb,
B(m, N), ldbm);
}
for (n = N+1; n < min(N+BS, A.nt); n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
/* Pairs B(m, N) with B(m, n) using A(k, n), T(k, n); callee name missing.
   NOTE(review): second argument is tempNn here but B.nb in the matching
   Right/NoTrans list - confirm. */
plasma->quark, &task_flags,
tempmm, tempNn, tempmm, tempnn,
tempkm, ib, T.nb,
B(m, N), ldbm,
B(m, n), ldbm,
A(k, n), ldak,
T(k, n), T.mb);
}
}
}
for (RD = BS; RD < A.nt-k; RD *= 2) {
for (N = k; N+RD < A.nt; N += 2*RD) {
tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldbm = BLKLDD(B, m);
/* Pairs B(m, N) with B(m, N+RD) using T2; callee name missing. */
plasma->quark, &task_flags,
tempmm, B.nb, tempmm, tempNRDn,
tempkm, ib, T.nb,
B (m, N ), ldbm,
B (m, N+RD), ldbm,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
}
}
}
}
}
}
}

Here is the call graph for this function:

void plasma_pdormqr ( plasma_context_t *plasma)

Parallel application of Q using tile V - QR factorization - static scheduling

Definition at line 26 of file pdormqr.c.

References A, B, BLKLDD, CORE_dormqr(), CORE_dtsmqr(), plasma_desc_t::dtyp, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_ERR_NOT_SUPPORTED, PLASMA_IB, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, plasma_request_fail(), PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_7, PlasmaLeft, PlasmaTrans, side, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, plasma_sequence_t::status, T, and trans.

/*
 * Body of plasma_pdormqr (static scheduling) as rendered in this listing.
 * Unpacks its seven arguments, returns early unless side is PlasmaLeft and
 * trans is PlasmaTrans, allocates a work buffer, then walks (k, n, m) triples:
 * m runs from k up to A.mt-1 for a fixed (k, n); n then advances by
 * PLASMA_SIZE, and k increases each time n wraps past B.nt.
 * NOTE(review): several lines appear to be missing from this listing - the
 * declarations of side, trans, A, B and T, the initial assignment of `n`
 * (read below before any visible write; the References list mentions
 * PLASMA_RANK), whatever precedes the two bare `return`s (References lists
 * plasma_request_fail and PLASMA_ERR_NOT_SUPPORTED), the callee names of the
 * two argument lists starting with `side, trans,` (References lists
 * CORE_dormqr and CORE_dtsmqr) and a call to ss_finalize. Confirm against
 * pdormqr.c.
 */
{
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
/* (next_k, next_n, next_m): the triple to process on the following pass. */
int next_k;
int next_m;
int next_n;
int ldak, ldbk, ldam, ldbm;
int tempkm, tempnn, tempkmin, tempmm;
int minMT, minM;
int ib = PLASMA_IB;
double *work;
plasma_unpack_args_7(side, trans, A, B, T, sequence, request);
/* Nothing is done when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Only the PlasmaLeft / PlasmaTrans combination proceeds past this point. */
if (side != PlasmaLeft) {
return;
}
if (trans != PlasmaTrans) {
return;
}
/* Work buffer of ib*T.nb elements of type T.dtyp; released at the end. */
work = (double*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
ss_init(B.mt, B.nt, -1);
/* minM is the smaller of A.m and A.n, minMT the matching tile count. */
if (A.m > A.n) {
minM = A.n;
minMT = A.nt;
} else {
minM = A.m;
minMT = A.mt;
}
k = 0;
/* Bring n below B.nt, stepping k up once per B.nt subtracted. */
while (n >= B.nt) {
k++;
n = n-B.nt;
}
m = k;
while (k < minMT && n < B.nt) {
next_n = n;
next_m = m;
next_k = k;
/* Advance to the next m; once m reaches A.mt, move n on by PLASMA_SIZE
   (wrapping into the next higher k as needed) and restart m from that k. */
next_m++;
if (next_m == A.mt) {
next_n += PLASMA_SIZE;
while (next_n >= B.nt && next_k < minMT) {
next_k++;
next_n = next_n-B.nt;
}
next_m = next_k;
}
/* Tile extents: remainder on the last tile, full tile size otherwise. */
tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb;
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
if (m == k) {
/* Diagonal step: waits on (k, n) reaching k-1, uses A(k, k), T(k, k) and
   B(k, n), then records progress k for (k, n). NOTE(review): callee name
   missing - presumably CORE_dormqr, confirm. */
ss_cond_wait(k, n, k-1);
side, trans,
tempkm, tempnn, tempkmin, ib,
A(k, k), ldak,
T(k, k), T.mb,
B(k, n), ldbk,
work, T.nb);
ss_cond_set(k, n, k);
}
else {
/* Off-diagonal step: waits on (m, n) reaching k-1, pairs B(k, n) with
   B(m, n), then records progress k for (m, n). NOTE(review): callee name
   missing - presumably CORE_dtsmqr, confirm. */
ss_cond_wait(m, n, k-1);
side, trans,
A.mb, tempnn, tempmm, tempnn, tempkmin, ib,
B(k, n), ldbk,
B(m, n), ldbm,
A(m, k), ldam,
T(m, k), T.mb,
work, ib);
ss_cond_set(m, n, k);
}
n = next_n;
m = next_m;
k = next_k;
}
plasma_private_free(plasma, work);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pdormqr_quark ( PLASMA_enum  side,
PLASMA_enum  trans,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_desc  T,
PLASMA_sequence *sequence,
PLASMA_request *request
)

Parallel application of Q using tile V - QR factorization - dynamic scheduling

Definition at line 137 of file pdormqr.c.

References A, B, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaLeft, PlasmaTrans, plasma_context_struct::quark, QUARK_CORE_dormqr(), QUARK_CORE_dtsmqr(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

{
int k, m, n;
int ldak, ldbk, ldam, ldan, ldbm;
int tempkm, tempnn, tempkmin, tempmm, tempkn;
int ib, minMT, minM;
plasma = plasma_context_self();
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
if (A.m > A.n) {
minM = A.n;
minMT = A.nt;
} else {
minM = A.m;
minMT = A.mt;
}
/*
* PlasmaLeft / PlasmaTrans
*/
if (side == PlasmaLeft ) {
if (trans == PlasmaTrans) {
for (k = 0; k < minMT; k++) {
tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb;
tempkmin = k == minMT-1 ? minM-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
plasma->quark, &task_flags,
tempkm, tempnn, tempkmin, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
B(k, n), ldbk);
}
for (m = k+1; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.