PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
compute_c.h File Reference
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros

#define plasma_cdesc_alloc(descA, mb, nb, lm, ln, i, j, m, n, free)
#define plasma_cooplap2tile(descA, A, mb, nb, lm, ln, i, j, m, n, free)
#define plasma_ciplap2tile(descA, A, mb, nb, lm, ln, i, j, m, n)
#define plasma_cooptile2lap(descA, A, mb, nb, lm, ln)
#define plasma_ciptile2lap(descA, A, mb, nb, lm, ln)

Functions

void plasma_pcgeadd (plasma_context_t *plasma)
void plasma_pcgelqf (plasma_context_t *plasma)
void plasma_pcgemm (plasma_context_t *plasma)
void plasma_pcgeqrf (plasma_context_t *plasma)
void plasma_pcgerbb (plasma_context_t *plasma)
void plasma_pcgetmi2 (plasma_context_t *plasma)
void plasma_pcgetrf_incpiv (plasma_context_t *plasma)
void plasma_pclacpy (plasma_context_t *plasma)
void plasma_pclag2z (plasma_context_t *plasma)
void plasma_pclange (plasma_context_t *plasma)
void plasma_pclansy (plasma_context_t *plasma)
void plasma_pcpack (plasma_context_t *plasma)
void plasma_pcplghe (plasma_context_t *plasma)
void plasma_pcplgsy (plasma_context_t *plasma)
void plasma_pcplrnt (plasma_context_t *plasma)
void plasma_pcpotrf (plasma_context_t *plasma)
void plasma_pcshift (plasma_context_t *plasma)
void plasma_pcsymm (plasma_context_t *plasma)
void plasma_pcsyrk (plasma_context_t *plasma)
void plasma_pcsyr2k (plasma_context_t *plasma)
void plasma_pctrmm (plasma_context_t *plasma)
void plasma_pctrsm (plasma_context_t *plasma)
void plasma_pctrsmpl (plasma_context_t *plasma)
void plasma_pctrsmrv (plasma_context_t *plasma)
void plasma_pcunglq (plasma_context_t *plasma)
void plasma_pcungqr (plasma_context_t *plasma)
void plasma_pcungqrrh (plasma_context_t *plasma)
void plasma_pcunmlq (plasma_context_t *plasma)
void plasma_pcunmqr (plasma_context_t *plasma)
void plasma_pcunpack (plasma_context_t *plasma)
int plasma_cshift (plasma_context_t *plasma, int m, int n, PLASMA_Complex32_t *A, int nprob, int me, int ne, int L, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgeadd_quark (PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcbarrier_tl2pnl_quark (PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcbarrier_pnl2tl_quark (PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcbarrier_tl2row_quark (PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcbarrier_row2tl_quark (PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgelqf_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgelqfrh_quark (PLASMA_desc A, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgemm_quark (PLASMA_enum transA, PLASMA_enum transB, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_Complex32_t beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgeqrf_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgeqrfrh_quark (PLASMA_desc A, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgerbh_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgerbb_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgerbbrh_quark (PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgetmi2_quark (PLASMA_enum idep, PLASMA_enum odep, PLASMA_enum storev, int m, int n, int mb, int nb, PLASMA_Complex32_t *A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgetrf_incpiv_quark (PLASMA_desc A, PLASMA_desc L, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgetrf_reclap_quark (PLASMA_desc A, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgetrf_rectil_quark (PLASMA_desc A, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pchegst_quark (PLASMA_enum itype, PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcherbt_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcgbrdb_quark (PLASMA_enum uplo, PLASMA_desc A, float *D, float *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pchbrdt_quark (PLASMA_enum uplo, PLASMA_desc A, float *D, float *E, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclacpy_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclag2z_quark (PLASMA_desc A, PLASMA_desc SB, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclange_quark (PLASMA_enum norm, PLASMA_desc A, float *work, float *result, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclansy_quark (PLASMA_enum norm, PLASMA_enum uplo, PLASMA_desc A, float *work, float *result, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclaset_quark (PLASMA_enum uplo, PLASMA_Complex32_t alpha, PLASMA_Complex32_t beta, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclaset2_quark (PLASMA_enum uplo, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclaswp_quark (PLASMA_desc B, int *IPIV, int inc, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclaswpc_quark (PLASMA_desc B, int *IPIV, int inc, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pclauum_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcplghe_quark (float bump, PLASMA_desc A, unsigned long long int seed, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcplgsy_quark (PLASMA_Complex32_t bump, PLASMA_desc A, unsigned long long int seed, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcplrnt_quark (PLASMA_desc A, unsigned long long int seed, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcpotrf_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcshift_quark (int, int, int, PLASMA_Complex32_t *, int *, int, int, PLASMA_sequence *, PLASMA_request *)
void plasma_pcsymm_quark (PLASMA_enum side, PLASMA_enum uplo, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_Complex32_t beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcsyrk_quark (PLASMA_enum uplo, PLASMA_enum trans, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_Complex32_t beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcsyr2k_quark (PLASMA_enum uplo, PLASMA_enum trans, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_Complex32_t beta, PLASMA_desc C, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pctrmm_quark (PLASMA_enum side, PLASMA_enum uplo, PLASMA_enum transA, PLASMA_enum diag, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pctrsm_quark (PLASMA_enum side, PLASMA_enum uplo, PLASMA_enum transA, PLASMA_enum diag, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc B, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pctrsmpl_quark (PLASMA_desc A, PLASMA_desc B, PLASMA_desc L, int *IPIV, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pctrsmrv_quark (PLASMA_enum side, PLASMA_enum uplo, PLASMA_enum transA, PLASMA_enum diag, PLASMA_Complex32_t alpha, PLASMA_desc A, PLASMA_desc W, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pctrtri_quark (PLASMA_enum uplo, PLASMA_enum diag, PLASMA_desc A, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcungbr_quark (PLASMA_enum side, PLASMA_desc A, PLASMA_desc O, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcungbrrh_quark (PLASMA_enum side, PLASMA_desc A, PLASMA_desc O, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcungqr_quark (PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcungqrrh_quark (PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcunglq_quark (PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcunglqrh_quark (PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcungtr_quark (PLASMA_enum uplo, PLASMA_desc A, PLASMA_desc Q, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcunmqr_quark (PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcunmqrrh_quark (PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcunmlq_quark (PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pcunmlqrh_quark (PLASMA_enum side, PLASMA_enum trans, PLASMA_desc A, PLASMA_desc B, PLASMA_desc T, int BS, PLASMA_sequence *sequence, PLASMA_request *request)

Detailed Description

PLASMA auxiliary routines. PLASMA is a software package provided by Univ. of Tennessee, Univ. of California Berkeley and Univ. of Colorado Denver.

Version:
2.4.5
Author:
Jakub Kurzak
Mathieu Faverge
Date:
2010-11-15 c Tue Nov 22 14:35:45 2011

Definition in file compute_c.h.


Macro Definition Documentation

#define plasma_cdesc_alloc (   descA,
  mb,
  nb,
  lm,
  ln,
  i,
  j,
  m,
  n,
  free 
)
Value:
descA = plasma_desc_init( \
PlasmaComplexFloat, (mb), (nb), ((mb)*(nb)), \
(m), (n), (i), (j), (m), (n)); \
if ( plasma_desc_mat_alloc( &(descA) ) ) { \
plasma_error( __func__, "plasma_shared_alloc() failed"); \
{free;}; \
}

Macro for matrix conversion / Lapack interface

Definition at line 20 of file compute_c.h.

#define plasma_ciplap2tile (   descA,
  A,
  mb,
  nb,
  lm,
  ln,
  i,
  j,
  m,
  n 
)
Value:
descA = plasma_desc_init( \
PlasmaComplexFloat, (mb), (nb), ((mb)*(nb)), \
(lm), (ln), (i), (j), (m), (n)); \
descA.mat = A; \
PLASMA_cgecfi_Async((lm), (ln), (A), PlasmaCM, (mb), (nb), \
PlasmaCCRB, (mb), (nb), sequence, &request);

Definition at line 47 of file compute_c.h.

#define plasma_ciptile2lap (   descA,
  A,
  mb,
  nb,
  lm,
  ln 
)
Value:
PLASMA_cgecfi_Async((lm), (ln), (A), PlasmaCCRB, (mb), (nb), \
PlasmaCM, (mb), (nb), sequence, &request);

Definition at line 65 of file compute_c.h.

#define plasma_cooplap2tile (   descA,
  A,
  mb,
  nb,
  lm,
  ln,
  i,
  j,
  m,
  n,
  free 
)
Value:
descA = plasma_desc_init( \
PlasmaComplexFloat, (mb), (nb), ((mb)*(nb)), \
(lm), (ln), (i), (j), (m), (n)); \
if ( plasma_desc_mat_alloc( &(descA) ) ) { \
plasma_error( __func__, "plasma_shared_alloc() failed"); \
{free;}; \
} \
plasma_parallel_call_5( \
int, (lm), \
PLASMA_desc, (descA), \
PLASMA_sequence*, sequence, \
PLASMA_request*, &request);

Definition at line 30 of file compute_c.h.

#define plasma_cooptile2lap (   descA,
  A,
  mb,
  nb,
  lm,
  ln 
)
Value:

Definition at line 57 of file compute_c.h.


Function Documentation

int plasma_cshift ( plasma_context_t *  plasma,
int  m,
int  n,
PLASMA_Complex32_t *  A,
int  nprob,
int  me,
int  ne,
int  L,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Declarations of internal sequential functions


plasma_cgetmi2: implementation of in-place transposition based on the GKK algorithm by Gustavson, Karlsson, and Kagstrom. This algorithm shifts some cycles to transpose the matrix.

Parameters:
[in] m: Number of rows of matrix A
[in] n: Number of columns of matrix A
[in,out] A: Matrix of size L*m*n
[in] nprob: Number of parallel and independent problems
[in] me: Number of rows of the problem
[in] ne: Number of columns in the problem
[in] L: Size of chunk to use for transformation

Definition at line 60 of file pcshift.c.

References GKK_BalanceLoad(), GKK_getLeaderNbr(), L, minloc(), plasma_dynamic_call_9, PLASMA_ERR_ILLEGAL_VALUE, plasma_error(), PLASMA_GRPSIZE, plasma_pcshift(), plasma_request_fail(), PLASMA_SCHEDULING, plasma_shared_alloc(), plasma_shared_free(), PLASMA_SIZE, plasma_static_call_9, PLASMA_STATIC_SCHEDULING, PLASMA_SUCCESS, and PlasmaInteger.

{
int *leaders = NULL;
int ngrp, thrdbypb, thrdtot, nleaders;
/* Check Plasma context */
thrdtot = PLASMA_SIZE;
thrdbypb = PLASMA_GRPSIZE;
ngrp = thrdtot/thrdbypb;
/* check input */
if( (nprob * me * ne * L) != (m * n) ) {
plasma_error(__func__, "problem size does not match matrix size");
/*printf("m=%d, n=%d, nprob=%d, me=%d, ne=%d, L=%d\n", m, n, nprob, me, ne, L);*/
return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
}
if( thrdbypb > thrdtot ) {
plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads");
return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
}
if( (thrdtot % thrdbypb) != 0 ) {
plasma_error(__func__, "number of thread per problem must divide the total number of thread");
return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
}
/* quick return */
if( (me < 2) || (ne < 2) || (nprob < 1) ) {
}
GKK_getLeaderNbr(me, ne, &nleaders, &leaders);
nleaders *= 3;
int *Tp = NULL;
int i, ipb;
int owner;
Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger);
for (i=0; i<thrdtot; i++)
Tp[i] = 0;
ipb = 0;
/* First part with coarse parallelism */
if (nprob > ngrp) {
ipb = (nprob / ngrp)*ngrp;
/* loop over leader */
if (thrdbypb > 1) {
for (i=0; i<nleaders; i+=3) {
/* assign this cycle to a thread */
owner = minloc(thrdbypb, Tp);
/* assign it to owner */
Tp[owner] = Tp[owner] + leaders[i+1] * L;
leaders[i+2] = owner;
}
GKK_BalanceLoad(thrdbypb, Tp, leaders, nleaders, L);
}
else {
for (i=0; i<nleaders; i+=3) {
Tp[0] = Tp[0] + leaders[i+1] * L;
leaders[i+2] = 0;
}
}
/* shift in parallel */
for (i=0; i< (nprob/ngrp); i++) {
int, me,
int, ne,
int, L,
PLASMA_Complex32_t*, &(A[i*ngrp*me*ne*L]),
int *, leaders,
int, nleaders,
int, thrdbypb,
PLASMA_sequence*, sequence,
PLASMA_request*, request);
}
}
/* Second part with fine parallelism */
if (ipb < nprob) {
for (i=0; i<thrdtot; i++)
Tp[i] = 0;
if (thrdtot > 1) {
/* loop over leader */
for (i=0; i<nleaders; i+=3) {
/* assign this cycle to a thread */
owner = minloc(thrdtot, Tp);
/* assign it to owner */
Tp[owner] = Tp[owner] + leaders[i+1] * L;
leaders[i+2] = owner;
}
GKK_BalanceLoad(thrdtot, Tp, leaders, nleaders, L);
}
else {
for (i=0; i<nleaders; i+=3) {
Tp[0] = Tp[0] + leaders[i+1] * L;
leaders[i+2] = 0;
}
}
/* shift in parallel */
for (i=ipb; i<nprob; i++) {
int, me,
int, ne,
int, L,
PLASMA_Complex32_t*, &(A[i*me*ne*L]),
int *, leaders,
int, nleaders,
int, thrdtot,
PLASMA_sequence*, sequence,
PLASMA_request*, request);
}
}
plasma_shared_free(plasma, Tp);
}
/* Dynamic scheduling */
else {
int, me,
int, ne,
int, L,
int *, leaders,
int, nleaders,
int, nprob,
PLASMA_sequence*, sequence,
PLASMA_request*, request);
}
free(leaders);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcbarrier_pnl2tl_quark ( PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Barrier from panels to tiles

Definition at line 61 of file pcbarrier.c.

References CORE_foo2_quark(), CORE_foo_quark(), INOUT, INPUT, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
int m, n;
plasma = plasma_context_self();
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (n = 0; n < A.nt; n++)
{
/* Protection from previous GATHERV */
QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(0, n), INOUT,
0);
for (m = 0; m < A.mt; m++)
{
QUARK_Insert_Task(plasma->quark, CORE_foo2_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(0, n), INPUT,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, n), INOUT,
0);
}
}
}

Here is the call graph for this function:

void plasma_pcbarrier_row2tl_quark ( PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Barrier from row panels to tiles

Definition at line 128 of file pcbarrier.c.

References CORE_foo2_quark(), CORE_foo_quark(), INOUT, INPUT, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
int m, n;
plasma = plasma_context_self();
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (m = 0; m < A.mt; m++)
{
/* Protection from previous GATHERV */
QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, 0), INOUT,
0);
for (n = 0; n < A.nt; n++)
{
QUARK_Insert_Task(plasma->quark, CORE_foo2_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, 0), INPUT,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, n), INOUT,
0);
}
}
}

Here is the call graph for this function:

void plasma_pcbarrier_tl2pnl_quark ( PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Barrier from tiles to panels

Definition at line 25 of file pcbarrier.c.

References CORE_foo2_quark(), CORE_foo_quark(), GATHERV, INOUT, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
int m, n;
plasma = plasma_context_self();
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (n = 0; n < A.nt; n++)
{
/* Protection from previous GATHERV */
QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(0, n), INOUT,
0);
for (m = 0; m < A.mt; m++)
{
QUARK_Insert_Task(plasma->quark, CORE_foo2_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(0, n), INOUT | GATHERV,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, n), INOUT,
0);
}
/* Protection to next GATHERV */
QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(0, n), INOUT,
0);
}
}

Here is the call graph for this function:

void plasma_pcbarrier_tl2row_quark ( PLASMA_desc  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Barrier from tiles to row panels

Definition at line 92 of file pcbarrier.c.

References CORE_foo2_quark(), CORE_foo_quark(), GATHERV, INOUT, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
int m, n;
plasma = plasma_context_self();
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (m = 0; m < A.mt; m++)
{
/* Protection from previous GATHERV */
QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, 0), INOUT,
0);
for (n = 0; n < A.nt; n++)
{
QUARK_Insert_Task(plasma->quark, CORE_foo2_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, 0), INOUT | GATHERV,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, n), INOUT,
0);
}
/* Protection to next GATHERV */
QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
sizeof(PLASMA_Complex32_t)*A.mb*A.nb, A(m, 0), INOUT,
0);
}
}

Here is the call graph for this function:

void plasma_pcgbrdb_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
float *  D,
float *  E,
PLASMA_desc  T,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Parallel Reduction from BAND Bidiagonal to the final condensed form - dynamic scheduler

Definition at line 26 of file pcgbrdb.c.

References A, C, cabsf(), DEP, plasma_desc_t::dtyp, plasma_desc_t::lm, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::n, plasma_context_self(), plasma_element_size(), plasma_sequence_flush(), plasma_shared_alloc(), plasma_shared_free(), PLASMA_SUCCESS, PlasmaComplexFloat, PlasmaInteger, PlasmaLower, plasma_context_struct::quark, QUARK_Barrier(), QUARK_CORE_cbrdalg(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, TASK_SEQUENCE, TAU, and V.

{
#ifdef COMPLEX
static float dzero = (float) 0.0;
float absztmp;
#endif
static PLASMA_Complex32_t zzero = (PLASMA_Complex32_t) 0.0;
int M, N, NB, MINMN, INgrsiz, INthgrsiz, BAND;
int myid, grsiz, shift=3, stt, st, ed, stind, edind;
int blklastind, colpt, PCOL, ACOL, MCOL;
int stepercol,mylastid,grnb,grid;
int *DEP,*MAXID;
int i, j, m;
int thgrsiz, thgrnb, thgrid, thed;
size_t eltsize = plasma_element_size(A.dtyp);
plasma = plasma_context_self();
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
M = A.m;
N = A.n;
NB = A.mb;
MINMN = min(M,N);
/* Quick return */
if ( MINMN == 0 ){
return;
}
if ( NB == 0 ) {
memset(D, 0, MINMN *sizeof(float));
memset(E, 0, (MINMN-1)*sizeof(float));
#ifdef COMPLEX
for (i=0; i<MINMN; i++)
D[i] = cabsf(*A(i,i));
#else
for (i=0; i<MINMN; i++)
D[i] = *A(i,i);
#endif
return;
}
/*
* Barrier is used because the bulge have to wait until
* the reduction to band has been finish.
* otherwise, I can remove this BARRIER when I integrate
* the function dependencies link inside the reduction to
* band. Keep in mind the case when NB=1, where no bulge-chasing.
*/
/***************************************************************/
QUARK_Barrier(plasma->quark);
/***************************************************************/
/*
* Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
* Make diagonal and superdiagonal elements real, storing them in
* D and E. if PlasmaLower, first transform lower bidiagonal form
* to upper bidiagonal by applying plane rotations/ Householder
* from the left, overwriting superdiagonal elements then make
* elements real of the resulting upper Bidiagonal. if PlasmaUpper
* then make its elements real. For Q, PT: ZSCAL should be done
* in case of WANTQ.
*/
if ( NB == 1 ) {
memset(D, 0, MINMN*sizeof(float));
memset(E, 0, (MINMN-1)*sizeof(float));
for (i=0; i<(MINMN-1); i++)
{
/* generate Householder to annihilate a(i+1,i) and create a(i,i+1) */
V = *A((i+1), i);
*A((i+1), i) = zzero;
LAPACKE_clarfg_work( 2, A(i, i), &V, 1, &TAU);
/* apply Left*/
TAU = conjf(TAU);
ztmp = TAU*V;
V = conjf(V);
*A(i, i+1) = - V * TAU * (*A(i+1, i+1));
*A(i+1, i+1) = *(A(i+1, i+1)) * (zone - V * ztmp);
}
}
/* PlasmaLower or PlasmaUpper, both are now upper */
/* Make diagonal and superdiagonal elements real,
* storing them in D and E
*/
#ifdef COMPLEX
ztmp = zone;
for (i=0; i<MINMN; i++)
{
ztmp = *A(i, i) * conjf(ztmp);
absztmp = cabsf(ztmp);
D[i] = absztmp; /* diag value */
if(absztmp != dzero)
ztmp = (PLASMA_Complex32_t) (ztmp / absztmp);
else
ztmp = zone;
if(i<(MINMN-1)) {
ztmp = *A(i, (i+1)) * conjf(ztmp);
absztmp = cabsf(ztmp);
E[i] = absztmp; /* upper off-diag value */
if(absztmp != dzero)
ztmp = (PLASMA_Complex32_t) (ztmp / absztmp);
else
ztmp = zone;
}
}
#else
for (i=0; i < MINMN-1; i++) {
D[i] = *A(i, i );
E[i] = *A(i, i+1);
}
D[i] = *A(i, i);
#endif
return;
}
/*
* Case MINMN<NB ==> matrix is very small and better to call lapack ZGETRD.
*
* Use fact that one row of block is stored the same way than in LAPACK
* Doesn't work if M > NB because of tile storage
*/
if ( MINMN <= 0 )
{
PLASMA_Complex32_t *work, *taup, *tauq;
int info, ldwork = N*N;
info = LAPACKE_cgebrd_work(LAPACK_COL_MAJOR, M, N,
A(0,0), A.lm, D, E, taup, tauq, work, ldwork);
plasma_shared_free(plasma, (void*) work);
plasma_shared_free(plasma, (void*) taup);
plasma_shared_free(plasma, (void*) tauq);
if( info == 0 )
sequence->status = PLASMA_SUCCESS;
else
plasma_sequence_flush(plasma->quark, sequence, request, info);
return;
}
/* General case NB > 1 && N > NB */
DEP = (int *) plasma_shared_alloc(plasma, MINMN+1, PlasmaInteger );
MAXID = (int *) plasma_shared_alloc(plasma, MINMN+1, PlasmaInteger );
memset(MAXID,0,(MINMN+1)*sizeof(int));
/***************************************************************************
* START BULGE CHASING CODE
**************************************************************************/
/*
* Initialisation of local parameter. those parameter should be
* input or tuned parameter.
*/
INgrsiz = 1;
if( NB > 160 ) {
INgrsiz = 2;
}
else if( NB > 100 ) {
if( MINMN < 5000 )
INgrsiz = 2;
else
INgrsiz = 4;
} else {
INgrsiz = 6;
}
INthgrsiz = MINMN;
BAND = 0;
grsiz = INgrsiz;
thgrsiz = INthgrsiz;
if( grsiz == 0 ) grsiz = 6;
if( thgrsiz == 0 ) thgrsiz = MINMN;
i = shift/grsiz;
stepercol = i*grsiz == shift ? i:i+1;
i = (MINMN-2)/thgrsiz;
thgrnb = i*thgrsiz == (MINMN-2) ? i:i+1;
for (thgrid = 1; thgrid<=thgrnb; thgrid++){
stt = (thgrid-1)*thgrsiz+1;
thed = min( (stt + thgrsiz -1), (MINMN-2));
for (i = stt; i <= MINMN-2; i++){
ed=min(i,thed);
if(stt>ed)break;
for (m = 1; m <=stepercol; m++){
st=stt;
for (j = st; j <=ed; j++){
/* PCOL: dependency on the ID of the master of the group of the previous column. (Previous Column:PCOL). */
/* ACOL: dependency on the ID of the master of the previous group of my column. (Acctual Column:ACOL). (it is 0(NULL) for myid=1) */
/* MCOL: OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
myid = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
mylastid = myid+grsiz-1;
PCOL = mylastid+shift-1; /* to know the dependent ID of the previous column. need to know the master of its group*/
MAXID[j] = myid;
PCOL = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
grnb = PCOL/grsiz;
grid = grnb*grsiz == PCOL ? grnb:grnb+1;
PCOL = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column.*/
ACOL = myid-grsiz;
if(myid==1)ACOL=0;
MCOL = myid;
plasma->quark, &task_flags,
uplo, MINMN, NB,
&A, C, S, i, j, m, grsiz, BAND,
DEP(PCOL), DEP(ACOL), DEP(MCOL) );
if(mylastid%2 ==0){
blklastind = (mylastid/2)*NB+1+j-1;
}else{
colpt = ((mylastid+1)/2)*NB + 1 +j -1 ;
stind = colpt-NB+1;
edind = min(colpt,MINMN);
if( (stind>=edind-1) && (edind==MINMN) )
blklastind=MINMN;
else
blklastind=0;
}
if(blklastind >= (MINMN-1)) stt=stt+1;
} /* END for j=st:ed */
} /* END for m=1:stepercol */
} /* END for i=1:MINMN-2 */
} /* END for thgrid=1:thgrnb */
/*
* Barrier used only for now, to be sure that everything
* is done before copying the D and E and free workspace.
* this will be removed later when D and E are directly filled
* during the bulge process.
*/
QUARK_Barrier(plasma->quark);
plasma_shared_free(plasma, (void*) DEP);
plasma_shared_free(plasma, (void*) MAXID);
plasma_shared_free(plasma, (void*) C);
plasma_shared_free(plasma, (void*) S);
/*
* STORE THE RESULTING diagonal/off-diagonal in D AND E
*/
memset(D, 0, MINMN*sizeof(float));
memset(E, 0, (MINMN-1)*sizeof(float));
/*
* If PlasmaLower, first transform lower bidiagonal form
* to upper bidiagonal by applying plane rotations/ Householder
* from the left, overwriting superdiagonal elements then make
* elements real of the resulting upper Bidiagonal. if PlasmaUpper
* then make its elements real.
* For Q, PT: ZSCAL should be done in case of WANTQ.
*/
for (i=0; i<(MINMN-1); i++)
{
/* generate Householder to annihilate a(i+1,i) and create a(i,i+1)*/
V = *A((i+1), i);
*A((i+1), i) = zzero;
LAPACKE_clarfg_work( 2, A(i, i), &V, 1, &TAU);
/* apply Left */
TAU = conjf(TAU);
ztmp = TAU*V;
V = conjf(V);
*A(i, (i+1)) = - V * TAU * (*A((i+1), (i+1)));
*A((i+1), (i+1)) = (*A((i+1), (i+1))) * (zone - V * ztmp);
}
}
/* PlasmaLower or PlasmaUpper, both are upper, now*/
/* Make diagonal and superdiagonal elements real,
* storing them in D and E
*/
/* In complex case, the element off diagonal element are
* not necessary real and we have to make off-diagonal
* elements real and copy them to E.
* When using HouseHolder elimination,
* the ZLARFG give us a real as output so, all the
* diagonal/off-diagonal element except the last one are already
* real and thus we need only to take the abs of the last
* one.
* */
#ifdef COMPLEX
ztmp =zone;
for (i=0; i < MINMN-1; i++) {
D[i] = crealf( *A(i, i) );
/*
* Alternative for Householder case, all diag/superdiag
* are real except the last diag and superdiag, where we
* have to take the abs
*/
if(i<(MINMN-2))
E[i] = crealf(*A(i, i+1));
else
E[i] = cabsf( *A(i, i+1)); /* last upper value is complex */
}
D[i] = cabsf( *A(i, i) );
#else
for (i=0; i < MINMN-1; i++) {
D[i] = *A(i, i );
E[i] = *A(i, i+1);
}
D[i] = *A(i, i);
#endif
} /* END FUNCTION */

Here is the call graph for this function:

void plasma_pcgeadd ( plasma_context_t *  plasma)

Declarations of parallel functions (static scheduling) - alphabetical order

Definition at line 23 of file pcgeadd.c.

References A, B, BLKLDD, CORE_cgeadd(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_5, and plasma_sequence_t::status.

{
PLASMA_sequence *sequence;
PLASMA_request *request;
int X, Y;
int m, n;
int next_m;
int next_n;
int ldam, ldbm;
plasma_unpack_args_5(alpha, A, B, sequence, request);
if (sequence->status != PLASMA_SUCCESS)
return;
n = 0;
while (m >= A.mt && n < A.nt) {
n++;
m = m-A.mt;
}
while (n < A.nt) {
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m-A.mt;
}
X = m == A.mt-1 ? A.m-A.mb*m : A.nb;
Y = n == A.nt-1 ? A.n-A.nb*n : A.nb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
CORE_cgeadd(X, Y, alpha, A(m, n), ldam, B(m, n), ldbm);
m = next_m;
n = next_n;
}
}

Here is the call graph for this function:

void plasma_pcgeadd_quark ( PLASMA_Complex32_t  alpha,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_sequence *  sequence,
PLASMA_request *  request 
)

Declarations of parallel functions (dynamic scheduling) - alphabetical order

Definition at line 72 of file pcgeadd.c.

References B, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_cgeadd(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
int X, Y;
int m, n;
int ldam, ldbm;
plasma = plasma_context_self();
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (m = 0; m < A.mt; m++) {
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
for (n = 0; n < A.nt; n++) {
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
X, Y, A.mb,
alpha, A(m, n), ldam,
B(m, n), ldbm);
}
}
}

Here is the call graph for this function:

void plasma_pcgelqf ( plasma_context_t * plasma)

Parallel tile LQ factorization - static scheduling

Definition at line 24 of file pcgelqf.c.

References A, BLKLDD, CORE_cgelqt(), CORE_ctslqt(), CORE_ctsmlq(), CORE_cunmlq(), plasma_desc_t::dtyp, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_IB, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_4, PlasmaConjTrans, PlasmaRight, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, plasma_sequence_t::status, and T.

{
/*
 * Statically scheduled loop over work items (k, m, n). Each pass picks one
 * of four branches depending on whether m == k and n == k, waits on progress
 * markers with ss_cond_wait(), performs one call and (in three of the four
 * branches) publishes progress with ss_cond_set().
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of A and T, the initial assignment of `m`, the allocation of
 * `tau`, the names of the four callees and any ss_finalize call are not
 * shown. The References list of this function names CORE_cgelqt(),
 * CORE_ctslqt(), CORE_cunmlq(), CORE_ctsmlq(), PLASMA_RANK and ss_finalize;
 * confirm against pcgelqf.c.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
int next_k;
int next_m;
int next_n;
int ldak, ldam;
int tempkm, tempkn, tempmm, tempnn;
int ib = PLASMA_IB;
PLASMA_Complex32_t *work, *tau;
plasma_unpack_args_4(A, T, sequence, request);
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
work = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
ss_init(A.mt, A.nt, -1);
/* Fold the starting m into range: every wrap past A.mt moves on to the next k. */
k = 0;
while (m >= A.mt) {
k++;
m = m-A.mt+k;
}
n = k;
while (k < min(A.mt, A.nt) && m < A.mt) {
/*
 * Compute the work item that follows (k, m, n): n advances by one; once it
 * reaches A.nt, m jumps by PLASMA_SIZE (wrapping into the next k when it
 * passes A.mt) and n restarts at the new k.
 */
next_m = m;
next_n = n;
next_k = k;
next_n++;
if (next_n == A.nt) {
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_k < min(A.nt, A.mt)) {
next_k++;
next_m = next_m-A.mt+next_k;
}
next_n = next_k;
}
/* Sizes along each index: the full tile size, or the remainder for the last tile. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldam = BLKLDD(A, m);
if (m == k) {
if (n == k) {
/* Tile (k, k): wait until it carries marker k-1, process it, then mark it with k. */
ss_cond_wait(k, k, k-1);
tempkm, tempkn, ib,
A(k, k), ldak,
T(k, k), T.mb,
tau, work);
ss_cond_set(k, k, k);
}
else {
/* Tiles (k, k) and (k, n) with T(k, n): wait on (k, n) for marker k-1, then mark it with k. */
ss_cond_wait(k, n, k-1);
tempkm, tempnn, ib,
A(k, k), ldak,
A(k, n), ldak,
T(k, n), T.mb,
tau, work);
ss_cond_set(k, n, k);
}
}
else {
if (n == k) {
/* Uses A(k, k) and T(k, k) on tile (m, k); this branch waits but does not call ss_cond_set. */
ss_cond_wait(k, k, k);
ss_cond_wait(m, k, k-1);
tempmm, tempkn, tempkn, ib,
A(k, k), ldak,
T(k, k), T.mb,
A(m, k), ldam,
work, T.nb);
}
else {
/* Uses A(k, n) and T(k, n) on tiles (m, k) and (m, n); marks (m, n) with k afterwards. */
ss_cond_wait(k, n, k);
ss_cond_wait(m, n, k-1);
tempmm, A.nb, tempmm, tempnn, A.nb, ib,
A(m, k), ldam,
A(m, n), ldam,
A(k, n), ldak,
T(k, n), T.mb,
work, T.nb);
ss_cond_set(m, n, k);
}
}
m = next_m;
n = next_n;
k = next_k;
}
plasma_private_free(plasma, work);
plasma_private_free(plasma, tau);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcgelqf_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)

Parallel tile LQ factorization - dynamic scheduling

Definition at line 137 of file pcgelqf.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaConjTrans, PlasmaRight, plasma_context_struct::quark, QUARK_CORE_cgelqt(), QUARK_CORE_ctslqt(), QUARK_CORE_ctsmlq(), QUARK_CORE_cunmlq(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

{
/*
 * For each step k < min(A.mt, A.nt) this issues, in order: one call on tile
 * (k, k); one call per tile row m > k on tile (m, k); then for each tile
 * column n > k one call on tiles (k, k)/(k, n) followed by one call per
 * m > k on tiles (m, k)/(m, n).
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of `plasma` and `task_flags` and the callee name that opens
 * each `plasma->quark, &task_flags, ...` argument list are not shown. The
 * References list of this function names QUARK_CORE_cgelqt(),
 * QUARK_CORE_cunmlq(), QUARK_CORE_ctslqt() and QUARK_CORE_ctsmlq(); confirm
 * which call is which against pcgelqf.c.
 */
int k, m, n;
int ldak, ldam;
int tempkm, tempkn, tempmm, tempnn;
int ib;
plasma = plasma_context_self();
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
/* Sizes for index k: the full tile size, or the remainder for the last tile. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
plasma->quark, &task_flags,
tempkm, tempkn, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb);
/* Tile rows below k: each call takes A(k, k), T(k, k) and A(m, k). */
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
plasma->quark, &task_flags,
tempmm, tempkn, tempkn, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
A(m, k), ldam);
}
/* Tile columns right of k: first the pair (k, k)/(k, n), then every row m > k. */
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
tempkm, tempnn, ib, T.nb,
A(k, k), ldak,
A(k, n), ldak,
T(k, n), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
plasma->quark, &task_flags,
tempmm, A.nb, tempmm, tempnn, A.mb, ib, T.nb,
A(m, k), ldam,
A(m, n), ldam,
A(k, n), ldak,
T(k, n), T.mb);
}
}
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcgelqfrh_quark ( PLASMA_desc  A,
PLASMA_desc  T,
int  BS,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)

Parallel tile LQ factorization (reduction Householder) - dynamic scheduling

Definition at line 25 of file pcgelqfrh.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaConjTrans, PlasmaRight, plasma_context_struct::quark, QUARK_CORE_cgelqt(), QUARK_CORE_ctslqt(), QUARK_CORE_ctsmlq(), QUARK_CORE_cttlqt(), QUARK_CORE_cttmlq(), QUARK_CORE_cunmlq(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, T2, and TASK_SEQUENCE.

{
/*
 * For each step k, tile columns k..A.nt-1 are first handled in groups of BS
 * columns starting at column N (first loop nest), and the group heads are
 * then combined pairwise at distances RD = BS, 2*BS, 4*BS, ... (second loop
 * nest), using T2 instead of T.
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of `plasma` and `task_flags` and the callee name that opens
 * each `plasma->quark, &task_flags, ...` argument list are not shown. The
 * References list of this function names QUARK_CORE_cgelqt(),
 * QUARK_CORE_cunmlq(), QUARK_CORE_ctslqt(), QUARK_CORE_ctsmlq(),
 * QUARK_CORE_cttlqt() and QUARK_CORE_cttmlq(); confirm which call is which
 * against pcgelqfrh.c.
 */
int k, m, n;
int N, RD;
int ldak, ldam;
int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn;
int ib;
plasma = plasma_context_self();
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
/* One group per BS tile columns; N is the first column of the group. */
for (N = k; N < A.nt; N += BS) {
tempNn = N == A.nt-1 ? A.n-N*A.nb : A.nb;
tempkmin = min(tempkm, tempNn);
plasma->quark, &task_flags,
tempkm, tempNn, ib, T.nb,
A(k, N), ldak,
T(k, N), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
plasma->quark, &task_flags,
tempmm, tempNn, tempkmin, ib, T.nb,
A(k, N), ldak,
T(k, N), T.mb,
A(m, N), ldam);
}
/* Remaining columns of the group (bounded by both N+BS and A.nt). */
for (n = N+1; n < min(N+BS, A.nt); n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
tempkm, tempnn, ib, T.nb,
A(k, N), ldak,
A(k, n), ldak,
T(k, n), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
plasma->quark, &task_flags,
tempmm, A.nb, tempmm, tempnn, tempkm, ib, T.nb,
A(m, N), ldam,
A(m, n), ldam,
A(k, n), ldak,
T(k, n), T.mb);
}
}
}
/* Pairwise combination of columns N and N+RD; RD doubles on every pass. */
for (RD = BS; RD < A.nt-k; RD *= 2) {
for (N = k; N+RD < A.nt; N += 2*RD) {
tempNRDn = N+RD == A.nt-1 ? A.n-(N+RD)*A.nb : A.nb;
plasma->quark, &task_flags,
tempkm, tempNRDn, ib, T.nb,
A (k, N ), ldak,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m );
plasma->quark, &task_flags,
tempmm, A.nb, tempmm, tempNRDn, tempkm, ib, T.nb,
A (m, N ), ldam,
A (m, N+RD), ldam,
A (k, N+RD), ldak,
T2(k, N+RD), T.mb);
}
}
}
}
}

Here is the call graph for this function:

void plasma_pcgemm ( plasma_context_t * plasma)

Parallel tile matrix-matrix multiplication - static scheduling

Definition at line 24 of file pcgemm.c.

References A, B, BLKLDD, C, CORE_cgemm(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_9, PlasmaNoTrans, and plasma_sequence_t::status.

{
/*
 * Statically scheduled loop over the tiles (m, n) of C. For each tile the
 * code loops over k and performs one call per k with tiles of A and B chosen
 * according to transA and transB.
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of alpha, beta, zbeta, zone, A, B and C, the initial
 * assignment of `m`, and the callee name that opens each
 * `transA, transB, ...` argument list are not shown. The References list of
 * this function names CORE_cgemm() and PLASMA_RANK; confirm against pcgemm.c.
 */
PLASMA_enum transA;
PLASMA_enum transB;
PLASMA_sequence *sequence;
PLASMA_request *request;
int K, X, Y;
int k, m, n;
int next_m;
int next_n;
int ldam, ldak, ldbn, ldbk, ldcm;
plasma_unpack_args_9(transA, transB, alpha, A, B, beta, C, sequence, request);
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* Fold the starting m into range: every wrap past C.mt moves to the next tile column n. */
n = 0;
while (m >= C.mt && n < C.nt) {
n++;
m = m-C.mt;
}
while (n < C.nt) {
/* Next tile for this loop: PLASMA_SIZE tile rows further, wrapping into later columns. */
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= C.mt && next_n < C.nt) {
next_n++;
next_m = next_m - C.mt;
}
/* X, Y: C.mb / C.nb, or the remainder for the last tile row / column of C. */
X = m == C.mt-1 ? C.m - m*C.mb : C.mb;
Y = n == C.nt-1 ? C.n - n*C.nb : C.nb;
ldcm = BLKLDD(C, m);
/*
* A: PlasmaNoTrans / B: PlasmaNoTrans
*/
if (transA == PlasmaNoTrans) {
ldam = BLKLDD(A, m);
if (transB == PlasmaNoTrans) {
for (k = 0; k < A.nt; k++) {
K = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldbk = BLKLDD(B, k);
/* beta is passed only for k == 0; every later k passes zone instead. */
zbeta = k == 0 ? beta : zone;
transA, transB,
X, Y, K,
alpha, A(m, k), ldam,
B(k, n), ldbk,
zbeta, C(m, n), ldcm);
}
}
/*
* A: PlasmaNoTrans / B: Plasma[Conj]Trans
*/
else {
ldbn = BLKLDD(B, n);
for (k = 0; k < A.nt; k++) {
K = k == A.nt-1 ? A.n-k*A.nb : A.nb;
zbeta = k == 0 ? beta : zone;
transA, transB,
X, Y, K,
alpha, A(m, k), ldam,
B(n, k), ldbn,
zbeta, C(m, n), ldcm);
}
}
}
/*
* A: Plasma[Conj]Trans / B: PlasmaNoTrans
*/
else {
if (transB == PlasmaNoTrans) {
/* With A transposed, k runs over the tile rows of A (A.mt) and indexes A(k, m). */
for (k = 0; k < A.mt; k++) {
K = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
zbeta = k == 0 ? beta : zone;
transA, transB,
X, Y, K,
alpha, A(k, m), ldak,
B(k, n), ldbk,
zbeta, C(m, n), ldcm);
}
}
/*
* A: Plasma[Conj]Trans / B: Plasma[Conj]Trans
*/
else {
ldbn = BLKLDD(B, n);
for (k = 0; k < A.mt; k++) {
K = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
zbeta = k == 0 ? beta : zone;
transA, transB,
X, Y, K,
alpha, A(k, m), ldak,
B(n, k), ldbn,
zbeta, C(m, n), ldcm);
}
}
}
m = next_m;
n = next_n;
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcgemm_quark ( PLASMA_enum  transA,
PLASMA_enum  transB,
PLASMA_Complex32_t  alpha,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_Complex32_t  beta,
PLASMA_desc  C,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)

Parallel tile matrix-matrix multiplication - dynamic scheduling

Definition at line 149 of file pcgemm.c.

References B, BLKLDD, C, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaNoTrans, plasma_context_struct::quark, QUARK_CORE_cgemm(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Visits every tile (m, n) of C and, for each, loops over k issuing one call
 * per k with tiles of A and B chosen according to transA and transB.
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of `plasma`, `task_flags`, zbeta and zone, and the callee name
 * that opens each `plasma->quark, &task_flags, ...` argument list are not
 * shown. The References list of this function names QUARK_CORE_cgemm();
 * confirm against pcgemm.c.
 */
int m, n, k;
int ldam, ldak, ldbn, ldbk, ldcm;
int tempmm, tempnn, tempkn, tempkm;
plasma = plasma_context_self();
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (m = 0; m < C.mt; m++) {
/* tempmm / tempnn: C.mb / C.nb, or the remainder for the last tile row / column of C. */
tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
ldcm = BLKLDD(C, m);
for (n = 0; n < C.nt; n++) {
tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;
/*
* A: PlasmaNoTrans / B: PlasmaNoTrans
*/
if (transA == PlasmaNoTrans) {
ldam = BLKLDD(A, m);
if (transB == PlasmaNoTrans) {
for (k = 0; k < A.nt; k++) {
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldbk = BLKLDD(B, k);
/* beta is passed only for k == 0; every later k passes zone instead. */
zbeta = k == 0 ? beta : zone;
plasma->quark, &task_flags,
transA, transB,
tempmm, tempnn, tempkn, A.mb,
alpha, A(m, k), ldam, /* lda * Z */
B(k, n), ldbk, /* ldb * Y */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
/*
* A: PlasmaNoTrans / B: Plasma[Conj]Trans
*/
else {
ldbn = BLKLDD(B, n);
for (k = 0; k < A.nt; k++) {
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
zbeta = k == 0 ? beta : zone;
plasma->quark, &task_flags,
transA, transB,
tempmm, tempnn, tempkn, A.mb,
alpha, A(m, k), ldam, /* lda * Z */
B(n, k), ldbn, /* ldb * Z */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
}
/*
* A: Plasma[Conj]Trans / B: PlasmaNoTrans
*/
else {
if (transB == PlasmaNoTrans) {
/* With A transposed, k runs over the tile rows of A (A.mt) and indexes A(k, m). */
for (k = 0; k < A.mt; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
zbeta = k == 0 ? beta : zone;
plasma->quark, &task_flags,
transA, transB,
tempmm, tempnn, tempkm, A.mb,
alpha, A(k, m), ldak, /* lda * X */
B(k, n), ldbk, /* ldb * Y */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
/*
* A: Plasma[Conj]Trans / B: Plasma[Conj]Trans
*/
else {
ldbn = BLKLDD(B, n);
for (k = 0; k < A.mt; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
ldak = BLKLDD(A, k);
zbeta = k == 0 ? beta : zone;
plasma->quark, &task_flags,
transA, transB,
tempmm, tempnn, tempkm, A.mb,
alpha, A(k, m), ldak, /* lda * X */
B(n, k), ldbn, /* ldb * Z */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
}
}
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcgeqrf ( plasma_context_t * plasma)

Parallel tile QR factorization - static scheduling

Definition at line 24 of file pcgeqrf.c.

References A, BLKLDD, CORE_cgeqrt(), CORE_ctsmqr(), CORE_ctsqrt(), CORE_cunmqr(), plasma_desc_t::dtyp, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_IB, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_4, PlasmaConjTrans, PlasmaLeft, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, plasma_sequence_t::status, and T.

{
/*
 * Statically scheduled loop over work items (k, m, n). Each pass picks one
 * of four branches depending on whether n == k and m == k, waits on progress
 * markers with ss_cond_wait(), performs one call and (in three of the four
 * branches) publishes progress with ss_cond_set().
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of A and T, the initial assignment of `n`, the allocation of
 * `tau`, the names of the four callees and any ss_finalize call are not
 * shown. The References list of this function names CORE_cgeqrt(),
 * CORE_ctsqrt(), CORE_cunmqr(), CORE_ctsmqr(), PLASMA_RANK and ss_finalize;
 * confirm against pcgeqrf.c.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
int next_k;
int next_m;
int next_n;
int ldak, ldam;
int tempkm, tempkn, tempnn, tempmm;
int ib = PLASMA_IB;
PLASMA_Complex32_t *work, *tau;
plasma_unpack_args_4(A, T, sequence, request);
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
work = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*T.nb, T.dtyp);
ss_init(A.mt, A.nt, -1);
/* Fold the starting n into range: every wrap past A.nt moves on to the next k. */
k = 0;
while (n >= A.nt) {
k++;
n = n-A.nt+k;
}
m = k;
while (k < min(A.mt, A.nt) && n < A.nt) {
/*
 * Compute the work item that follows (k, m, n): m advances by one; once it
 * reaches A.mt, n jumps by PLASMA_SIZE (wrapping into the next k when it
 * passes A.nt) and m restarts at the new k.
 */
next_n = n;
next_m = m;
next_k = k;
next_m++;
if (next_m == A.mt) {
next_n += PLASMA_SIZE;
while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
next_k++;
next_n = next_n-A.nt+next_k;
}
next_m = next_k;
}
/* Sizes along each index: the full tile size, or the remainder for the last tile. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldak = BLKLDD(A, k);
ldam = BLKLDD(A, m);
if (n == k) {
if (m == k) {
/* Tile (k, k): wait until it carries marker k-1, process it, then mark it with k. */
ss_cond_wait(k, k, k-1);
tempkm, tempkn, ib,
A(k, k), ldak,
T(k, k), T.mb,
tau, work);
ss_cond_set(k, k, k);
}
else {
/* Tiles (k, k) and (m, k) with T(m, k): wait on (m, k) for marker k-1, then mark it with k. */
ss_cond_wait(m, k, k-1);
tempmm, tempkn, ib,
A(k, k), ldak,
A(m, k), ldam,
T(m, k), T.mb,
tau, work);
ss_cond_set(m, k, k);
}
}
else {
if (m == k) {
/* Uses A(k, k) and T(k, k) on tile (k, n); this branch waits but does not call ss_cond_set. */
ss_cond_wait(k, k, k);
ss_cond_wait(k, n, k-1);
tempkm, tempnn, tempkm, ib,
A(k, k), ldak,
T(k, k), T.mb,
A(k, n), ldak,
work, T.nb);
}
else {
/* Uses A(m, k) and T(m, k) on tiles (k, n) and (m, n); marks (m, n) with k afterwards. */
ss_cond_wait(m, k, k);
ss_cond_wait(m, n, k-1);
A.nb, tempnn, tempmm, tempnn, A.nb, ib,
A(k, n), ldak,
A(m, n), ldam,
A(m, k), ldam,
T(m, k), T.mb,
work, ib);
ss_cond_set(m, n, k);
}
}
n = next_n;
m = next_m;
k = next_k;
}
plasma_private_free(plasma, work);
plasma_private_free(plasma, tau);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcgeqrf_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)

Parallel tile QR factorization - dynamic scheduling

Definition at line 137 of file pcgeqrf.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaConjTrans, PlasmaLeft, plasma_context_struct::quark, QUARK_CORE_cgeqrt(), QUARK_CORE_ctsmqr(), QUARK_CORE_ctsqrt(), QUARK_CORE_cunmqr(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

{
/*
 * For each step k < min(A.mt, A.nt) this issues, in order: one call on tile
 * (k, k); one call per tile column n > k on tile (k, n); then for each tile
 * row m > k one call on tiles (k, k)/(m, k) followed by one call per n > k
 * on tiles (k, n)/(m, n).
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of `plasma` and `task_flags` and the callee name that opens
 * each `plasma->quark, &task_flags, ...` argument list are not shown. The
 * References list of this function names QUARK_CORE_cgeqrt(),
 * QUARK_CORE_cunmqr(), QUARK_CORE_ctsqrt() and QUARK_CORE_ctsmqr(); confirm
 * which call is which against pcgeqrf.c.
 */
int k, m, n;
int ldak, ldam;
int tempkm, tempkn, tempnn, tempmm;
int ib;
plasma = plasma_context_self();
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
/* Sizes for index k: the full tile size, or the remainder for the last tile. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
plasma->quark, &task_flags,
tempkm, tempkn, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb);
/* Tile columns right of k: each call takes A(k, k), T(k, k) and A(k, n). */
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
tempkm, tempnn, tempkm, ib, T.nb,
A(k, k), ldak,
T(k, k), T.mb,
A(k, n), ldak);
}
/* Tile rows below k: first the pair (k, k)/(m, k), then every column n > k. */
for (m = k+1; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
plasma->quark, &task_flags,
tempmm, tempkn, ib, T.nb,
A(k, k), ldak,
A(m, k), ldam,
T(m, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
A.mb, tempnn, tempmm, tempnn, A.nb, ib, T.nb,
A(k, n), ldak,
A(m, n), ldam,
A(m, k), ldam,
T(m, k), T.mb);
}
}
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcgeqrfrh_quark ( PLASMA_desc  A,
PLASMA_desc  T,
int  BS,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)

Parallel tile QR factorization (reduction Householder) - dynamic scheduling

Definition at line 25 of file pcgeqrfrh.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaConjTrans, PlasmaLeft, plasma_context_struct::quark, QUARK_CORE_cgeqrt(), QUARK_CORE_ctsmqr(), QUARK_CORE_ctsqrt(), QUARK_CORE_cttmqr(), QUARK_CORE_cttqrt(), QUARK_CORE_cunmqr(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, T2, and TASK_SEQUENCE.

{
/*
 * For each step k, tile rows k..A.mt-1 are first handled in groups of BS
 * rows starting at row M (first loop nest), and the group heads are then
 * combined pairwise at distances RD = BS, 2*BS, 4*BS, ... (second loop
 * nest), using T2 instead of T.
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of `plasma` and `task_flags` and the callee name that opens
 * each `plasma->quark, &task_flags, ...` argument list are not shown. The
 * References list of this function names QUARK_CORE_cgeqrt(),
 * QUARK_CORE_cunmqr(), QUARK_CORE_ctsqrt(), QUARK_CORE_ctsmqr(),
 * QUARK_CORE_cttqrt() and QUARK_CORE_cttmqr(); confirm which call is which
 * against pcgeqrfrh.c.
 */
int k, m, n;
int M, RD;
int ldaM, ldam, ldaMRD;
int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm;
int ib;
plasma = plasma_context_self();
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
for (k = 0; k < min(A.mt, A.nt); k++) {
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* One group per BS tile rows; M is the first row of the group. */
for (M = k; M < A.mt; M += BS) {
tempMm = M == A.mt-1 ? A.m-M*A.mb : A.mb;
tempkmin = min(tempMm, tempkn);
ldaM = BLKLDD(A, M);
plasma->quark, &task_flags,
tempMm, tempkn, ib, T.nb,
A(M, k), ldaM,
T(M, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
tempMm, tempnn, tempkmin, ib, T.nb,
A(M, k), ldaM,
T(M, k), T.mb,
A(M, n), ldaM);
}
/* Remaining rows of the group (bounded by both M+BS and A.mt). */
for (m = M+1; m < min(M+BS, A.mt); m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
plasma->quark, &task_flags,
tempmm, tempkn, ib, T.nb,
A(M, k), ldaM,
A(m, k), ldam,
T(m, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
A.nb, tempnn, tempmm, tempnn, A.nb, ib, T.nb,
A(M, n), ldaM,
A(m, n), ldam,
A(m, k), ldam,
T(m, k), T.mb);
}
}
}
/* Pairwise combination of rows M and M+RD; RD doubles on every pass. */
for (RD = BS; RD < A.mt-k; RD *= 2) {
for (M = k; M+RD < A.mt; M += 2*RD) {
tempMRDm = M+RD == A.mt-1 ? A.m-(M+RD)*A.mb : A.mb;
ldaM = BLKLDD(A, M );
ldaMRD = BLKLDD(A, M+RD);
plasma->quark, &task_flags,
tempMRDm, tempkn, ib, T.nb,
A (M , k), ldaM,
A (M+RD, k), ldaMRD,
T2(M+RD, k), T.mb);
for (n = k+1; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
A.nb, tempnn, tempMRDm, tempnn, A.nb, ib, T.nb,
A (M, n), ldaM,
A (M+RD, n), ldaMRD,
A (M+RD, k), ldaMRD,
T2(M+RD, k), T.mb);
}
}
}
}
}

Here is the call graph for this function:

void plasma_pcgerbb ( plasma_context_t * plasma)

Parallel tile BAND Bidiagonal Reduction - dynamic scheduler. Could be optimized by using the algorithms from Trefethen's book.

WARNING: never call this function, because unmqr and unmlq do not implement all the cases required for static scheduling.

Definition at line 26 of file pcgerbb.c.

References A, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_desc_submatrix(), plasma_pcgelqf(), plasma_pcgeqrf(), plasma_pcunmlq(), plasma_pcunmqr(), plasma_static_call_4, plasma_static_call_7, PLASMA_SUCCESS, plasma_unpack_args_4, PlasmaConjTrans, PlasmaLeft, PlasmaRight, plasma_sequence_t::status, and T.

{
/*
 * Two symmetric branches selected by A.m >= A.n. In each, step k builds
 * sub-matrix descriptors of A and T with plasma_desc_submatrix() and passes
 * them, together with sequence and request, to up to four calls per step
 * (the last two only when a tile k+1 exists).
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of A and T and the opening of every call (callee name and any
 * leading arguments) are not shown; only the trailing
 * `PLASMA_desc, ..., PLASMA_request*, request);` argument lines remain. The
 * References list of this function names plasma_static_call_4,
 * plasma_static_call_7, plasma_pcgeqrf(), plasma_pcunmqr(), plasma_pcgelqf()
 * and plasma_pcunmlq(); confirm the call order against pcgerbb.c.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int k;
int tempkm, tempkn;
plasma_unpack_args_4(A, T, sequence, request);
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
if (A.m >= A.n){
for (k = 0; k < A.nt; k++) {
/* Sizes for index k: the full tile size, or the remainder for the last tile. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* First call: sub-matrices of A and T from (k*mb, k*nb), all remaining rows, tempkn columns. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
/* Second call: same sub-matrices plus the part of A to the right, starting at column (k+1)*nb. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, A.m-k*A.mb, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
if (k+1 < A.nt){
/* NOTE(review): tempkn is recomputed for column k+1 but not used by the visible arguments below. */
tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, (k+1)*T.nb, tempkm, T.n-(k+1)*T.nb),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, (k+1)*T.nb, tempkm, T.n-(k+1)*T.nb),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
}
}
}
else{
for (k = 0; k < A.mt; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* First call: sub-matrices of A and T from (k*mb, k*nb), tempkm rows, all remaining columns. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, k*T.nb, tempkm, T.n-k*T.nb),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
/* Second call: same sub-matrices plus the part of A below, starting at row (k+1)*mb. */
PLASMA_desc, plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, A.n-k*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, k*T.mb, k*T.nb, tempkm, T.n-k*T.nb),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
if (k+1 < A.mt){
/* NOTE(review): tempkm is recomputed for row k+1 but not used by the visible arguments below. */
tempkm = k+1 == A.mt-1 ? A.m-(k+1)*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
PLASMA_desc, plasma_desc_submatrix(T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
PLASMA_desc, plasma_desc_submatrix(A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
PLASMA_desc, plasma_desc_submatrix(T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
PLASMA_sequence*, sequence,
PLASMA_request*, request);
}
}
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcgerbb_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)

Parallel tile BAND Bidiagonal Reduction - dynamic scheduler. Could be optimized by using the algorithms from Trefethen's book.

Definition at line 127 of file pcgerbb.c.

References plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_desc_submatrix(), plasma_pcgelqf_quark(), plasma_pcgeqrf_quark(), plasma_pcunmlq_quark(), plasma_pcunmqr_quark(), PlasmaConjTrans, PlasmaLeft, and PlasmaRight.

{
/*
 * Two symmetric branches selected by A.m >= A.n. In each, step k builds
 * sub-matrix descriptors of A and T with plasma_desc_submatrix() and passes
 * them, together with sequence and request, to up to four calls per step
 * (the last two only when a tile k+1 exists).
 * NOTE(review): this rendered listing omits the opening of every call
 * (callee name and any leading arguments); only the trailing
 * `plasma_desc_submatrix(...), ..., sequence, request);` argument lines
 * remain. The References list of this function names
 * plasma_pcgeqrf_quark(), plasma_pcunmqr_quark(), plasma_pcgelqf_quark() and
 * plasma_pcunmlq_quark(); confirm the call order against pcgerbb.c.
 */
int k;
int tempkm, tempkn;
if (A.m >= A.n){
for (k = 0; k < A.nt; k++) {
/* Sizes for index k: the full tile size, or the remainder for the last tile. */
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* First call: sub-matrices of A and T from (k*mb, k*nb), all remaining rows, tempkn columns. */
plasma_desc_submatrix(A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
plasma_desc_submatrix(T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
sequence, request);
/* Second call: same sub-matrices plus the part of A to the right, starting at column (k+1)*nb. */
plasma_desc_submatrix(A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, A.m-k*A.mb, A.n-(k+1)*A.nb),
plasma_desc_submatrix(T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
sequence, request);
if (k+1 < A.nt){
/* NOTE(review): tempkn is recomputed for column k+1 but not used by the visible arguments below. */
tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
plasma_desc_submatrix(T, k*T.mb, (k+1)*T.nb, tempkm, T.n-(k+1)*T.nb),
sequence, request);
plasma_desc_submatrix(A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
plasma_desc_submatrix(A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
plasma_desc_submatrix(T, k*T.mb, (k+1)*T.nb, tempkm, T.n-(k+1)*T.nb),
sequence, request);
}
}
}
else{
for (k = 0; k < A.mt; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
/* First call: sub-matrices of A and T from (k*mb, k*nb), tempkm rows, all remaining columns. */
plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
plasma_desc_submatrix(T, k*T.mb, k*T.nb, tempkm, T.n-k*T.nb),
sequence, request);
/* Second call: same sub-matrices plus the part of A below, starting at row (k+1)*mb. */
plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, A.n-k*A.nb),
plasma_desc_submatrix(T, k*T.mb, k*T.nb, tempkm, T.n-k*T.nb),
sequence, request);
if (k+1 < A.mt){
/* NOTE(review): tempkm is recomputed for row k+1 but not used by the visible arguments below. */
tempkm = k+1 == A.mt-1 ? A.m-(k+1)*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
plasma_desc_submatrix(T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
sequence, request);
plasma_desc_submatrix(A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
plasma_desc_submatrix(A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
plasma_desc_submatrix(T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
sequence, request);
}
}
}
}

Here is the call graph for this function:

void plasma_pcgerbbrh_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)
void plasma_pcgerbh_quark ( PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)
void plasma_pcgetmi2 ( plasma_context_t * plasma)

plasma_pcgetmi2 - performs nprob independent transpositions. Each subproblem is a tile of mb-by-nb elements. This function uses an extra space of PLASMA_SIZE*(mb*nb).

Parameters:
[in] plasma: Plasma context to which this call belongs.
See also:
plasma_pcgetmi2_quark

Definition at line 40 of file pcgetmi2.c.

References A, CORE_cgetrip(), plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_10, PlasmaComplexFloat, plasma_sequence_t::status, and storev.

{
/*
 * Splits A into nprob consecutive chunks of mb*nb elements and calls
 * CORE_cgetrip() on every chunk whose index is PLASMA_RANK, PLASMA_RANK+size,
 * PLASMA_RANK+2*size, ... with size = PLASMA_SIZE.
 * NOTE(review): this rendered listing does not show the allocation of `work`
 * that the final plasma_private_free() releases; the References list of this
 * function names plasma_private_alloc() and PlasmaComplexFloat. Confirm
 * against pcgetmi2.c.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
PLASMA_Complex32_t *A, *Al, *work;
PLASMA_enum storev, idep, odep;
int i, m, n, mb, nb, nprob;
int size, bsiz;
plasma_unpack_args_10(idep, odep, storev, m, n, mb, nb, A, sequence, request);
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* quick return */
if( (mb < 2) || (nb < 2) ) {
return ;
}
size = PLASMA_SIZE;
bsiz = mb*nb;
/* Integer division: any partial tile along m or n is not counted. */
nprob = ( m / mb ) * ( n / nb );
for (i=PLASMA_RANK; i<nprob; i+=size) {
/* Al points at the start of chunk i inside A. */
Al = &(A[ i * bsiz]);
CORE_cgetrip(mb, nb, Al, work);
}
plasma_private_free(plasma, work);
}

Here is the call graph for this function:

void plasma_pcgetmi2_quark ( PLASMA_enum  idep,
PLASMA_enum  odep,
PLASMA_enum  storev,
int  m,
int  n,
int  mb,
int  nb,
PLASMA_Complex32_t *  A,
PLASMA_sequence *  sequence,
PLASMA_request *  request
)

plasma_pcgetmi2_quark - performs nprob independent transpositions. Each subproblem is a tile of mb-by-nb elements. This function uses an extra space of PLASMA_SIZE*(mb*nb). This is a maximum in the case of dynamic scheduling.

Parameters:
[in] idep: PlasmaIPT_NoDep: No fake dependencies are added. PlasmaIPT_Panel: A gatherv is added on each panel and the panel size is m*nb. PlasmaIPT_All: A gatherv is added on the whole matrix.
[in] odep: PlasmaIPT_NoDep: No fake dependencies are added. PlasmaIPT_Panel: A gatherv is added on each panel and the panel size is m*nb. PlasmaIPT_All: A gatherv is added on the whole matrix.
[in] storev: PlasmaColumnWise: Data stored in column major. PlasmaRowWise: Data stored in row major.
[in] m: Number of rows of A if tiles are sorted in column major format, number of columns otherwise.
[in] n: Number of columns of A if tiles are sorted in column major format, number of rows otherwise.
[in] mb: Number of rows in each individual subproblem if storev == PlasmaColumnWise, number of columns otherwise. m % mb must be 0.
[in] nb: Number of columns in each individual subproblem if storev == PlasmaColumnWise, number of rows otherwise. n % nb must be 0.
[in,out] A: Matrix of size m*n.
[in] sequence: Identifies the sequence of function calls that this call belongs to (for completion checks and exception handling purposes).
[out] request: Identifies this function call (for exception handling purposes).
See also:
plasma_pcgetmi2

Definition at line 128 of file pcgetmi2.c.

References GATHERV, INOUT, INPUT, plasma_context_self(), PLASMA_SUCCESS, PlasmaColumnwise, PlasmaIPT_All, PlasmaIPT_NoDep, PlasmaIPT_Panel, plasma_context_struct::quark, QUARK_CORE_cgetrip(), QUARK_CORE_cgetrip_f1(), QUARK_CORE_cgetrip_f2(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Issues one call per mb*nb chunk of A. An outer switch on idep and an inner
 * switch on odep select which extra (pointer, size, flag) arguments are
 * passed alongside the chunk: the enclosing panel Ap of psiz elements, the
 * whole matrix A of `size` elements, or nothing.
 * NOTE(review): this rendered listing is missing many source lines -- the
 * declarations of `plasma`, `task_flags`, Ap and Al, the `if` that the
 * `} else {` below belongs to, every `case` label of the three inner
 * switches and of the outer switch, and the callee name in front of most
 * `plasma->quark, &task_flags, ...` argument lists. The References list of
 * this function names PlasmaColumnwise, PlasmaIPT_Panel, PlasmaIPT_All,
 * PlasmaIPT_NoDep, QUARK_CORE_cgetrip(), QUARK_CORE_cgetrip_f1() and
 * QUARK_CORE_cgetrip_f2(); confirm the structure against pcgetmi2.c.
 */
int i, j, nprob, mt, nt;
int bsiz, psiz, size;
plasma = plasma_context_self();
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
/* quick return */
if( (mb < 2) || (nb < 2) ) {
return ;
}
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
bsiz = mb*nb;
/* Panel size and tile counts for the two storage orders (the selecting `if` is not shown). */
psiz = m*nb;
mt = ( m / mb );
nt = ( n / nb );
} else {
psiz = n*mb;
mt = ( n / nb );
nt = ( m / mb );
}
size = m*n;
switch ( idep ) {
/*
* Dependencies on each panel as input
*/
switch ( odep ) {
/* Output on the panel as well: a single (Ap, psiz, INOUT|GATHERV) extra argument. */
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
plasma->quark, &task_flags,
mb, nb, Al, bsiz,
Ap, psiz, INOUT|GATHERV);
}
}
break;
/* Panel as INPUT plus the whole matrix as INOUT|GATHERV. */
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
/* NOTE(review): the INPUT panel is passed with `size`, not `psiz` -- confirm this is intended. */
QUARK_CORE_cgetrip_f2(plasma->quark, &task_flags,
mb, nb, Al, bsiz,
Ap, size, INPUT,
A, size, INOUT|GATHERV);
}
}
break;
default:
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
plasma->quark, &task_flags,
mb, nb, Al, bsiz,
Ap, psiz, INPUT);
}
}
}
break;
/*
* Dependency on all the matrix as input
*/
switch ( odep ) {
/* Whole matrix as INPUT plus the panel as INOUT|GATHERV. */
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
plasma->quark, &task_flags,
mb, nb, Al, bsiz,
A, size, INPUT,
Ap, psiz, INOUT|GATHERV);
}
}
break;
/* Chunks are addressed linearly here: chunk i starts at A[i*bsiz]. */
nprob = mt*nt;
for (i=0; i<nprob; i++) {
QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags,
mb, nb, &(A[ i*bsiz ]), bsiz,
A, size, INOUT|GATHERV);
}
break;
default:
nprob = mt*nt;
for (i=0; i<nprob; i++) {
QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags,
mb, nb, &(A[ i*bsiz ]), bsiz,
A, size, INPUT);
}
}
break;
/*
* No Dependencies as input
*/
default:
switch ( odep ) {
for (j=0; j<nt; j++) {
Ap = A + (psiz*j);
for (i=0; i<mt; i++) {
Al = Ap + i*bsiz;
plasma->quark, &task_flags,
mb, nb, Al, bsiz,
Ap, psiz, INOUT|GATHERV);
}
}
break;
nprob = mt*nt;
for (i=0; i<nprob; i++) {
QUARK_CORE_cgetrip_f1(plasma->quark, &task_flags,
mb, nb, &(A[ i*bsiz ]), bsiz,
A, size, INOUT|GATHERV);
}
break;
default:
/* No extra dependency argument at all: only the chunk itself is passed. */
nprob = mt*nt;
for (i=0; i<nprob; i++) {
QUARK_CORE_cgetrip(plasma->quark, &task_flags,
mb, nb, &(A[ i*bsiz ]), bsiz);
}
}
}
}

Here is the call graph for this function:

void plasma_pcgetrf_incpiv ( plasma_context_t * plasma)

Parallel tile LU factorization - static scheduling

Definition at line 25 of file pcgetrf_incpiv.c.

References A, BLKLDD, CORE_cgessm(), CORE_cgetrf_incpiv(), CORE_cssssm(), CORE_ctstrf(), plasma_desc_t::dtyp, IPIV, L, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_IB, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, plasma_request_fail(), PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_5, ss_abort, ss_aborted, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, and plasma_sequence_t::status.

{
/*
 * Statically scheduled loop over work items (k, m, n). Each pass picks one
 * of four branches depending on whether n == k and m == k, waits on progress
 * markers with ss_cond_wait(), performs one call and (in three of the four
 * branches) publishes progress with ss_cond_set(). The loop also stops as
 * soon as ss_aborted() reports true.
 * NOTE(review): this rendered listing omits some source lines -- the
 * declarations of A, L and `work`, the initial assignment of `n`, the names
 * of the four callees and any ss_abort / ss_finalize calls are not shown.
 * The References list of this function names CORE_cgetrf_incpiv(),
 * CORE_ctstrf(), CORE_cgessm(), CORE_cssssm(), PLASMA_RANK, ss_abort and
 * ss_finalize; confirm against pcgetrf_incpiv.c.
 */
int *IPIV;
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
int next_k;
int next_m;
int next_n;
int ldak, ldam;
int info;
int tempkn, tempkm, tempmm, tempnn;
int ib = PLASMA_IB;
plasma_unpack_args_5(A, L, IPIV, sequence, request);
/* Return without doing anything when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
work = (PLASMA_Complex32_t*)plasma_private_alloc(plasma, ib*L.nb, L.dtyp);
ss_init(A.mt, A.nt, -1);
/* Fold the starting n into range: every wrap past A.nt moves on to the next k. */
k = 0;
while (n >= A.nt) {
k++;
n = n-A.nt+k;
}
m = k;
while (k < min(A.mt, A.nt) && n < A.nt && !ss_aborted()) {
/*
 * Compute the work item that follows (k, m, n): m advances by one; once it
 * reaches A.mt, n jumps by PLASMA_SIZE (wrapping into the next k when it
 * passes A.nt) and m restarts at the new k.
 */
next_n = n;
next_m = m;
next_k = k;
next_m++;
if (next_m == A.mt) {
next_n += PLASMA_SIZE;
while (next_n >= A.nt && next_k < min(A.mt, A.nt)) {
next_k++;
next_n = next_n-A.nt+next_k;
}
next_m = next_k;
}
/* Sizes along each index: the full tile size, or the remainder for the last tile. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldam = BLKLDD(A, m);
if (n == k) {
if (m == k) {
/* Tile (k, k): wait until it carries marker k-1, process it, then mark it with k. */
ss_cond_wait(k, k, k-1);
tempkm, tempkn, ib,
A(k, k), ldak,
IPIV(k, k), &info);
/* A nonzero info is reported only for the last tile row, offset by A.nb*k. */
if (info != 0 && m == A.mt-1) {
plasma_request_fail(sequence, request, info + A.nb*k);
}
ss_cond_set(k, k, k);
}
else {
/* Tiles (k, k) and (m, k) with L(m, k) and IPIV(m, k): wait on (m, k), then mark it with k. */
ss_cond_wait(m, k, k-1);
tempmm, tempkn, ib, A.nb,
A(k, k), ldak,
A(m, k), ldam,
L(m, k), L.mb,
IPIV(m, k),
work, L.nb, &info);
if (info != 0 && m == A.mt-1) {
plasma_request_fail(sequence, request, info + A.nb*k);
}
ss_cond_set(m, k, k);
}
}
else {
if (m == k) {
/* Uses IPIV(k, k) and A(k, k) on tile (k, n); this branch waits but does not call ss_cond_set. */
ss_cond_wait(k, k, k);
ss_cond_wait(k, n, k-1);
tempkm, tempnn, tempkm, ib,
IPIV(k, k),
A(k, k), ldak,
A(k, n), ldak);
}
else {
/* Uses L(m, k), A(m, k) and IPIV(m, k) on tiles (k, n) and (m, n); marks (m, n) with k afterwards. */
ss_cond_wait(m, k, k);
ss_cond_wait(m, n, k-1);
A.nb, tempnn, tempmm, tempnn, A.nb, ib,
A(k, n), ldak,
A(m, n), ldam,
L(m, k), L.mb,
A(m, k), ldam,
IPIV(m, k));
ss_cond_set(m, n, k);
}
}
n = next_n;
m = next_m;
k = next_k;
}
plasma_private_free(plasma, work);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcgetrf_incpiv_quark ( PLASMA_desc  A,
PLASMA_desc  L,
int *  IPIV,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile LU factorization - dynamic scheduling

Definition at line 143 of file pcgetrf_incpiv.c.

References A, BLKLDD, IPIV, L, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_cgessm(), QUARK_CORE_cgetrf_incpiv(), QUARK_CORE_cssssm(), QUARK_CORE_ctstrf(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
    /*
     * Tile LU factorization with incremental pivoting, dynamic (QUARK)
     * scheduling. For every panel step k this inserts, in order:
     *   1. one task on the diagonal tile A(k,k);
     *   2. one task per tile A(k,n) to the right of the diagonal;
     *   3. for every tile row m below the diagonal, one task on the pair
     *      A(k,k)/A(m,k) followed by one task per trailing tile A(m,n).
     *
     * Parameters (from the enclosing prototype):
     *   A        - tile descriptor of the matrix being factored (in/out).
     *   L        - tile descriptor receiving the L(m,k) blocks.
     *   IPIV     - pivot storage, addressed per tile through IPIV(m,k).
     *   sequence - sequence the tasks belong to; nothing is submitted when
     *              its status is not PLASMA_SUCCESS.
     *   request  - request handle forwarded to the factorization kernels.
     *
     * NOTE(review): the extracted listing had lost the four task-insertion
     * call names and the declarations of `plasma` and `task_flags`. They are
     * restored here from the function's References list
     * (QUARK_CORE_cgetrf_incpiv, QUARK_CORE_cgessm, QUARK_CORE_ctstrf,
     * QUARK_CORE_cssssm, Quark_Task_Flags_Initializer), matched to the
     * argument lists by shape. Confirm against pcgetrf_incpiv.c.
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int k, m, n;
    int ldak, ldam;
    int tempkm, tempkn, tempmm, tempnn;
    int ib;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    /* Attach every task inserted below to the caller's QUARK sequence. */
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    ib = PLASMA_IB;
    for (k = 0; k < min(A.mt, A.nt); k++) {
        /* The last tile row/column may be smaller than mb/nb. */
        tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
        tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
        ldak = BLKLDD(A, k);

        /* Diagonal tile of the panel. */
        QUARK_CORE_cgetrf_incpiv(
            plasma->quark, &task_flags,
            tempkm, tempkn, ib, L.nb,
            A(k, k), ldak, IPIV(k, k),
            sequence, request,
            k == A.mt-1, A.nb*k);

        /* Tiles to the right of the diagonal on row k. */
        for (n = k+1; n < A.nt; n++) {
            tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
            QUARK_CORE_cgessm(
                plasma->quark, &task_flags,
                tempkm, tempnn, tempkm, ib, L.nb,
                IPIV(k, k),
                A(k, k), ldak,
                A(k, n), ldak);
        }

        /* Tile rows below the diagonal. */
        for (m = k+1; m < A.mt; m++) {
            tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
            ldam = BLKLDD(A, m);

            /* Panel tile A(m,k) paired with the diagonal tile. */
            QUARK_CORE_ctstrf(
                plasma->quark, &task_flags,
                tempmm, tempkn, ib, L.nb,
                A(k, k), ldak,
                A(m, k), ldam,
                L(m, k), L.mb,
                IPIV(m, k),
                sequence, request,
                m == A.mt-1, A.nb*k);

            /* Trailing tiles on row m. */
            for (n = k+1; n < A.nt; n++) {
                tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
                QUARK_CORE_cssssm(
                    plasma->quark, &task_flags,
                    A.nb, tempnn, tempmm, tempnn, A.nb, ib, L.nb,
                    A(k, n), ldak,
                    A(m, n), ldam,
                    L(m, k), L.mb,
                    A(m, k), ldam,
                    IPIV(m, k));
            }
        }
    }
}

Here is the call graph for this function:

void plasma_pcgetrf_reclap_quark ( PLASMA_desc  A,
int *  IPIV,
PLASMA_sequence sequence,
PLASMA_request request 
)
void plasma_pcgetrf_rectil_quark ( PLASMA_desc  A,
int *  IPIV,
PLASMA_sequence sequence,
PLASMA_request request 
)
void plasma_pchbrdt_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
float *  D,
float *  E,
PLASMA_desc  T,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler

Definition at line 28 of file pchbrdt.c.

References A, C, cabsf(), DEP, plasma_desc_t::dtyp, lapack_const, plasma_desc_t::lm, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_context_self(), plasma_element_size(), plasma_sequence_flush(), plasma_shared_alloc(), plasma_shared_free(), PLASMA_SUCCESS, PlasmaComplexFloat, PlasmaInteger, PlasmaLower, plasma_context_struct::quark, QUARK_Barrier(), QUARK_CORE_ctrdalg(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Reduces the band matrix held in A to its final condensed form and stores
 * the resulting diagonal in D and off-diagonal in E (dynamic scheduler).
 *
 * Outline, as far as the visible code shows:
 *   - N == 0: return.
 *   - NB == 0: D receives the diagonal of A, E is zeroed.
 *   - NB == 1: no bulge chasing; D/E are read (and, under COMPLEX, made
 *     real) directly from A.
 *   - otherwise: a loop nest inserts one task per (i, j, m) sweep step with
 *     DEP(...) dependencies, then D/E are copied out after a barrier.
 *
 * NOTE(review): this listing appears to have lost lines during extraction.
 * plasma, task_flags, ztmp, zone, C and S are used without a visible
 * declaration or allocation; both "#ifdef COMPLEX" sections contain a
 * "} else {" with no visible opening "if"; and the task-insertion argument
 * list in the inner loop has no call name (References names
 * QUARK_CORE_ctrdalg). Check pchbrdt.c before relying on this text.
 */
#ifdef COMPLEX
static float dzero = (float) 0.0;
float absztmp;
#endif
int N, NB, INgrsiz, INthgrsiz, BAND;
int myid, grsiz, shift=3, stt, st, ed, stind, edind;
int blklastind, colpt, PCOL, ACOL, MCOL;
int stepercol, mylastid, grnb, grid;
int *DEP,*MAXID;
int i, j, m;
int thgrsiz, thgrnb, thgrid, thed;
/* NOTE(review): eltsize is not used anywhere in the visible body. */
size_t eltsize = plasma_element_size(A.dtyp);
plasma = plasma_context_self();
/* Return immediately when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
N = A.m;
NB = A.mb;
/* Quick return */
if (N == 0){
return;
}
/* NB == 0: only the diagonal is copied out; E stays zero. */
if (NB == 0) {
memset(D, 0, N*sizeof(float));
memset(E, 0, (N-1)*sizeof(float));
#ifdef COMPLEX
for (i=0; i<N; i++)
D[i] = cabsf(*A(i,i));
#else
for (i=0; i<N; i++)
D[i] = *A(i,i);
#endif
return;
}
/*
* A barrier is used because the bulge chasing has to wait until
* the reduction to band has finished.
* Otherwise, this BARRIER can be removed once the function
* dependencies are linked inside the reduction to band.
* Keep in mind the case NB=1, where there is no bulge chasing.
*/
/***************************************************************/
QUARK_Barrier(plasma->quark);
/***************************************************************/
/*
* Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
* Make diagonal and superdiagonal elements real, storing them in
* D and E. if PlasmaLower, first transform lower bidiagonal form
* to upper bidiagonal by applying plane rotations/ Householder
* from the left, overwriting superdiagonal elements then make
* elements real of the resulting upper Bidiagonal. if PlasmaUpper
* then make its elements real. For Q, PT: ZSCAL should be done
* in case of WANTQ.
*/
if (NB == 1){
memset(D, 0, N *sizeof(float));
memset(E, 0, (N-1)*sizeof(float));
#ifdef COMPLEX
/* NOTE(review): the "} else { PlasmaUpper" below has no visible matching
 * "if"; presumably an "if (uplo == PlasmaLower) {" line was dropped here. */
for (i=0; i<N; i++)
{
D[i] = crealf( *A(i, i) ); /* diag value */
if( i < (N-1)) { /* lower off-diag value */
/* Replace the off-diagonal entry by its modulus and fold its phase
 * (ztmp / |ztmp|, or zone when the entry is zero) into the next
 * off-diagonal entry. */
ztmp = *A((i+1),i);
absztmp = cabsf(ztmp);
*A((i+1),i) = absztmp;
E[i] = absztmp;
if(absztmp != dzero)
ztmp = (PLASMA_Complex32_t) (ztmp / absztmp);
else
ztmp = zone;
if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
/* for Q: ZSCAL should be done in case of WANTQ */
}
}
} else { /* PlasmaUpper */
for (i=0; i<N; i++)
{
D[i] = crealf( *A(i,i) ); /* diag value*/
if(i<(N-1)) { /* upper off-diag value */
ztmp = *A(i, (i+1));
absztmp = cabsf(ztmp);
*A(i,(i+1)) = absztmp;
E[i] = absztmp;
if(absztmp != dzero)
ztmp = (PLASMA_Complex32_t) (ztmp / absztmp);
else
ztmp = zone;
if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
/* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
}
}
} /* end PlasmaUpper*/
#else
if( uplo == PlasmaLower ){
for (i=0; i < N-1; i++) {
D[i] = *A(i, i);
E[i] = *A(i+1, i);
}
/* i == N-1 here: copy the last diagonal entry. */
D[i] = *A(i, i);
} else {
for (i=0; i < N-1; i++) {
D[i] = *A(i, i );
E[i] = *A(i, i+1);
}
/* i == N-1 here: copy the last diagonal entry. */
D[i] = *A(i, i);
}
#endif
return;
}
/* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */
/* NOTE(review): N == 0 already returned above, so this branch can only run
 * for negative N. In this listing work and TTau are passed to LAPACKE and
 * freed with no visible allocation - presumably two plasma_shared_alloc
 * lines were dropped; confirm. */
if( N <= 0 ) /* this will be removed; we don't need it. */
{
PLASMA_Complex32_t *work, *TTau;
int info, ldwork = N*N;
info = LAPACKE_chetrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
A(0,0), A.lm, D, E, TTau, work, ldwork);
plasma_shared_free(plasma, (void*) work);
plasma_shared_free(plasma, (void*) TTau);
if( info == 0 )
sequence->status = PLASMA_SUCCESS;
else
plasma_sequence_flush(plasma->quark, sequence, request, info);
return;
}
/* General case NB > 1 && N > NB */
/* DEP and MAXID each hold N+1 ints; only MAXID is zeroed here. */
DEP = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger );
MAXID = (int *) plasma_shared_alloc(plasma, N+1, PlasmaInteger );
memset(MAXID,0,(N+1)*sizeof(int));
/***************************************************************************
* START BULGE CHASING CODE
**************************************************************************/
/*
* Initialisation of local parameters. These parameters should be
* input or tuned parameters.
*/
/* Group size chosen from NB (and N for 100 < NB <= 160). */
INgrsiz = 1;
if( NB > 160 ) {
INgrsiz = 2;
}
else if( NB > 100 ) {
if( N < 5000 )
INgrsiz = 2;
else
INgrsiz = 4;
} else {
INgrsiz = 6;
}
INthgrsiz = N;
BAND = 0;
grsiz = INgrsiz;
thgrsiz = INthgrsiz;
if( grsiz == 0 ) grsiz = 6;
if( thgrsiz == 0 ) thgrsiz = N;
/* stepercol = ceil(shift / grsiz); thgrnb = ceil((N-2) / thgrsiz). */
i = shift/grsiz;
stepercol = i*grsiz == shift ? i:i+1;
i = (N-2)/thgrsiz;
thgrnb = i*thgrsiz == (N-2) ? i:i+1;
for (thgrid = 1; thgrid<=thgrnb; thgrid++){
stt = (thgrid-1)*thgrsiz+1;
thed = min( (stt + thgrsiz -1), (N-2));
for (i = stt; i <= N-2; i++){
ed=min(i,thed);
if(stt>ed)break;
for (m = 1; m <=stepercol; m++){
st=stt;
for (j = st; j <=ed; j++){
/* PCOL: dependency on the ID of the master of the group of the previous column. (Previous Column:PCOL). */
/* ACOL: dependency on the ID of the master of the previous group of my column. (Actual Column:ACOL). (it is 0(NULL) for myid=1) */
/* MCOL: OUTPUT dependency on my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
myid = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
mylastid = myid+grsiz-1;
PCOL = mylastid+shift-1; /* to know the dependent ID of the previous column. need to know the master of its group */
MAXID[j] = myid;
PCOL = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernels, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
grnb = PCOL/grsiz;
grid = grnb*grsiz == PCOL ? grnb:grnb+1;
PCOL = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */
ACOL = myid-grsiz;
if(myid==1)ACOL=0;
MCOL = myid;
/* NOTE(review): call name missing in this listing; References names
 * QUARK_CORE_ctrdalg( as the only QUARK kernel - confirm. */
plasma->quark, &task_flags,
uplo, N, NB,
&A, C, S, i, j, m, grsiz, BAND,
DEP(PCOL), DEP(ACOL), DEP(MCOL) );
/* blklastind is the last index reached by this task's group; once it
 * reaches N-1 the starting column stt is advanced by one. */
if(mylastid%2 ==0){
blklastind = (mylastid/2)*NB+1+j-1;
}else{
colpt = ((mylastid+1)/2)*NB + 1 +j -1 ;
stind = colpt-NB+1;
edind = min(colpt,N);
if( (stind>=edind-1) && (edind==N) )
blklastind=N;
else
blklastind=0;
}
if(blklastind >= (N-1)) stt=stt+1;
} /* END for j=st:ed */
} /* END for m=1:stepercol */
} /* END for i=1:MINMN-2 */
} /* END for thgrid=1:thgrnb */
/*
* Barrier used only for now, to be sure that everything
* is done before copying D and E and freeing the workspace.
* This will be removed later when D and E are filled directly
* during the bulge process.
*/
QUARK_Barrier(plasma->quark);
plasma_shared_free(plasma, (void*) DEP);
plasma_shared_free(plasma, (void*) MAXID);
/* NOTE(review): C and S are freed here but their allocation is not visible
 * in this listing. */
plasma_shared_free(plasma, (void*) C);
plasma_shared_free(plasma, (void*) S);
/*
* STORE THE RESULTING diagonal/off-diagonal in D AND E
*/
memset(D, 0, N *sizeof(float));
memset(E, 0, (N-1)*sizeof(float));
/* Make diagonal and superdiagonal elements real,
* storing them in D and E
*/
/* In the complex case, the off-diagonal elements are
* not necessarily real. We have to make the off-diagonal
* elements real and copy them to E.
* When using Householder elimination,
* ZLARFG gives us a real as output, so all the
* diagonal/off-diagonal elements except the last one are already
* real and thus we need only take the abs of the last
* one.
* */
#ifdef COMPLEX
/* NOTE(review): as in the NB == 1 case, the opening
 * "if (uplo == PlasmaLower) {" for the "} else {" below is not visible. */
for (i=0; i < N-1 ; i++)
{
D[i] = crealf( *A(i,i) );
/*
* Alternative for Householder case, all off-diag
* are real except the last off-diag, where we
* have to take the abs
*/
if(i<(N-2))
E[i] = crealf(*A(i+1, i));
else
E[i] = cabsf( *A(i+1, i));
}
D[i] = crealf( *A(i, i) );
} else { /* PlasmaUpper */
for (i=0; i<N-1; i++)
{
D[i] = crealf( *A(i,i) );
/*
* Alternative for Householder case, all off-diag
* are real except the last off-diag, where we
* have to take the abs
*/
if( i < (N-2) )
E[i] = crealf(*A(i, (i+1)));
else
E[i] = cabsf(*A(i, (i+1)));
}
D[i] = crealf( *A(i, i) );
} /* end PlasmaUpper */
#else
if( uplo == PlasmaLower ){
for (i=0; i < N-1; i++) {
D[i] = *A(i, i);
E[i] = *A(i+1, i);
}
D[i] = *A(i, i);
} else {
for (i=0; i < N-1; i++) {
D[i] = *A(i, i );
E[i] = *A(i, i+1);
}
D[i] = *A(i, i);
}
#endif
} /* END FUNCTION */

Here is the call graph for this function:

void plasma_pchegst_quark ( PLASMA_enum  itype,
PLASMA_enum  uplo,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel Transformation to standard eigenvalue problem - dynamic scheduler

Definition at line 22 of file pchegst.c.

References B, BLKLDD, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_desc_submatrix(), plasma_pchemm_quark(), plasma_pcher2k_quark(), plasma_pctrmm_quark(), plasma_pctrsm_quark(), PLASMA_SUCCESS, PlasmaConjTrans, PlasmaLeft, PlasmaLower, PlasmaNonUnit, PlasmaNoTrans, PlasmaRight, plasma_context_struct::quark, QUARK_CORE_chegst(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Transformation of a generalized Hermitian-definite eigenproblem to
 * standard form, dynamic scheduler. The body has four branches, selected by
 * (itype == 1 or not) and (uplo == PlasmaLower or not). Each branch loops over
 * the diagonal tiles k of A and combines one tile-level task on A(k,k)/B(k,k)
 * with five operations on submatrices of A and B.
 *
 * NOTE(review): this listing appears to have lost lines during extraction.
 * plasma and task_flags have no visible declaration, and every call below has
 * lost its name (and, for most of them, its leading side/uplo/trans argument
 * line) - only the trailing arguments remain. References names
 * QUARK_CORE_chegst, plasma_pctrsm_quark, plasma_pchemm_quark,
 * plasma_pcher2k_quark and plasma_pctrmm_quark as the callees. Check
 * pchegst.c before relying on this text.
 */
int k;
int ldak, ldbk;
int tempkn;
static float done = 1.0;
static PLASMA_Complex32_t zone = 1.0;
static PLASMA_Complex32_t mzone = -1.0;
static PLASMA_Complex32_t zhalf = 0.5;
static PLASMA_Complex32_t mzhalf = -0.5;
plasma = plasma_context_self();
/* Return immediately when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if (itype == 1) {
if (uplo == PlasmaLower) {
/* itype == 1, lower: tile task first, then update the block column below
 * the diagonal tile and the trailing submatrix. */
for (k = 0; k < A.nt; k++){
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
/* NOTE(review): call name missing; presumably QUARK_CORE_chegst( - confirm. */
plasma->quark, &task_flags,
itype, uplo, tempkn,
A(k, k), ldak,
B(k, k), ldbk,
sequence, request, A.nb*k);
/* Only when rows remain below diagonal tile k. */
if (k*A.nb+tempkn < A.n) {
/* NOTE(review): the five argument lists below have lost their call names
 * and leading argument lines. */
zone,
plasma_desc_submatrix(B, k*B.nb, k*B.nb, tempkn, tempkn),
plasma_desc_submatrix(A, k*A.nb+tempkn, k*A.nb, A.n-k*A.nb-tempkn, tempkn),
sequence, request);
PlasmaRight, uplo, mzhalf,
plasma_desc_submatrix(A, k*A.nb, k*A.nb, tempkn, tempkn),
plasma_desc_submatrix(B, k*B.nb+tempkn, k*B.nb, B.n-k*B.nb-tempkn, tempkn),
zone,
plasma_desc_submatrix(A, k*A.nb+tempkn, k*A.nb, A.n-k*A.nb-tempkn, tempkn),
sequence, request);
mzone,
plasma_desc_submatrix(A, k*A.nb+tempkn, k*A.nb, A.n-k*A.nb-tempkn, tempkn),
plasma_desc_submatrix(B, k*B.nb+tempkn, k*B.nb, B.n-k*B.nb-tempkn, tempkn),
done,
plasma_desc_submatrix(A, k*A.nb+tempkn, k*A.nb+tempkn, A.n-k*A.nb-tempkn, A.n-k*A.nb-tempkn),
sequence, request);
mzhalf,
plasma_desc_submatrix(A, k*A.nb, k*A.nb, tempkn, tempkn),
plasma_desc_submatrix(B, k*B.nb+tempkn, k*B.nb, B.n-k*B.nb-tempkn, tempkn),
zone,
plasma_desc_submatrix(A, k*A.nb+tempkn, k*A.nb, A.n-k*A.nb-tempkn, tempkn),
sequence, request);
/* NOTE(review): the B submatrix below starts at the trailing block but is
 * given extent tempkn x tempkn, while the A operand it is paired with has
 * A.n-k*A.nb-tempkn rows - confirm the intended extent. */
zone,
plasma_desc_submatrix(B, k*B.nb+tempkn, k*B.nb+tempkn, tempkn, tempkn),
plasma_desc_submatrix(A, k*A.nb+tempkn, k*A.nb, A.n-k*A.nb-tempkn, tempkn),
sequence, request);
}
}
}
else {
/* itype == 1, upper: same sequence as above with the block row to the
 * right of the diagonal tile in place of the block column below it. */
for (k = 0; k < A.nt; k++){
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
/* NOTE(review): call name missing; presumably QUARK_CORE_chegst( - confirm. */
plasma->quark, &task_flags,
itype, uplo, tempkn,
A(k, k), ldak,
B(k, k), ldbk,
sequence, request, A.nb*k);
if (k*A.nb+tempkn < A.n) {
zone,
plasma_desc_submatrix(B, k*B.nb, k*B.nb, tempkn, tempkn),
plasma_desc_submatrix(A, k*A.nb, k*A.nb+tempkn, tempkn, A.n-k*A.nb-tempkn),
sequence, request);
PlasmaLeft, uplo, mzhalf,
plasma_desc_submatrix(A, k*A.nb, k*A.nb, tempkn, tempkn),
plasma_desc_submatrix(B, k*B.nb, k*B.nb+tempkn, tempkn, B.n-k*B.nb-tempkn),
zone,
plasma_desc_submatrix(A, k*A.nb, k*A.nb+tempkn, tempkn, A.n-k*A.nb-tempkn),
sequence, request);
mzone,
plasma_desc_submatrix(A, k*A.nb, k*A.nb+tempkn, tempkn, A.n-k*A.nb-tempkn),
plasma_desc_submatrix(B, k*B.nb, k*B.nb+tempkn, tempkn, B.n-k*B.nb-tempkn),
done,
plasma_desc_submatrix(A, k*A.nb+tempkn, k*A.nb+tempkn, A.n-k*A.nb-tempkn, A.n-k*A.nb-tempkn),
sequence, request);
mzhalf,
plasma_desc_submatrix(A, k*A.nb, k*A.nb, tempkn, tempkn),
plasma_desc_submatrix(B, k*B.nb, k*B.nb+tempkn, tempkn, B.n-k*B.nb-tempkn),
zone,
plasma_desc_submatrix(A, k*A.nb, k*A.nb+tempkn, tempkn, A.n-k*A.nb-tempkn),
sequence, request);
/* NOTE(review): same tempkn x tempkn extent question as in the lower
 * branch - confirm. */
zone,
plasma_desc_submatrix(B, k*B.nb+tempkn, k*B.nb+tempkn, tempkn, tempkn),
plasma_desc_submatrix(A, k*A.nb, k*A.nb+tempkn, tempkn, A.n-k*A.nb-tempkn),
sequence, request);
}
}
}
}
else{
if (uplo == PlasmaLower) {
/* itype != 1, lower: update the block row left of the diagonal tile and
 * the leading submatrix first, then run the tile task on A(k,k). */
for (k = 0; k < A.nt; k++){
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
/* NOTE(review): the five argument lists below have lost their call names
 * and leading argument lines. */
zone,
plasma_desc_submatrix(B, 0, 0, k*B.nb, k*B.nb),
plasma_desc_submatrix(A, k*A.nb, 0, tempkn, k*A.nb),
sequence, request);
PlasmaLeft, uplo, zhalf,
plasma_desc_submatrix(A, k*A.nb, k*A.nb, tempkn, tempkn),
plasma_desc_submatrix(B, k*B.nb, 0, tempkn, k*B.nb),
zone,
plasma_desc_submatrix(A, k*A.nb, 0, tempkn, k*A.nb),
sequence, request);
zone,
plasma_desc_submatrix(A, k*A.nb, 0, tempkn, k*A.nb),
plasma_desc_submatrix(B, k*B.nb, 0, tempkn, k*B.nb),
done,
plasma_desc_submatrix(A, 0, 0, k*A.nb, k*A.nb),
sequence, request);
PlasmaLeft, uplo, zhalf,
plasma_desc_submatrix(A, k*A.nb, k*A.nb, tempkn, tempkn),
plasma_desc_submatrix(B, k*B.nb, 0, tempkn, k*B.nb),
zone,
plasma_desc_submatrix(A, k*A.nb, 0, tempkn, k*A.nb),
sequence, request);
zone,
plasma_desc_submatrix(B, k*B.nb, k*B.nb, tempkn, tempkn),
plasma_desc_submatrix(A, k*A.nb, 0, tempkn, k*A.nb),
sequence, request);
/* NOTE(review): call name missing; presumably QUARK_CORE_chegst( - confirm. */
plasma->quark, &task_flags,
itype, uplo, tempkn,
A(k, k), ldak,
B(k, k), ldbk,
sequence, request, A.nb*k);
}
}
else {
/* itype != 1, upper: as above with the block column above the diagonal
 * tile in place of the block row to its left. */
for (k = 0; k < A.nt; k++){
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
zone,
plasma_desc_submatrix(B, 0, 0, k*B.nb, k*B.nb),
plasma_desc_submatrix(A, 0, k*A.nb, k*A.nb, tempkn),
sequence, request);
/* NOTE(review): the diagonal block of A is given extent k*A.nb x k*A.nb
 * here, whereas the lower branch uses tempkn x tempkn for the same block -
 * confirm the intended extent. */
PlasmaRight, uplo, zhalf,
plasma_desc_submatrix(A, k*A.nb, k*A.nb, k*A.nb, k*A.nb),
plasma_desc_submatrix(B, 0, k*B.nb, k*B.nb, tempkn),
zone,
plasma_desc_submatrix(A, 0, k*A.nb, k*A.nb, tempkn),
sequence, request);
zone,
plasma_desc_submatrix(A, 0, k*A.nb, k*A.nb, tempkn),
plasma_desc_submatrix(B, 0, k*B.nb, k*B.nb, tempkn),
done,
plasma_desc_submatrix(A, 0, 0, k*A.nb, k*A.nb),
sequence, request);
/* NOTE(review): same extent question as the previous PlasmaRight call. */
PlasmaRight, uplo, zhalf,
plasma_desc_submatrix(A, k*A.nb, k*A.nb, k*A.nb, k*A.nb),
plasma_desc_submatrix(B, 0, k*B.nb, k*B.nb, tempkn),
zone,
plasma_desc_submatrix(A, 0, k*A.nb, k*A.nb, tempkn),
sequence, request);
zone,
plasma_desc_submatrix(B, k*B.nb, k*B.nb, tempkn, tempkn),
plasma_desc_submatrix(A, 0, k*A.nb, k*A.nb, tempkn),
sequence, request);
/* NOTE(review): call name missing; presumably QUARK_CORE_chegst( - confirm. */
plasma->quark, &task_flags,
itype, uplo, tempkn,
A(k, k), ldak,
B(k, k), ldbk,
sequence, request, A.nb*k);
}
}
}
}

Here is the call graph for this function:

void plasma_pcherbt_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
PLASMA_desc  T,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile BAND Tridiagonal Reduction - dynamic scheduler

Definition at line 23 of file pcherbt.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_IB, PLASMA_SUCCESS, PlasmaConjTrans, PlasmaLeft, PlasmaLower, PlasmaNoTrans, PlasmaRight, PlasmaUpper, plasma_context_struct::quark, QUARK_CORE_cgelqt(), QUARK_CORE_cgeqrt(), QUARK_CORE_cherfb(), QUARK_CORE_ctslqt(), QUARK_CORE_ctsmlq(), QUARK_CORE_ctsmlq_corner(), QUARK_CORE_ctsmlq_hetra1(), QUARK_CORE_ctsmqr(), QUARK_CORE_ctsmqr_corner(), QUARK_CORE_ctsmqr_hetra1(), QUARK_CORE_ctsqrt(), QUARK_CORE_cunmlq(), QUARK_CORE_cunmqr(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, T, and TASK_SEQUENCE.

{
/*
 * Tile reduction of a Hermitian matrix to band form, dynamic scheduler.
 * The lower and upper branches mirror each other: for each step k a
 * factorization task runs on the tile next to the diagonal (A(k+1,k) or
 * A(k,k+1)), its result is applied to the diagonal tile A(k+1,k+1) and to the
 * rest of that block column/row, and then each remaining tile of the panel
 * is eliminated and the update is applied from the left, from the right and
 * to the two "corner" diagonal tiles.
 *
 * NOTE(review): this listing appears to have lost lines during extraction.
 * plasma and task_flags have no visible declaration and every task-insertion
 * call has lost its name (and any leading side/trans argument line).
 * References names QUARK_CORE_cgeqrt, QUARK_CORE_cherfb, QUARK_CORE_cunmqr,
 * QUARK_CORE_ctsqrt, QUARK_CORE_ctsmqr_hetra1, QUARK_CORE_ctsmqr,
 * QUARK_CORE_ctsmqr_corner and their LQ counterparts as the callees; which
 * name belongs to which argument list must be confirmed in pcherbt.c.
 */
int k, m, n, i, j;
int ldak, ldam, ldan, ldaj, ldai;
int tempkn, tempmm, tempnn, tempjj;
int ib;
plasma = plasma_context_self();
/* Return immediately when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
ib = PLASMA_IB;
if (uplo == PlasmaLower) {
for (k = 0; k < A.nt-1; k++){
/* Extent and leading dimension of tile row k+1. */
tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
ldak = BLKLDD(A, k+1);
/* Factorization task on A(k+1,k) (call name missing in this listing). */
plasma->quark, &task_flags,
tempkn, A.nb, ib, T.nb,
A(k+1, k), ldak,
T(k+1, k), T.mb);
/* LEFT and RIGHT on the symmetric diagonal block */
plasma->quark, &task_flags,
tempkn, tempkn, ib, T.nb,
A(k+1, k), ldak,
T(k+1, k), T.mb,
A(k+1, k+1), ldak);
/* RIGHT on the remaining tiles until the bottom */
for (m = k+2; m < A.mt ; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
plasma->quark, &task_flags,
tempmm, A.nb, tempkn, ib, T.nb,
A(k+1, k), ldak,
T(k+1, k), T.mb,
A(m , k+1), ldam);
}
for (m = k+2; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
/* Task pairing A(k+1,k) with A(m,k) (call name missing in this listing). */
plasma->quark, &task_flags,
tempmm, A.nb, ib, T.nb,
A(k+1, k), ldak,
A(m , k), ldam,
T(m , k), T.mb);
/* LEFT */
for (i = k+2; i < m; i++) {
ldai = BLKLDD(A, i);
plasma->quark, &task_flags,
A.mb, A.nb, tempmm, A.nb, A.nb, ib, T.nb,
A(i, k+1), ldai,
A(m, i), ldam,
A(m, k), ldam,
T(m, k), T.mb);
}
/* RIGHT */
for (j = m+1; j < A.mt ; j++) {
tempjj = j == A.mt-1 ? A.m-j*A.mb : A.mb;
ldaj = BLKLDD(A, j);
plasma->quark, &task_flags,
tempjj, A.nb, tempjj, tempmm, A.nb, ib, T.nb,
A(j, k+1), ldaj,
A(j, m), ldaj,
A(m, k), ldam,
T(m, k), T.mb);
}
/* LEFT->RIGHT */
plasma->quark, &task_flags,
A.nb, A.nb, tempmm, A.nb, tempmm, tempmm, A.nb, ib, T.nb,
A(k+1, k+1), ldak,
A(m , k+1), ldam,
A(m , m), ldam,
A(m , k), ldam,
T(m , k), T.mb);
}
}
}
else {
/* Upper case: same structure on tile rows instead of tile columns. Tiles
 * on row k are passed with leading dimension A.nb rather than BLKLDD. */
for (k = 0; k < A.nt-1; k++){
tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
ldak = BLKLDD(A, k+1);
/* Factorization task on A(k,k+1) (call name missing in this listing). */
plasma->quark, &task_flags,
A.nb, tempkn, ib, T.nb,
A(k, k+1), A.nb,
T(k, k+1), T.mb);
/* RIGHT and LEFT on the symmetric diagonal block */
plasma->quark, &task_flags,
tempkn, tempkn, ib, T.nb,
A(k, k+1), A.nb,
T(k, k+1), T.mb,
A(k+1, k+1), ldak);
/* LEFT on the remaining tiles until the right side */
for (n = k+2; n < A.nt ; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
A.nb, tempnn, tempkn, ib, T.nb,
A(k, k+1), A.nb,
T(k, k+1), T.mb,
A(k+1, n), ldak);
}
for (n = k+2; n < A.nt; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldan = BLKLDD(A, n);
/* Task pairing A(k,k+1) with A(k,n) (call name missing in this listing). */
plasma->quark, &task_flags,
A.nb, tempnn, ib, T.nb,
A(k, k+1), A.nb,
A(k, n), A.nb,
T(k, n), T.mb);
/* RIGHT */
for (i = k+2; i < n; i++) {
ldai = BLKLDD(A, i);
plasma->quark, &task_flags,
A.mb, A.nb, A.nb, tempnn, A.nb, ib, T.nb,
A(k+1, i), ldak,
A(i, n), ldai,
A(k, n), A.nb,
T(k, n), T.mb);
}
/* LEFT */
for (j = n+1; j < A.nt ; j++) {
tempjj = j == A.nt-1 ? A.n-j*A.nb : A.nb;
/* NOTE(review): ldaj is computed but not used in the visible call below. */
ldaj = BLKLDD(A, j);
plasma->quark, &task_flags,
A.nb, tempjj, tempnn, tempjj, A.nb, ib, T.nb,
A(k+1, j), ldak,
A(n, j), ldan,
A(k, n), A.nb,
T(k, n), T.mb);
}
/* RIGHT->LEFT */
plasma->quark, &task_flags,
A.nb, A.nb, A.nb, tempnn, tempnn, tempnn, A.nb, ib, T.nb,
A(k+1, k+1), ldak,
A(k+1, n), ldak,
A(n , n), ldan,
A(k , n), A.nb,
T(k , n), T.mb);
}
}
}
}

Here is the call graph for this function:

void plasma_pclacpy ( plasma_context_t * plasma )

Definition at line 23 of file pclacpy.c.

References A, B, BLKLDD, CORE_clacpy(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_5, PlasmaLower, PlasmaUpper, PlasmaUpperLower, plasma_sequence_t::status, and uplo.

{
/*
 * Static-scheduling worker body that copies tiles of A into B. The switch on
 * uplo has three sections (upper, lower, full). In each one the worker walks
 * tile coordinates (m, n) in steps of PLASMA_SIZE and copies one X-by-Y tile
 * per iteration; in the upper/lower sections the diagonal tiles (m == n) are
 * copied with the triangular uplo and the others with PlasmaUpperLower.
 *
 * NOTE(review): this listing appears to have lost lines during extraction.
 * uplo, A and B have no visible declaration; the "case PlasmaUpper:" and
 * "case PlasmaLower:" labels are missing (only their comments remain); the
 * starting value of n (upper) or m (lower/full) is never assigned -
 * presumably PLASMA_RANK, which References lists but the body never uses;
 * and the three copy argument lists have no call name (References names
 * CORE_clacpy). Check pclacpy.c before relying on this text.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int X, Y;
int m, n;
int next_m;
int next_n;
int ldam, ldbm;
plasma_unpack_args_5(uplo, A, B, sequence, request);
/* Return immediately when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
switch (uplo) {
/*
* PlasmaUpper
*/
m = 0;
/* Fold the starting column into row m's range; each wrap moves one tile
 * row down and skips the m tiles left of the diagonal. */
while (n >= A.nt) {
m++;
n = n - A.nt + m;
}
while (m < A.mt) {
/* Coordinates of the tile this worker handles next. */
next_m = m;
next_n = n;
next_n += PLASMA_SIZE;
while (next_n >= A.nt && next_m < A.mt) {
next_m++;
next_n = next_n - A.nt + next_m;
}
/* Tile extents: the last tile row/column may be partial. */
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* NOTE(review): call name missing; presumably CORE_clacpy( - confirm. */
m == n ? uplo : PlasmaUpperLower,
X, Y,
A(m, n), ldam,
B(m, n), ldbm);
n = next_n;
m = next_m;
}
break;
/*
* PlasmaLower
*/
n = 0;
/* Mirror of the upper case: walk down columns, skipping the n tiles above
 * the diagonal on each wrap. */
while (m >= A.mt) {
n++;
m = m - A.mt + n;
}
while (n < A.nt) {
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m - A.mt + next_n;
}
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* NOTE(review): call name missing; presumably CORE_clacpy( - confirm. */
m == n ? uplo : PlasmaUpperLower,
X, Y,
A(m, n), ldam,
B(m, n), ldbm);
n = next_n;
m = next_m;
}
break;
/*
* PlasmaUpperLower
*/
case PlasmaUpperLower:
default:
n = 0;
/* Full matrix: plain column-major walk over all tiles. */
while (m >= A.mt) {
n++;
m = m - A.mt;
}
while (n < A.nt) {
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m - A.mt;
}
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* NOTE(review): call name missing; presumably CORE_clacpy( - confirm. */
PlasmaUpperLower,
X, Y,
A(m, n), ldam,
B(m, n), ldbm);
n = next_n;
m = next_m;
}
break;
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pclacpy_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
PLASMA_desc  B,
PLASMA_sequence sequence,
PLASMA_request request 
)

Definition at line 153 of file pclacpy.c.

References B, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaLower, PlasmaUpper, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_clacpy(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling tile copy of A into B. The switch on uplo has three
 * sections: upper (diagonal tile plus the tiles to its right on each tile
 * row), lower (diagonal tile plus the tiles to its left) and full (every
 * tile). One task is inserted per tile.
 *
 * NOTE(review): this listing appears to have lost lines during extraction.
 * plasma and task_flags have no visible declaration; the "case PlasmaUpper:"
 * and "case PlasmaLower:" labels are missing (only their comments remain);
 * and every task-insertion argument list has lost its call name and,
 * apparently, the uplo argument that would follow &task_flags. References
 * names QUARK_CORE_clacpy. Check pclacpy.c before relying on this text.
 */
int X, Y;
int m, n;
int ldam, ldbm;
plasma = plasma_context_self();
/* Return immediately when the sequence status is not PLASMA_SUCCESS. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
switch (uplo) {
/*
* PlasmaUpper
*/
for (m = 0; m < A.mt; m++) {
/* Tile-row extent and leading dimensions; the last row may be partial. */
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* Diagonal tile, only when tile row m has one. */
if (m < A.nt) {
Y = m == A.nt-1 ? A.n-m*A.nb : A.nb;
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, m), ldam,
B(m, m), ldbm);
}
/* Tiles strictly to the right of the diagonal. */
for (n = m+1; n < A.nt; n++) {
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, n), ldam,
B(m, n), ldbm);
}
}
break;
/*
* PlasmaLower
*/
for (m = 0; m < A.mt; m++) {
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
/* Diagonal tile, only when tile row m has one. */
if (m < A.nt) {
Y = m == A.nt-1 ? A.n-m*A.nb : A.nb;
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, m), ldam,
B(m, m), ldbm);
}
/* Tiles strictly to the left of the diagonal. */
for (n = 0; n < min(m, A.nt); n++) {
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, n), ldam,
B(m, n), ldbm);
}
}
break;
/*
* PlasmaUpperLower
*/
default:
/* Full matrix: every tile. */
for (m = 0; m < A.mt; m++) {
X = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
ldbm = BLKLDD(B, m);
for (n = 0; n < A.nt; n++) {
Y = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
X, Y, A.mb,
A(m, n), ldam,
B(m, n), ldbm);
}
}
}
}

Here is the call graph for this function:

void plasma_pclag2z ( plasma_context_t * plasma )

Definition at line 111 of file pzlag2c.c.

References B, BLKLDD, CORE_clag2z(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_4, SA, and plasma_sequence_t::status.

{
    /*
     * Static-scheduling worker body that converts the tiles of SA into B by
     * calling CORE_clag2z once per tile. Tiles are numbered column-major;
     * each worker starts at tile PLASMA_RANK and advances by PLASMA_SIZE
     * tiles per iteration.
     *
     * Arguments are unpacked from the context with plasma_unpack_args_4:
     *   SA       - source tile descriptor.
     *   B        - destination tile descriptor.
     *   sequence - nothing is done when its status is not PLASMA_SUCCESS.
     *   request  - unpacked but not otherwise used here.
     *
     * Changes relative to the extracted listing:
     *   - the row extent X of a full tile is SA.mb (was SA.nb), matching the
     *     partial-tile expression SA.m-SA.mb*m and the _quark variant;
     *   - the starting tile m = PLASMA_RANK and the SA/B declarations, which
     *     the listing had lost, are restored (References for this function
     *     lists PLASMA_RANK; confirm against pzlag2c.c).
     */
    PLASMA_desc SA;
    PLASMA_desc B;
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    int X, Y;
    int m, n;
    int ldam, ldbm;
    int next_m;
    int next_n;

    plasma_unpack_args_4(SA, B, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Convert the linear start index PLASMA_RANK into (m, n) coordinates. */
    n = 0;
    m = PLASMA_RANK;
    while (m >= SA.mt && n < SA.nt) {
        n++;
        m = m-SA.mt;
    }

    while (n < SA.nt) {
        /* Coordinates of the tile this worker handles next. */
        next_m = m;
        next_n = n;
        next_m += PLASMA_SIZE;
        while (next_m >= SA.mt && next_n < SA.nt) {
            next_n++;
            next_m = next_m-SA.mt;
        }

        /* Tile extents: rows are measured in mb, columns in nb; the last
         * tile row/column may be partial. */
        X = m == SA.mt-1 ? SA.m-SA.mb*m : SA.mb;
        Y = n == SA.nt-1 ? SA.n-SA.nb*n : SA.nb;
        ldam = BLKLDD(SA, m);
        ldbm = BLKLDD(B, m);
        CORE_clag2z(X, Y, SA(m, n), ldam, B(m, n), ldbm);

        m = next_m;
        n = next_n;
    }
}

Here is the call graph for this function:

void plasma_pclag2z_quark ( PLASMA_desc  SA,
PLASMA_desc  B,
PLASMA_sequence * sequence,
PLASMA_request * request
)

Definition at line 159 of file pzlag2c.c.

References B, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_clag2z(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
    /*
     * Dynamic-scheduling conversion of the tiles of SA into B: one
     * QUARK_CORE_clag2z task is inserted per tile, row by row.
     *
     * The body addresses its descriptors as SA (source) and B (destination).
     * NOTE(review): the prototype rendered above this body may name them
     * differently; the names used here are the ones the SA(m,n)/B(m,n)
     * accessors and the SA.* field reads require.
     *
     *   sequence - nothing is submitted when its status is not
     *              PLASMA_SUCCESS; its quark_sequence tags every task.
     *   request  - not used in this body.
     *
     * Changes relative to the extracted listing: the task-insertion call
     * name and the declarations of `plasma` and `task_flags` had been lost;
     * they are restored from the function's References list
     * (QUARK_CORE_clag2z is the only QUARK kernel it names). Confirm against
     * pzlag2c.c.
     */
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    int X, Y;
    int m, n;
    int ldam, ldbm;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    /* Attach every task inserted below to the caller's QUARK sequence. */
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    for (m = 0; m < SA.mt; m++) {
        /* Row extent and leading dimensions; the last tile row may be partial. */
        X = m == SA.mt-1 ? SA.m-m*SA.mb : SA.mb;
        ldam = BLKLDD(SA, m);
        ldbm = BLKLDD(B, m);
        for (n = 0; n < SA.nt; n++) {
            /* Column extent; the last tile column may be partial. */
            Y = n == SA.nt-1 ? SA.n-n*SA.nb : SA.nb;
            QUARK_CORE_clag2z(
                plasma->quark, &task_flags,
                X, Y, SA.mb,
                SA(m, n), ldam,
                B(m, n), ldbm);
        }
    }
}

Here is the call graph for this function:

void plasma_pclange ( plasma_context_t * plasma )

Definition at line 24 of file pclange.c.

References A, BLKLDD, CORE_clange(), CORE_scasum(), CORE_slange(), plasma_desc_t::i, plasma_desc_t::j, plasma_desc_t::m, max, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, norm, plasma_desc_t::nt, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, plasma_unpack_args_6, PlasmaColumnwise, PlasmaFrobeniusNorm, PlasmaInfNorm, PlasmaMaxNorm, PlasmaOneNorm, PlasmaRealDouble, PlasmaRowwise, PlasmaUpperLower, ss_cond_set, ss_cond_wait, ss_finalize, and ss_init.

{
/*
 * Static-scheduling worker body computing a norm of the tile matrix A.
 * Each worker accumulates a partial result in work[PLASMA_RANK]; for every
 * norm except Frobenius the partial results are then combined pairwise with
 * max() and worker 0 stores work[0] in *result.
 *
 * The switch on norm has sections for the max norm (per-tile CORE_clange),
 * the one norm (column sums accumulated in lwork) and the infinity norm (row
 * sums accumulated in lwork). The Frobenius norm is not implemented.
 *
 * NOTE(review): this listing appears to have lost lines during extraction.
 * norm and A have no visible declaration; the case labels are missing (only
 * their comments remain); the starting value of m or n is never assigned in
 * any section, and the one-norm / infinity-norm while loops show no
 * increment of n / m; the two accumulation argument lists have no call name
 * (References names CORE_scasum); and References lists ss_init, ss_cond_set
 * and ss_finalize although only ss_cond_wait is visible. Check pclange.c
 * before relying on this text.
 */
float *work;
float *result;
PLASMA_sequence *sequence;
PLASMA_request *request;
int m, n;
int next_m;
int next_n;
int ldam;
int step, lrank;
int X, X1, X2, Y, Y1, Y2;
float* lwork;
float normtmp, normtmp2;
plasma_unpack_args_6(norm, A, work, result, sequence, request);
*result = 0.0;
/* Worker 0 clears the per-worker partial results.
 * NOTE(review): no synchronisation with the other workers is visible
 * between this memset and their first write to work[] - confirm. */
if (PLASMA_RANK == 0)
memset(work, 0, PLASMA_SIZE*sizeof(float));
switch (norm) {
/*
* PlasmaMaxNorm
*/
n = 0;
/* Convert the starting linear tile index into (m, n) coordinates. */
while (m >= A.mt && n < A.nt) {
n++;
m = m-A.mt;
}
while (n < A.nt) {
/* Coordinates of the tile this worker handles next. */
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m-A.mt;
}
/* X1/Y1: offset of the submatrix inside the first tile row/column;
 * X2/Y2: end of the submatrix inside the last tile row/column. */
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
ldam = BLKLDD(A, m);
CORE_clange(PlasmaMaxNorm, X, Y, A(m, n, X1, Y1, ldam), ldam, NULL, &normtmp);
/* Keep the largest per-tile value seen by this worker. */
if (normtmp > work[PLASMA_RANK])
work[PLASMA_RANK] = normtmp;
m = next_m;
n = next_n;
}
break;
/*
* PlasmaOneNorm
*/
normtmp2 = 0.0;
/* NOTE(review): lwork is a float buffer but is requested with
 * PlasmaRealDouble - confirm the intended element type. */
lwork = (float*)plasma_private_alloc(plasma, A.nb, PlasmaRealDouble);
while (n < A.nt) {
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* lwork collects one value per column of tile column n. */
memset(lwork, 0, A.nb*sizeof(float));
for (m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* NOTE(review): call name and leading argument line missing; presumably
 * CORE_scasum( - confirm. */
X, Y,
A(m, n, X1, Y1, ldam), ldam,
lwork);
}
/* Largest entry of lwork for this tile column. */
CORE_slange(PlasmaMaxNorm, Y, 1, lwork, 1, NULL, &normtmp);
if (normtmp > normtmp2)
normtmp2 = normtmp;
}
work[PLASMA_RANK] = normtmp2;
plasma_private_free(plasma, lwork);
break;
/*
* PlasmaInfNorm
*/
normtmp2 = 0.0;
/* NOTE(review): same float / PlasmaRealDouble question as above. */
lwork = (float*)plasma_private_alloc(plasma, A.mb, PlasmaRealDouble);
while (m < A.mt) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* lwork collects one value per row of tile row m. */
memset(lwork, 0, A.mb*sizeof(float));
for (n = 0; n < A.nt; n++) {
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* NOTE(review): call name and leading argument line missing; presumably
 * CORE_scasum( - confirm. */
X, Y,
A(m, n, X1, Y1, ldam), ldam,
lwork);
}
/* Largest entry of lwork for this tile row. */
CORE_slange(PlasmaMaxNorm, X, 1, lwork, 1, NULL, &normtmp);
if (normtmp > normtmp2)
normtmp2 = normtmp;
}
work[PLASMA_RANK] = normtmp2;
plasma_private_free(plasma, lwork);
break;
/*
* PlasmaFrobeniusNorm - not implemented
*/
default:;
}
if (norm != PlasmaFrobeniusNorm) {
/* Pairwise max-reduction of work[]: while this worker's reduced rank is
 * even and a partner at distance `step` exists, wait for the partner and
 * fold its value in, then double the distance. */
step = 1;
lrank = PLASMA_RANK;
while ( (lrank%2 == 0) && (PLASMA_RANK+step < PLASMA_SIZE) ) {
ss_cond_wait(PLASMA_RANK+step, 0, step);
work[PLASMA_RANK] = max(work[PLASMA_RANK], work[PLASMA_RANK+step]);
lrank = lrank >> 1;
step = step << 1;
}
/* Non-zero ranks walk up to the level at which their reduced rank is odd.
 * NOTE(review): nothing is signalled in the visible odd branch; presumably
 * an ss_cond_set(PLASMA_RANK, 0, step) line was dropped before
 * "lrank = 0;" - confirm. */
if (PLASMA_RANK > 0) {
while( lrank != 0 ) {
if (lrank%2 == 1) {
lrank = 0;
} else {
lrank = lrank >> 1;
step = step << 1;
}
}
}
/* Worker 0 holds the fully reduced value. */
if (PLASMA_RANK == 0)
*result = work[0];
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pclange_quark ( PLASMA_enum  norm,
PLASMA_desc  A,
float *  work,
float *  result,
PLASMA_sequence sequence,
PLASMA_request request 
)

Definition at line 202 of file pclange.c.

References BLKLDD, plasma_desc_t::i, plasma_desc_t::j, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_shared_alloc(), PLASMA_SUCCESS, PlasmaColumnwise, PlasmaFrobeniusNorm, PlasmaInfNorm, PlasmaMaxNorm, PlasmaOneNorm, PlasmaRealDouble, PlasmaRowwise, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_clange_f1(), QUARK_CORE_free(), QUARK_CORE_scasum_f1(), QUARK_CORE_slange(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling driver: walks the tiles of A, submits per-tile work
 * through plasma->quark, and leaves the selected norm in *result.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags`, the `case` labels of the switch, and the names
 * of the calls that own the bare argument lists below are not visible here.
 * The References list for this function names QUARK_CORE_clange_f1,
 * QUARK_CORE_scasum_f1 and QUARK_CORE_slange - confirm against pclange.c.
 */
int X, X1, X2, Y, Y1, Y2;
int ldam;
int m, n;
int szeW;
float* lwork;
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
*result = 0.0;
switch ( norm ) {
/*
* PlasmaMaxNorm
*/
/* One workspace entry per tile: tile (m, n) is given &lwork[A.mt*n+m]. */
szeW = A.mt*A.nt;
/* NOTE(review): allocated as PlasmaRealDouble but used and cleared as float - confirm intent. */
lwork = (float*)plasma_shared_alloc(plasma, szeW, PlasmaRealDouble);
memset(lwork, 0, szeW*sizeof(float));
for(m = 0; m < A.mt; m++) {
/*
 * X1 is the first used row inside tile row m (non-zero only in the first
 * tile row, from the offset A.i); X2 is one past the last used row (short
 * only in the last tile row); X is the number of rows used.
 */
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
for(n = 0; n < A.nt; n++) {
/* Same bookkeeping for the columns of tile column n, using A.j and A.n. */
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* NOTE(review): callee name missing in this listing; arguments only. */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
0, &(lwork[A.mt*n+m]),
lwork, szeW);
}
}
/* NOTE(review): callee name missing; the arguments pass the whole workspace and `result`. */
plasma->quark, &task_flags,
lwork, A.mt, szeW,
0, result);
/* NOTE(review): the size passed here uses sizeof(PLASMA_Complex32_t), unlike the memset above. */
QUARK_CORE_free(plasma->quark, &task_flags, lwork, szeW*sizeof(PLASMA_Complex32_t));
break;
/*
* PlasmaOneNorm
*/
/* Workspace of A.n+1 floats; tile column n is handed the slice starting at lwork[n*A.nb+1]. */
lwork = (float*)plasma_shared_alloc(plasma, (A.n+1), PlasmaRealDouble);
memset(lwork, 0, (A.n+1)*sizeof(float));
for(m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
for(n = 0; n < A.nt; n++) {
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* NOTE(review): callee name missing in this listing; arguments only. */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
&(lwork[n*A.nb+1]), A.nb,
lwork, A.n);
}
}
/* Final step over the A.n+1 workspace entries with PlasmaMaxNorm, written to `result` (callee name missing). */
plasma->quark, &task_flags,
PlasmaMaxNorm, A.n+1, 1,
lwork, 1, A.n+1,
0, result);
QUARK_CORE_free(plasma->quark, &task_flags, lwork, (A.n+1)*sizeof(PLASMA_Complex32_t));
break;
/*
* PlasmaInfNorm
*/
/* Mirror of the one-norm branch: A.m+1 floats, tile row m uses the slice starting at lwork[m*A.mb+1]. */
lwork = (float*)plasma_shared_alloc(plasma, (A.m+1), PlasmaRealDouble);
memset(lwork, 0, (A.m+1)*sizeof(float));
for(m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
for(n = 0; n < A.nt; n++) {
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
/* NOTE(review): callee name missing in this listing; arguments only. */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
&(lwork[m*A.mb+1]), A.mb,
lwork, A.m);
}
}
/* Final step over the A.m+1 workspace entries with PlasmaMaxNorm, written to `result` (callee name missing). */
plasma->quark, &task_flags,
PlasmaMaxNorm, A.m+1, 1,
lwork, 1, A.m+1,
0, result);
QUARK_CORE_free(plasma->quark, &task_flags, lwork, (A.m+1)*sizeof(PLASMA_Complex32_t));
break;
/*
* PlasmaFrobeniusNorm - not implemented
*/
default:;
}
}

Here is the call graph for this function:

void plasma_pclansy ( plasma_context_t plasma)

Definition at line 24 of file pclansy.c.

References A, BLKLDD, CORE_clange(), CORE_clansy(), CORE_scasum(), CORE_slange(), plasma_desc_t::i, plasma_desc_t::j, plasma_desc_t::m, max, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, norm, plasma_desc_t::nt, plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, plasma_unpack_args_7, PlasmaColumnwise, PlasmaFrobeniusNorm, PlasmaInfNorm, PlasmaLower, PlasmaMaxNorm, PlasmaOneNorm, PlasmaRealDouble, PlasmaRowwise, PlasmaUpperLower, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, and uplo.

{
/*
 * Static-scheduling norm of a symmetric tiled matrix A, of which only the
 * `uplo` triangle is read. Each rank stores a partial result in
 * work[PLASMA_RANK]; the partials are then combined with max() and rank 0
 * writes the final value to *result.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * norm/uplo/A, the `case` labels, the initial value of `m`, and any lines
 * that advance `m` in the second branch are not visible here. The
 * References list names PLASMA_RANK, PLASMA_SIZE, ss_init, ss_cond_set and
 * ss_finalize - confirm against pclansy.c.
 */
float *work;
float *result;
PLASMA_sequence *sequence;
PLASMA_request *request;
int m, n;
int next_m;
int next_n;
int ldam, ldan;
int step, lrank;
int X, X1, X2, Y, Y1, Y2;
float* lwork;
float normtmp, normtmp2;
plasma_unpack_args_7(norm, uplo, A, work, result, sequence, request);
*result = 0.0;
/* Only rank 0 clears the per-rank partial results. */
if (PLASMA_RANK == 0)
memset(work, 0, PLASMA_SIZE*sizeof(float));
switch (norm) {
/*
* PlasmaMaxNorm
*/
n = 0;
/*
 * Fold the linear start index m into a tile position (m, n) with m >= n:
 * each time m runs off the bottom, move to the next column and restart at
 * its diagonal row.
 * NOTE(review): m is read here with no assignment visible in this listing.
 */
while (m >= A.mt && n < A.nt) {
n++;
m = m-A.mt+n;
}
while (n < A.nt) {
/* Pre-compute this rank's next tile: advance by PLASMA_SIZE and fold again. */
next_m = m;
next_n = n;
next_m += PLASMA_SIZE;
while (next_m >= A.mt && next_n < A.nt) {
next_n++;
next_m = next_m-A.mt+next_n;
}
if (m == n) {
/* Diagonal tile: X rows starting at X1, passed to the symmetric kernel with `uplo`. */
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
CORE_clansy(PlasmaMaxNorm, uplo, X, A(m, n, X1, X1, ldam), ldam, NULL, &normtmp);
}
else {
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
/* Off-diagonal tile read at (m, n), below the diagonal. */
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
Y1 = n == 0 ? A.j %A.nb : 0;
Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
ldam = BLKLDD(A, m);
CORE_clange(PlasmaMaxNorm, X, Y, A(m, n, X1, Y1, ldam), ldam, NULL, &normtmp);
}
/*
* PlasmaUpper
*/
else {
/* Same traversal, but the tile is read at the transposed position (n, m). */
X1 = n == 0 ? A.i %A.mb : 0;
X2 = n == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
Y1 = m == 0 ? A.j %A.nb : 0;
Y2 = m == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
Y = Y2 - Y1;
ldan = BLKLDD(A, n);
CORE_clange(PlasmaMaxNorm, X, Y, A(n, m, X1, Y1, ldan), ldan, NULL, &normtmp);
}
}
/* Keep the running maximum for this rank. */
if (normtmp > work[PLASMA_RANK])
work[PLASMA_RANK] = normtmp;
m = next_m;
n = next_n;
}
break;
/*
* PlasmaOneNorm / PlasmaInfNorm
*/
normtmp2 = 0.0;
/* NOTE(review): allocated as PlasmaRealDouble but used and cleared as float - confirm intent. */
lwork = (float*)plasma_private_alloc(plasma, A.mb, PlasmaRealDouble);
/* NOTE(review): no update of m is visible inside this loop in this listing. */
while (m < A.mt) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* lwork accumulates the sums for tile row m across all tile columns. */
memset(lwork, 0, A.mb*sizeof(float));
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
/* Tiles left of the diagonal are read directly, row-wise. */
for (n = 0; n < m; n++) {
Y1 = n == 0 ? A.j%A.nb : 0;
Y = A.nb - Y1;
CORE_scasum(PlasmaRowwise, PlasmaUpperLower, X, Y, A(m, n, X1, Y1, ldam), ldam, lwork);
}
CORE_scasum(PlasmaRowwise, uplo, X, X, A(m, m, X1, X1, ldam), ldam, lwork);
/* Tiles right of the diagonal are taken from the stored tile (n, m), column-wise. */
for (n = m+1; n < A.mt; n++) {
Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
ldan = BLKLDD(A, n);
CORE_scasum(PlasmaColumnwise, PlasmaUpperLower, Y, X, A(n, m, 0, X1, ldan), ldan, lwork);
}
}
/*
* PlasmaUpper
*/
else {
/* Tiles left of the diagonal are taken from the stored tile (n, m), column-wise. */
for (n = 0; n < m; n++) {
Y1 = n == 0 ? A.j%A.nb : 0;
Y = A.nb - Y1;
CORE_scasum(PlasmaColumnwise, PlasmaUpperLower, Y, X, A(n, m, Y1, X1, A.nb), A.nb, lwork);
}
CORE_scasum(PlasmaRowwise, uplo, X, X, A(m, m, X1, X1, ldam), ldam, lwork);
/* Tiles right of the diagonal are read directly, row-wise. */
for ( n =m+1; n < A.mt; n++) {
Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
CORE_scasum(PlasmaRowwise, PlasmaUpperLower, X, Y, A(m, n, X1, 0, ldam), ldam, lwork);
}
}
/* Largest accumulated sum of this tile row, folded into the rank-local maximum. */
CORE_slange(PlasmaMaxNorm, X, 1, lwork, 1, NULL, &normtmp);
if (normtmp > normtmp2)
normtmp2 = normtmp;
}
work[PLASMA_RANK] = normtmp2;
plasma_private_free(plasma, lwork);
break;
/*
* PlasmaFrobeniusNorm
*/
default:;
}
if (norm != PlasmaFrobeniusNorm) {
/*
 * Pairwise combination of the per-rank partials: while this rank's
 * shifted id is even and a partner at distance `step` exists, wait on the
 * partner and take the max, doubling `step` each round.
 */
step = 1;
lrank = PLASMA_RANK;
while ( (lrank%2 == 0) && (PLASMA_RANK+step < PLASMA_SIZE) ) {
ss_cond_wait(PLASMA_RANK+step, 0, step);
work[PLASMA_RANK] = max(work[PLASMA_RANK], work[PLASMA_RANK+step]);
lrank = lrank >> 1;
step = step << 1;
}
/*
 * NOTE(review): as shown, this loop only updates lrank/step and has no
 * other visible effect; a signalling call (ss_cond_set is in the
 * References list) may be missing from the odd-lrank branch.
 */
if (PLASMA_RANK > 0) {
while( lrank != 0 ) {
if (lrank%2 == 1) {
lrank = 0;
} else {
lrank = lrank >> 1;
step = step << 1;
}
}
}
/* Rank 0 holds the combined value and publishes it. */
if (PLASMA_RANK == 0)
*result = work[0];
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pclansy_quark ( PLASMA_enum  norm,
PLASMA_enum  uplo,
PLASMA_desc  A,
float *  work,
float *  result,
PLASMA_sequence sequence,
PLASMA_request request 
)

Definition at line 219 of file pclansy.c.

References BLKLDD, plasma_desc_t::i, plasma_desc_t::j, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_shared_alloc(), PLASMA_SUCCESS, PlasmaColumnwise, PlasmaFrobeniusNorm, PlasmaInfNorm, PlasmaLower, PlasmaMaxNorm, PlasmaOneNorm, PlasmaRealDouble, PlasmaRowwise, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_clange_f1(), QUARK_CORE_clansy_f1(), QUARK_CORE_free(), QUARK_CORE_scasum_f1(), QUARK_CORE_slange(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling norm of a symmetric tiled matrix A: visits the diagonal
 * tiles and the tiles of the `uplo` triangle, submits per-tile work through
 * plasma->quark, and leaves the result in *result.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags`, the `case` labels, and the names of the calls
 * that own the bare argument lists below are not visible here. The
 * References list names QUARK_CORE_clansy_f1, QUARK_CORE_clange_f1,
 * QUARK_CORE_scasum_f1 and QUARK_CORE_slange - confirm against pclansy.c.
 */
int X, X1, X2, Y, Y1;
int ldam;
int m, n;
int szeW, pos;
float* lwork;
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
*result = 0.0;
switch ( norm ) {
/*
* PlasmaMaxNorm
*/
/* One workspace entry per tile of the triangle including the diagonal; `pos` is the next free entry. */
szeW = A.mt*(A.mt+1)/2;
pos = 0;
/* NOTE(review): allocated as PlasmaRealDouble but used and cleared as float - confirm intent. */
lwork = (float*)plasma_shared_alloc(plasma, szeW, PlasmaRealDouble);
memset(lwork, 0, szeW*sizeof(float));
for(m = 0; m < A.mt; m++) {
/* X1: first used row in tile row m; X2: one past the last used row; X: rows used. */
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* Diagonal tile (m, m). NOTE(review): callee name missing in this listing. */
plasma->quark, &task_flags,
A(m, m, X1, X1, ldam), ldam, ldam*X,
0, &(lwork[pos]),
lwork, szeW);
pos++;
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
/* Tiles to the left of the diagonal in tile row m. */
for(n=0; n<m; n++) {
Y1 = n == 0 ? A.j%A.nb : 0;
Y = A.nb - Y1;
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
0, &(lwork[pos]),
lwork, szeW);
pos++;
}
}
/*
* PlasmaUpper
*/
else {
/* Tiles to the right of the diagonal in tile row m. */
for(n = m+1; n < A.mt; n++) {
Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
plasma->quark, &task_flags,
A(m, n, X1, 0, ldam), ldam, ldam*Y,
0, &(lwork[pos]),
lwork, szeW);
pos++;
}
}
}
/* Final step over the szeW workspace entries with PlasmaMaxNorm, written to `result` (callee name missing). */
plasma->quark, &task_flags,
PlasmaMaxNorm, szeW, 1,
lwork, 1, szeW,
0, result);
/* NOTE(review): the size passed here uses sizeof(PLASMA_Complex32_t), unlike the memset above. */
QUARK_CORE_free(plasma->quark, &task_flags, lwork, szeW*sizeof(PLASMA_Complex32_t));
break;
/*
* PlasmaOneNorm / PlasmaInfNorm
*/
/* Workspace of A.m+1 floats; tile row/column k is handed the slice starting at lwork[k*A.mb+1]. */
lwork = (float *)plasma_shared_alloc(plasma, A.m+1, PlasmaRealDouble);
memset(lwork, 0, (A.m+1)*sizeof(float));
for(m = 0; m < A.mt; m++) {
X1 = m == 0 ? A.i %A.mb : 0;
X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
X = X2 - X1;
ldam = BLKLDD(A, m);
/* Diagonal tile contributes to slice m. NOTE(review): callee name missing. */
plasma->quark, &task_flags,
A(m, m, X1, X1, ldam), ldam, ldam*X,
&(lwork[m*A.mb+1]), A.mb,
lwork, A.m);
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
for(n = 0; n < m; n++) {
Y1 = n == 0 ? A.j%A.nb : 0;
Y = A.nb - Y1;
/* Each off-diagonal tile is submitted twice: once against slice m and once against slice n. */
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
&(lwork[m*A.mb+1]), A.mb,
lwork, A.m);
plasma->quark, &task_flags,
A(m, n, X1, Y1, ldam), ldam, ldam*Y,
&(lwork[n*A.mb+1]), A.mb,
lwork, A.m);
}
}
/*
* PlasmaUpper
*/
else {
for(n = m+1; n < A.mt; n++) {
Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
/* Each off-diagonal tile is submitted twice: once against slice m and once against slice n. */
plasma->quark, &task_flags,
A(m, n, X1, 0, ldam), ldam, ldam*Y,
&(lwork[m*A.mb+1]), A.mb,
lwork, A.m);
plasma->quark, &task_flags,
A(m, n, X1, 0, ldam), ldam, ldam*Y,
&(lwork[n*A.mb+1]), A.mb,
lwork, A.m);
}
}
}
/* Final step over the A.m+1 workspace entries with PlasmaMaxNorm, written to `result` (callee name missing). */
plasma->quark, &task_flags,
PlasmaMaxNorm, A.m+1, 1,
lwork, 1, A.m+1,
0, result);
QUARK_CORE_free(plasma->quark, &task_flags, lwork, (A.m+1)*sizeof(PLASMA_Complex32_t));
break;
/*
* PlasmaFrobeniusNorm - not implemented
*/
default:;
}
}

Here is the call graph for this function:

void plasma_pclaset2_quark ( PLASMA_enum  uplo,
PLASMA_Complex32_t  alpha,
PLASMA_desc  A,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel initialization of a 2-D array A to ALPHA on the off-diagonals.

Definition at line 22 of file pclaset2.c.

References BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaLower, PlasmaUpper, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_claset2(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling driver that submits one task per selected tile of A,
 * passing `alpha`; which tiles are visited depends on `uplo`.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags` and the name of the submitted call are not
 * visible here (the References list names QUARK_CORE_claset2) - confirm
 * against pclaset2.c.
 */
int i, j;
int ldai, ldaj;
int tempim;
int tempjm, tempjn;
/* Number of diagonal tiles. */
int minmn = min(A.mt, A.nt);
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if (uplo == PlasmaLower) {
for (j = 0; j < minmn; j++){
/* The last tile row/column may be partial: use the remaining extent there. */
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
/* Diagonal tile, submitted with PlasmaLower. */
plasma->quark, &task_flags,
PlasmaLower, tempjm, tempjn, alpha,
A(j, j), ldaj);
/* Tiles below the diagonal in tile column j, submitted as full tiles. */
for (i = j+1; i < A.mt; i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha,
A(i, j), ldai);
}
}
}
else if (uplo == PlasmaUpper) {
/* Tiles above the diagonal, column by column, submitted as full tiles. */
for (j = 1; j < A.nt; j++){
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
for (i = 0; i < min(j, A.mt); i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha,
A(i, j), ldai);
}
}
/* Diagonal tiles, submitted with PlasmaUpper. */
for (j = 0; j < minmn; j++){
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
plasma->quark, &task_flags,
PlasmaUpper, tempjm, tempjn, alpha,
A(j, j), ldaj);
}
}
else {
/* Any other uplo: every tile is submitted as a full tile. */
for (i = 0; i < A.mt; i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
for (j = 0; j < A.nt; j++){
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha,
A(i, j), ldai);
}
}
}
}

Here is the call graph for this function:

void plasma_pclaset_quark ( PLASMA_enum  uplo,
PLASMA_Complex32_t  alpha,
PLASMA_Complex32_t  beta,
PLASMA_desc  A,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel initialization of a 2-D array A to BETA on the diagonal and ALPHA on the off-diagonals.

Definition at line 22 of file pclaset.c.

References BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, min, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaLower, PlasmaUpper, PlasmaUpperLower, plasma_context_struct::quark, QUARK_CORE_claset(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling driver that submits one task per selected tile of A.
 * Off-diagonal tiles are submitted with (alpha, alpha), diagonal tiles with
 * (alpha, beta); which tiles are visited depends on `uplo`.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags` and the name of the submitted call are not
 * visible here (the References list names QUARK_CORE_claset) - confirm
 * against pclaset.c.
 */
int i, j;
int ldai, ldaj;
int tempim;
int tempjm, tempjn;
/* Number of diagonal tiles. */
int minmn = min(A.mt, A.nt);
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if (uplo == PlasmaLower) {
for (j = 0; j < minmn; j++){
/* The last tile row/column may be partial: use the remaining extent there. */
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
/* Diagonal tile, submitted with PlasmaLower and (alpha, beta). */
plasma->quark, &task_flags,
PlasmaLower, tempjm, tempjn, alpha, beta,
A(j, j), ldaj);
/* Tiles below the diagonal in tile column j: full tile, (alpha, alpha). */
for (i = j+1; i < A.mt; i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha, alpha,
A(i, j), ldai);
}
}
}
else if (uplo == PlasmaUpper) {
/* Tiles above the diagonal, column by column: full tile, (alpha, alpha). */
for (j = 1; j < A.nt; j++){
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
for (i = 0; i < min(j, A.mt); i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha, alpha,
A(i, j), ldai);
}
}
/* Diagonal tiles, submitted with PlasmaUpper and (alpha, beta). */
for (j = 0; j < minmn; j++){
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
plasma->quark, &task_flags,
PlasmaUpper, tempjm, tempjn, alpha, beta,
A(j, j), ldaj);
}
}
else {
/* Any other uplo: first pass submits every tile with (alpha, alpha)... */
for (i = 0; i < A.mt; i++){
tempim = i == A.mt-1 ? A.m-i*A.mb : A.mb;
ldai = BLKLDD(A, i);
for (j = 0; j < A.nt; j++){
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
plasma->quark, &task_flags,
PlasmaUpperLower, tempim, tempjn, alpha, alpha,
A(i, j), ldai);
}
}
/* ...second pass resubmits the diagonal tiles with (alpha, beta). */
for (j = 0; j < minmn; j++){
tempjm = j == A.mt-1 ? A.m-j*A.mb : A.mb;
tempjn = j == A.nt-1 ? A.n-j*A.nb : A.nb;
ldaj = BLKLDD(A, j);
plasma->quark, &task_flags,
PlasmaUpperLower, tempjm, tempjn, alpha, beta,
A(j, j), ldaj);
}
}
}

Here is the call graph for this function:

void plasma_pclaswp_quark ( PLASMA_desc  B,
int *  IPIV,
int  inc,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile row interchanges - dynamic scheduling

Definition at line 23 of file pclaswp.c.

References B, IPIV, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_desc_submatrix(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_claswp_ontile(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling driver that submits one task per tile of B, passing the
 * pivot block IPIV(m) of the current tile row and the increment `inc`.
 * Tile rows are visited top-down when inc > 0 and bottom-up otherwise.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags` and the name of the submitted call are not
 * visible here (the References list names QUARK_CORE_claswp_ontile) -
 * confirm against pclaswp.c.
 */
int m, n;
int tempi, tempm, tempmm, tempnn;
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if ( inc > 0 )
{
for (m = 0; m < B.mt; m++) {
/* tempi: first row of tile row m; tempm: rows from there to the end of B; tempmm: rows in this tile. */
tempi = m * B.mb;
tempm = B.m - tempi;
tempmm = m == B.mt-1 ? tempm : B.mb;
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n - n * B.nb : B.nb;
/* The submatrix covers rows tempi..end of tile column n; the last argument is the bottom tile of that column. */
plasma->quark, &task_flags,
plasma_desc_submatrix(B, tempi, n*B.nb, tempm, tempnn),
B(m, n), 1, tempmm, IPIV(m), inc, B(B.mt-1, n) );
}
}
}
else
{
for (m = B.mt-1; m > -1; m--) {
tempi = m * B.mb;
tempm = B.m - tempi;
tempmm = m == B.mt-1 ? tempm : B.mb;
for (n = 0; n < B.nt; n++) {
tempnn = n == B.nt-1 ? B.n - n * B.nb : B.nb;
/* Same submission as above, except the last argument is the top tile of the column. */
plasma->quark, &task_flags,
plasma_desc_submatrix(B, tempi, n*B.nb, tempm, tempnn),
B(m, n), 1, tempmm, IPIV(m), inc, B(0, n) );
}
}
}
}

Here is the call graph for this function:

void plasma_pclaswpc_quark ( PLASMA_desc  B,
int *  IPIV,
int  inc,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile column interchanges - dynamic scheduling

Definition at line 23 of file pclaswpc.c.

References B, IPIV, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), plasma_desc_submatrix(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_claswpc_ontile(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Column-oriented counterpart of the row-interchange driver: submits one
 * task per tile of B, passing the pivot block IPIV(n) of the current tile
 * column and the increment `inc`. Tile columns are visited left-to-right
 * when inc > 0 and right-to-left otherwise.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags` and the name of the submitted call are not
 * visible here (the References list names QUARK_CORE_claswpc_ontile) -
 * confirm against pclaswpc.c.
 */
int m, n;
int tempj, tempn, tempmm, tempnn;
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
if ( inc > 0 )
{
for (n = 0; n < B.nt; n++) {
/* tempj: first column of tile column n; tempn: columns from there to the end of B; tempnn: columns in this tile. */
tempj = n * B.nb;
tempn = B.n - tempj;
tempnn = n == B.nt-1 ? tempn : B.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m - m * B.mb : B.mb;
/* The submatrix covers columns tempj..end of tile row m; the last argument is the rightmost tile of that row. */
plasma->quark, &task_flags,
plasma_desc_submatrix(B, m*B.mb, tempj, tempmm, tempn),
B(m, n), 1, tempnn, IPIV(n), inc, B(m, B.nt-1) );
}
}
}
else
{
for (n = B.nt-1; n > -1; n--) {
tempj = n * B.nb;
tempn = B.n - tempj;
tempnn = n == B.nt-1 ? tempn : B.nb;
for (m = 0; m < B.mt; m++) {
tempmm = m == B.mt-1 ? B.m - m * B.mb : B.mb;
/* Same submission as above, except the last argument is the leftmost tile of the row. */
plasma->quark, &task_flags,
plasma_desc_submatrix(B, m*B.mb, tempj, tempmm, tempn),
B(m, n), 1, tempnn, IPIV(n), inc, B(m, 0) );
}
}
}
}

Here is the call graph for this function:

void plasma_pclauum_quark ( PLASMA_enum  uplo,
PLASMA_desc  A,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel UU' or L'L operation - dynamic scheduling

Definition at line 23 of file pclauum.c.

References A, BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, PlasmaConjTrans, PlasmaLeft, PlasmaLower, PlasmaNonUnit, PlasmaNoTrans, PlasmaRight, plasma_context_struct::quark, QUARK_CORE_cgemm(), QUARK_CORE_cherk(), QUARK_CORE_clauum(), QUARK_CORE_ctrmm(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling driver for the tiled U*U' / L'*L product on the `uplo`
 * triangle of A. For every tile row m it submits, in order: updates of the
 * earlier diagonal tiles A(n, n), updates of the tiles between them, an
 * update of the off-diagonal tiles against A(m, m), and finally a task on
 * the diagonal tile A(m, m) itself.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma`, `task_flags` and `zone`, the names of the submitted calls, and
 * their leading enum arguments are not visible here. The References list
 * names QUARK_CORE_cherk, QUARK_CORE_cgemm, QUARK_CORE_ctrmm and
 * QUARK_CORE_clauum - confirm the mapping against pclauum.c.
 */
int k, m, n;
int ldam;
int tempkm, tempmm, tempnn;
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
for (m = 0; m < A.mt; m++) {
/* The last tile row may be partial. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
for(n = 0; n < m; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Update of diagonal tile A(n, n) from A(m, n), with both scalars 1.0. */
plasma->quark, &task_flags,
tempnn, tempmm, A.mb,
1.0, A(m, n), ldam,
1.0, A(n, n), A.mb);
/* Update of A(k, n) from A(m, k) and A(m, n) for the tile rows between n and m. */
for(k = n+1; k < m; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
plasma->quark, &task_flags,
tempkm, tempnn, tempmm, A.mb,
zone, A(m, k), ldam,
A(m, n), ldam,
zone, A(k, n), A.mb);
}
}
/* Update of each A(m, n) against the diagonal tile A(m, m). */
for (n = 0; n < m; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
tempmm, tempnn, A.mb,
zone, A(m, m), ldam,
A(m, n), ldam);
}
/* Task on the diagonal tile A(m, m) itself. */
plasma->quark, &task_flags,
tempmm,
A.mb, A(m, m), ldam);
}
}
/*
* PlasmaUpper
*/
else {
/* Same schedule as above with the tile indices transposed: A(n, m) is used in place of A(m, n). */
for (m = 0; m < A.mt; m++) {
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
for (n = 0; n < m; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
tempnn, tempmm, A.mb,
1.0, A(n, m), A.mb,
1.0, A(n, n), A.mb);
for (k = n+1; k < m; k++){
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
plasma->quark, &task_flags,
tempnn, tempkm, tempmm, A.mb,
zone, A(n, m), A.mb,
A(k, m), A.mb,
zone, A(n, k), A.mb);
}
}
for (n = 0; n < m; n++) {
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
plasma->quark, &task_flags,
tempnn, tempmm, A.mb,
zone, A(m, m), ldam,
A(n, m), A.mb);
}
plasma->quark, &task_flags,
tempmm,
A.mb, A(m, m), ldam);
}
}
}

Here is the call graph for this function:

void plasma_pcpack ( plasma_context_t plasma)

plasma_pcpack packs all extra elements at the end of the matrix

 +---------------+
 |               |
 |               |
 |     A11       |
 |               |
 |               |
 +---------------+
 |     A21       |
 +---------------+

This matrix is initially stored as follows (the example is column-major; the row-major case is the same if we consider the transposed matrix): A11(:,0), A21(:,0), A11(:,1), A21(:,1), ...

On exit, it is stored as follows: A11(:,:), A21(:,:)

Parameters:
[in] plasma: Plasma context
[in] m: Number of rows in matrix A
[in] n: Number of columns in matrix A
[in,out] A: Matrix A to pack. (see above for entry and exit format)
[in] m0: Number of rows of A21

Definition at line 65 of file pcpack.c.

References A, CORE_clacpy(), min, plasma_barrier(), plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_6, PlasmaComplexFloat, PlasmaUpperLower, plasma_sequence_t::status, and W.

{
/*
 * Static-scheduling packing of an m-by-n array A whose last m0 rows (A21)
 * are interleaved with the first m1 = m - m0 rows (A11) in every column.
 * Each rank saves its share of A21 in W, the ranks then compact the A11
 * columns to a leading dimension of m1, and finally each rank writes its
 * saved A21 columns after the packed A11.
 *
 * NOTE(review): this listing looks extraction-damaged. The declaration of A
 * and the declarations/allocations of the buffers W and Wl are not visible
 * here (the References list names plasma_private_alloc and
 * PlasmaComplexFloat) - confirm against pcpack.c.
 */
PLASMA_sequence *sequence;
PLASMA_request *request;
int m, n, m0;
int i, m1, size, rank, start, end, bs, mod;
plasma_unpack_args_6(m, n, A, m0, sequence, request);
if (sequence->status != PLASMA_SUCCESS)
return;
/* Quick return */
if ( n <= 1 )
return;
m1 = m - m0;
size = PLASMA_SIZE;
rank = PLASMA_RANK;
/*
 * Split n-1 columns across the ranks: every rank gets (n-1)/size columns and
 * the first `mod` ranks get one extra; `start` is this rank's first column.
 */
mod = (n-1) % size;
bs = (n-1) / size;
start = rank * bs;
if ( rank < mod ) {
bs++;
}
start += min( mod, rank );
/* Save leftover pieces that are otherwise going to be overwritten */
CORE_clacpy( PlasmaUpperLower, m0, bs, &(A[(int64_t)start*m+m1]), m, W, m0 );
/* Pack A */
/*
 * Columns 1..end-1 form complete rounds in which every rank owns one column,
 * so all ranks reach the barrier the same number of times. Each column is
 * staged in Wl before the barrier and written to its packed position after
 * it - presumably so that no rank overwrites a column another rank has yet
 * to read.
 * NOTE(review): i*m and i*m1 are plain int products, unlike the int64_t
 * casts used for the W offsets; check for overflow on large matrices.
 */
end = ((n-1) / size) * size + 1;
for(i=rank+1; i<end; i+=size) {
memcpy( Wl, &(A[i*m]), m1*sizeof(PLASMA_Complex32_t));
plasma_barrier(plasma);
memcpy( &(A[i*m1]), Wl, m1*sizeof(PLASMA_Complex32_t));
}
/* Incomplete last round: ranks that own a remaining column move it; the others only join the barrier. */
if ( rank < (n - end)) {
i = end + rank;
memcpy( Wl, &(A[i*m]), m1*sizeof(PLASMA_Complex32_t));
plasma_barrier(plasma);
memcpy( &(A[i*m1]), Wl, m1*sizeof(PLASMA_Complex32_t));
}
else
plasma_barrier(plasma);
/* Restore leftover pieces */
/* The saved A21 columns go after the m1*n packed A11 entries, with leading dimension m0. */
CORE_clacpy( PlasmaUpperLower, m0, bs, W, m0, &(A[(int64_t)m1*n+start*m0]), m0 );
plasma_private_free(plasma, W);
plasma_private_free(plasma, Wl);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcplghe ( plasma_context_t plasma)

Parallel tile generation of a random Hermitian matrix - static scheduling

Definition at line 21 of file pcplghe.c.

References A, BLKLDD, CORE_cplghe(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_5, and plasma_sequence_t::status.

{
/*
 * Static-scheduling driver that visits tiles of A in column-major tile
 * order, this rank taking every PLASMA_SIZE-th tile, and hands each tile
 * together with `bump`, its global offsets and `seed` to a per-tile kernel.
 *
 * NOTE(review): this listing looks extraction-damaged. The declaration of A,
 * the initial value of `m`, and the name of the kernel that owns the bare
 * argument list below are not visible here (the References list names
 * PLASMA_RANK and CORE_cplghe) - confirm against pcplghe.c.
 */
float bump;
unsigned long long int seed;
PLASMA_sequence *sequence;
PLASMA_request *request;
int m, n;
int next_m;
int next_n;
int ldam;
int tempmm, tempnn;
plasma_unpack_args_5(bump, A, seed, sequence, request);
if (sequence->status != PLASMA_SUCCESS)
return;
n = 0;
/*
 * Fold the linear start index m into a (tile row, tile column) pair.
 * NOTE(review): m is read here with no assignment visible in this listing.
 */
while (m >= A.mt) {
n++;
m = m - A.mt;
}
while ( n < A.nt ) {
/* Pre-compute this rank's next tile: advance by PLASMA_SIZE and fold again. */
next_n = n;
next_m = m;
next_m += PLASMA_SIZE;
while ( next_m >= A.mt && next_n < A.nt ) {
next_n++;
next_m = next_m - A.mt;
}
/* The last tile row/column may be partial. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
/* Kernel arguments: tile extent and pointer, then A.m and the tile's global row/column offsets. */
bump, tempmm, tempnn, A(m, n), ldam,
A.m, m*A.mb, n*A.nb, seed );
m = next_m;
n = next_n;
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcplghe_quark ( float  bump,
PLASMA_desc  A,
unsigned long long int  seed,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile generation of a random Hermitian matrix - dynamic scheduling

Definition at line 72 of file pcplghe.c.

References BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_cplghe(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling driver that submits one task per tile of A through
 * plasma->quark, passing `bump`, the tile's global offsets and `seed`.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags` and the name of the submitted call are not
 * visible here (the References list names QUARK_CORE_cplghe) - confirm
 * against pcplghe.c.
 */
int m, n;
int ldam;
int tempmm, tempnn;
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (m = 0; m < A.mt; m++) {
/* The last tile row may be partial. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
for (n = 0; n < A.nt; n++) {
/* The last tile column may be partial. */
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Task arguments: tile extent and pointer, then A.m and the tile's global row/column offsets. */
plasma->quark, &task_flags,
bump, tempmm, tempnn, A(m, n), ldam,
A.m, m*A.mb, n*A.nb, seed );
}
}
}

Here is the call graph for this function:

void plasma_pcplgsy ( plasma_context_t plasma)

Parallel tile generation of a random symmetric matrix - static scheduling

Definition at line 21 of file pcplgsy.c.

References A, BLKLDD, CORE_cplgsy(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_5, and plasma_sequence_t::status.

{
/*
 * Static-scheduling driver that visits tiles of A in column-major tile
 * order, this rank taking every PLASMA_SIZE-th tile, and hands each tile
 * together with `bump`, its global offsets and `seed` to a per-tile kernel.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `bump` and A, the initial value of `m`, and the name of the kernel that
 * owns the bare argument list below are not visible here (the References
 * list names PLASMA_RANK and CORE_cplgsy) - confirm against pcplgsy.c.
 */
unsigned long long int seed;
PLASMA_sequence *sequence;
PLASMA_request *request;
int m, n;
int next_m;
int next_n;
int ldam;
int tempmm, tempnn;
plasma_unpack_args_5(bump, A, seed, sequence, request);
if (sequence->status != PLASMA_SUCCESS)
return;
n = 0;
/*
 * Fold the linear start index m into a (tile row, tile column) pair.
 * NOTE(review): m is read here with no assignment visible in this listing.
 */
while (m >= A.mt) {
n++;
m = m - A.mt;
}
while ( n < A.nt ) {
/* Pre-compute this rank's next tile: advance by PLASMA_SIZE and fold again. */
next_n = n;
next_m = m;
next_m += PLASMA_SIZE;
while ( next_m >= A.mt && next_n < A.nt ) {
next_n++;
next_m = next_m - A.mt;
}
/* The last tile row/column may be partial. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
/* Kernel arguments: tile extent and pointer, then A.m and the tile's global row/column offsets. */
bump, tempmm, tempnn, A(m, n), ldam,
A.m, m*A.mb, n*A.nb, seed );
m = next_m;
n = next_n;
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcplgsy_quark ( PLASMA_Complex32_t  bump,
PLASMA_desc  A,
unsigned long long int  seed,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile generation of a random symmetric matrix - dynamic scheduling

Definition at line 72 of file pcplgsy.c.

References BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_cplgsy(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling driver that submits one task per tile of A through
 * plasma->quark, passing `bump`, the tile's global offsets and `seed`.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags` and the name of the submitted call are not
 * visible here (the References list names QUARK_CORE_cplgsy) - confirm
 * against pcplgsy.c.
 */
int m, n;
int ldam;
int tempmm, tempnn;
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (m = 0; m < A.mt; m++) {
/* The last tile row may be partial. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
for (n = 0; n < A.nt; n++) {
/* The last tile column may be partial. */
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Task arguments: tile extent and pointer, then A.m and the tile's global row/column offsets. */
plasma->quark, &task_flags,
bump, tempmm, tempnn, A(m, n), ldam,
A.m, m*A.mb, n*A.nb, seed );
}
}
}

Here is the call graph for this function:

void plasma_pcplrnt ( plasma_context_t plasma)

Parallel tile generation of a random matrix - static scheduling

Definition at line 21 of file pcplrnt.c.

References A, BLKLDD, CORE_cplrnt(), plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_4, and plasma_sequence_t::status.

{
/*
 * Static-scheduling driver that visits tiles of A in column-major tile
 * order, this rank taking every PLASMA_SIZE-th tile, and hands each tile
 * together with its global offsets and `seed` to a per-tile kernel.
 *
 * NOTE(review): this listing looks extraction-damaged. The declaration of A,
 * the initial value of `m`, and the name of the kernel that owns the bare
 * argument list below are not visible here (the References list names
 * PLASMA_RANK and CORE_cplrnt) - confirm against pcplrnt.c.
 */
unsigned long long int seed;
PLASMA_sequence *sequence;
PLASMA_request *request;
int m, n;
int next_m;
int next_n;
int ldam;
int tempmm, tempnn;
plasma_unpack_args_4(A, seed, sequence, request);
if (sequence->status != PLASMA_SUCCESS)
return;
n = 0;
/*
 * Fold the linear start index m into a (tile row, tile column) pair.
 * NOTE(review): m is read here with no assignment visible in this listing.
 */
while (m >= A.mt) {
n++;
m = m - A.mt;
}
while ( n < A.nt ) {
/* Pre-compute this rank's next tile: advance by PLASMA_SIZE and fold again. */
next_n = n;
next_m = m;
next_m += PLASMA_SIZE;
while ( next_m >= A.mt && next_n < A.nt ) {
next_n++;
next_m = next_m - A.mt;
}
/* The last tile row/column may be partial. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
ldam = BLKLDD(A, m);
/* Kernel arguments: tile extent and pointer, then A.m and the tile's global row/column offsets. */
tempmm, tempnn, A(m, n), ldam,
A.m, m*A.mb, n*A.nb, seed );
m = next_m;
n = next_n;
}
}

Here is the call graph for this function:

Here is the caller graph for this function:

void plasma_pcplrnt_quark ( PLASMA_desc  A,
unsigned long long int  seed,
PLASMA_sequence sequence,
PLASMA_request request 
)

Parallel tile generation of a random matrix - dynamic scheduling

Definition at line 71 of file pcplrnt.c.

References BLKLDD, plasma_desc_t::m, plasma_desc_t::mb, plasma_desc_t::mt, plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_cplrnt(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, and TASK_SEQUENCE.

{
/*
 * Dynamic-scheduling driver that submits one task per tile of A through
 * plasma->quark, passing the tile's global offsets and `seed`.
 *
 * NOTE(review): this listing looks extraction-damaged. The declarations of
 * `plasma` and `task_flags` and the name of the submitted call are not
 * visible here (the References list names QUARK_CORE_cplrnt) - confirm
 * against pcplrnt.c.
 */
int m, n;
int ldam;
int tempmm, tempnn;
plasma = plasma_context_self();
/* Submit nothing when the sequence is already in an error state. */
if (sequence->status != PLASMA_SUCCESS)
return;
QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
for (m = 0; m < A.mt; m++) {
/* The last tile row may be partial. */
tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
ldam = BLKLDD(A, m);
for (n = 0; n < A.nt; n++) {
/* The last tile column may be partial. */
tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
/* Task arguments: tile extent and pointer, then A.m and the tile's global row/column offsets. */
plasma->quark, &task_flags,
tempmm, tempnn, A(m, n), ldam,
A.m, m*A.mb, n*A.nb, seed );
}
}
}

Here is the call graph for this function:

void plasma_pcpotrf ( plasma_context_t plasma)

Parallel tile Cholesky factorization - static scheduling

Definition at line 23 of file pcpotrf.c.

References A, BLKLDD, CORE_cgemm(), CORE_cherk(), CORE_cpotrf(), CORE_ctrsm(), plasma_desc_t::n, plasma_desc_t::nb, plasma_desc_t::nt, PLASMA_RANK, plasma_request_fail(), PLASMA_SIZE, PLASMA_SUCCESS, plasma_unpack_args_4, PlasmaConjTrans, PlasmaLeft, PlasmaLower, PlasmaNonUnit, PlasmaNoTrans, PlasmaRight, PlasmaUpper, ss_abort, ss_aborted, ss_cond_set, ss_cond_wait, ss_finalize, ss_init, plasma_sequence_t::status, and uplo.

{
PLASMA_sequence *sequence;
PLASMA_request *request;
int k, m, n;
int next_k;
int next_m;
int next_n;
int ldak, ldam, ldan;
int info;
int tempkn, tempmn;
plasma_unpack_args_4(uplo, A, sequence, request);
if (sequence->status != PLASMA_SUCCESS)
return;
ss_init(A.nt, A.nt, 0);
k = 0;
while (m >= A.nt) {
k++;
m = m-A.nt+k;
}
n = 0;
while (k < A.nt && m < A.nt && !ss_aborted()) {
next_n = n;
next_m = m;
next_k = k;
next_n++;
if (next_n > next_k) {
next_m += PLASMA_SIZE;
while (next_m >= A.nt && next_k < A.nt) {
next_k++;
next_m = next_m-A.nt+next_k;
}
next_n = 0;
}
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
tempmn = m == A.nt-1 ? A.n-m*A.nb : A.nb;
ldak = BLKLDD(A, k);
ldan = BLKLDD(A, n);
ldam = BLKLDD(A, m);
if (m == k) {
if (n == k) {
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
tempkn,
A(k, k), ldak,
&info);
}
/*
* PlasmaUpper
*/
else {
tempkn,
A(k, k), ldak,
&info);
}
if (info != 0) {
plasma_request_fail(sequence, request, info + A.nb*k);
}
ss_cond_set(k, k, 1);
}
else {
ss_cond_wait(k, n, 1);
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
tempkn, A.nb,
-1.0, A(k, n), ldak,
1.0, A(k, k), ldak);
}
/*
* PlasmaUpper
*/
else {
tempkn, A.nb,
-1.0, A(n, k), ldan,
1.0, A(k, k), ldak);
}
}
}
else {
if (n == k) {
ss_cond_wait(k, k, 1);
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {
tempmn, A.nb,
zone, A(k, k), ldak,
A(m, k), ldam);
}
/*
* PlasmaUpper
*/
else {
A.nb, tempmn,
zone, A(k, k), ldak,
A(k, m), ldak);
}
ss_cond_set(m, k, 1);
}
else {
ss_cond_wait(k, n, 1);
ss_cond_wait(m, n, 1);
/*
* PlasmaLower
*/
if (uplo == PlasmaLower) {