PLASMA 2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
pdshift.c File Reference
#include <stdlib.h>
#include <sys/types.h>
#include <assert.h>
#include "common.h"
#include "primes.h"
#include "gkkleader.h"

Functions

int plasma_dshift (plasma_context_t *plasma, int m, int n, double *A, int nprob, int me, int ne, int L, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdshift (plasma_context_t *plasma)
void plasma_pdshift_quark (int m, int n, int L, double *A, int *leaders, int nleaders, int nprob, PLASMA_sequence *sequence, PLASMA_request *request)

Detailed Description

PLASMA InPlaceTransformation module. PLASMA is a software package provided by Univ. of Tennessee, Univ. of California Berkeley and Univ. of Colorado Denver.

This file implements an in-place transformation based on the GKK algorithm by Gustavson, Karlsson and Kagstrom, and on its Fortran implementation.

Version:
2.4.5
Author:
Mathieu Faverge
Date:
2010-11-15

Generated for double precision (d), Tue Nov 22 14:35:42 2011

Definition in file pdshift.c.
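
The core idea of the GKK transformation is to follow the cycles of the transposition permutation and shift their elements in place. The sketch below is illustrative only (it is not the PLASMA code): it transposes an m-by-n row-major matrix of doubles by moving the element at index k = i*n + j to index (k*m) mod (m*n - 1). pdshift.c applies the same kind of permutation to chunks of L contiguous elements and distributes whole cycles (the "leaders") across threads.

/* Minimal illustrative sketch, NOT the PLASMA implementation: in-place
 * transposition of an m-by-n row-major matrix by following permutation
 * cycles.  Element k = i*n + j moves to position (k*m) % (m*n - 1);
 * positions 0 and m*n-1 are fixed points. */
#include <stdio.h>
#include <stdlib.h>

static void transpose_inplace_sketch(double *A, int m, int n)
{
    size_t size = (size_t)m * (size_t)n;
    if (size < 2)
        return;
    size_t q = size - 1;
    char *seen = (char*)calloc(size, 1);   /* marks positions already written */
    if (seen == NULL)
        return;
    for (size_t start = 1; start < q; start++) {
        if (seen[start])
            continue;
        /* follow one cycle, carrying the displaced value along */
        double carry = A[start];
        size_t cur = start;
        do {
            size_t dest = (cur * (size_t)m) % q; /* where A[cur] belongs */
            double tmp = A[dest];
            A[dest] = carry;
            carry = tmp;
            seen[dest] = 1;
            cur = dest;
        } while (cur != start);
    }
    free(seen);
}

int main(void)
{
    double A[6] = { 0, 1, 2, 3, 4, 5 };     /* 2x3 row-major matrix */
    transpose_inplace_sketch(A, 2, 3);
    for (int k = 0; k < 6; k++)
        printf("%g ", A[k]);                /* expected: 0 3 1 4 2 5 */
    printf("\n");
    return 0;
}
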


Function Documentation

int plasma_dshift (plasma_context_t *plasma,
                   int m,
                   int n,
                   double *A,
                   int nprob,
                   int me,
                   int ne,
                   int L,
                   PLASMA_sequence *sequence,
                   PLASMA_request *request)

plasma_dgetmi2: implementation of in-place transposition based on the GKK algorithm by Gustavson, Karlsson and Kagstrom. The algorithm shifts cycles of elements to transpose the matrix.

Parameters:
    [in]     m      Number of rows of matrix A
    [in]     n      Number of columns of matrix A
    [in,out] A      Matrix of size L*m*n
    [in]     nprob  Number of parallel and independent problems
    [in]     me     Number of rows of each problem
    [in]     ne     Number of columns of each problem
    [in]     L      Size of the chunk to use for the transformation

Definition at line 60 of file pdshift.c.

References GKK_BalanceLoad(), GKK_getLeaderNbr(), L, minloc(), plasma_dynamic_call_9, PLASMA_ERR_ILLEGAL_VALUE, plasma_error(), PLASMA_GRPSIZE, plasma_pdshift(), plasma_request_fail(), PLASMA_SCHEDULING, plasma_shared_alloc(), plasma_shared_free(), PLASMA_SIZE, plasma_static_call_9, PLASMA_STATIC_SCHEDULING, PLASMA_SUCCESS, and PlasmaInteger.

{
    int *leaders = NULL;
    int ngrp, thrdbypb, thrdtot, nleaders;

    /* Check Plasma context */
    thrdtot  = PLASMA_SIZE;
    thrdbypb = PLASMA_GRPSIZE;
    ngrp     = thrdtot/thrdbypb;

    /* check input */
    if( (nprob * me * ne * L) != (m * n) ) {
        plasma_error(__func__, "problem size does not match matrix size");
        /*printf("m=%d, n=%d, nprob=%d, me=%d, ne=%d, L=%d\n", m, n, nprob, me, ne, L);*/
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( thrdbypb > thrdtot ) {
        plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( (thrdtot % thrdbypb) != 0 ) {
        plasma_error(__func__, "number of thread per problem must divide the total number of thread");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* quick return */
    if( (me < 2) || (ne < 2) || (nprob < 1) ) {
        return PLASMA_SUCCESS;
    }

    /* compute the cycle leaders: leaders[3*i] is the leader index,
       leaders[3*i+1] the cycle length, leaders[3*i+2] the owner thread
       (assigned below) */
    GKK_getLeaderNbr(me, ne, &nleaders, &leaders);
    nleaders *= 3;

    if ( PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING ) {
        int *Tp = NULL;
        int i, ipb;
        int owner;

        Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger);
        for (i=0; i<thrdtot; i++)
            Tp[i] = 0;

        ipb = 0;

        /* First part with coarse parallelism */
        if (nprob > ngrp) {
            ipb = (nprob / ngrp)*ngrp;

            /* loop over leaders */
            if (thrdbypb > 1) {
                for (i=0; i<nleaders; i+=3) {
                    /* assign this cycle to a thread */
                    owner = minloc(thrdbypb, Tp);
                    /* assign it to owner */
                    Tp[owner] = Tp[owner] + leaders[i+1] * L;
                    leaders[i+2] = owner;
                }
                GKK_BalanceLoad(thrdbypb, Tp, leaders, nleaders, L);
            }
            else {
                for (i=0; i<nleaders; i+=3) {
                    Tp[0] = Tp[0] + leaders[i+1] * L;
                    leaders[i+2] = 0;
                }
            }

            /* shift in parallel */
            for (i=0; i< (nprob/ngrp); i++) {
                plasma_static_call_9(plasma_pdshift,
                    int,              me,
                    int,              ne,
                    int,              L,
                    double*,          &(A[i*ngrp*me*ne*L]),
                    int *,            leaders,
                    int,              nleaders,
                    int,              thrdbypb,
                    PLASMA_sequence*, sequence,
                    PLASMA_request*,  request);
            }
        }

        /* Second part with fine parallelism */
        if (ipb < nprob) {
            for (i=0; i<thrdtot; i++)
                Tp[i] = 0;

            if (thrdtot > 1) {
                /* loop over leaders */
                for (i=0; i<nleaders; i+=3) {
                    /* assign this cycle to a thread */
                    owner = minloc(thrdtot, Tp);
                    /* assign it to owner */
                    Tp[owner] = Tp[owner] + leaders[i+1] * L;
                    leaders[i+2] = owner;
                }
                GKK_BalanceLoad(thrdtot, Tp, leaders, nleaders, L);
            }
            else {
                for (i=0; i<nleaders; i+=3) {
                    Tp[0] = Tp[0] + leaders[i+1] * L;
                    leaders[i+2] = 0;
                }
            }

            /* shift in parallel */
            for (i=ipb; i<nprob; i++) {
                plasma_static_call_9(plasma_pdshift,
                    int,              me,
                    int,              ne,
                    int,              L,
                    double*,          &(A[i*me*ne*L]),
                    int *,            leaders,
                    int,              nleaders,
                    int,              thrdtot,
                    PLASMA_sequence*, sequence,
                    PLASMA_request*,  request);
            }
        }
        plasma_shared_free(plasma, Tp);
    }
    /* Dynamic scheduling */
    else {
        plasma_dynamic_call_9(plasma_pdshift_quark,
            int,              me,
            int,              ne,
            int,              L,
            double*,          A,
            int *,            leaders,
            int,              nleaders,
            int,              nprob,
            PLASMA_sequence*, sequence,
            PLASMA_request*,  request);
    }

    free(leaders);
    return PLASMA_SUCCESS;
}
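
The static-scheduling branch above assigns cycles to threads with a simple greedy rule: each cycle goes to the thread whose accumulated work is currently smallest, after which GKK_BalanceLoad() refines the result (a cycle whose owner is recorded as -2 is handled as a split cycle in plasma_pdshift below). The following stand-alone sketch restates that greedy step; minloc_sketch and assign_cycles_sketch are hypothetical stand-ins, not PLASMA helpers.

/* Illustrative sketch of the greedy cycle-to-thread assignment used above.
 * leaders[3*i] is the cycle leader, leaders[3*i+1] the cycle length,
 * leaders[3*i+2] receives the owning thread. */
static int minloc_sketch(int nthreads, const int *Tp)
{
    int best = 0;
    for (int t = 1; t < nthreads; t++)
        if (Tp[t] < Tp[best])
            best = t;                      /* least-loaded thread so far */
    return best;
}

static void assign_cycles_sketch(int nthreads, int *Tp,
                                 int *leaders, int nleaders, int L)
{
    for (int i = 0; i < nleaders; i += 3) {
        int owner = minloc_sketch(nthreads, Tp);
        Tp[owner] += leaders[i+1] * L;     /* add this cycle's work (in elements) */
        leaders[i+2] = owner;              /* record ownership for plasma_pdshift */
    }
}
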


void plasma_pdshift (plasma_context_t *plasma)

plasma_pdshift shifts a batch of cycles in parallel.

Parameters:
    [in]     plasma    Plasma context
    [in]     m         Number of rows of the panel to shift
    [in]     n         Number of columns of the panel to shift
    [in]     L         Size of each chunk to shift (usually mb)
    [in,out] A         Panel to shift; each problem works on a block of m*n*L elements
    [in]     leaders   Array describing the cycles to shift (leader index, cycle length, owner thread)
    [in]     nleaders  Size of the leaders array (three entries per cycle)
    [in]     thrdbypb  Number of threads working on each problem

Definition at line 231 of file pdshift.c.

References A, CORE_dshiftw(), L, modpow(), plasma_barrier(), plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SUCCESS, plasma_unpack_args_9, PlasmaRealDouble, plasma_sequence_t::status, and W.

{
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    double *A, *Al, *W;
    int locrnk, myrank;
    int i, x, snix, cl, iprob;
    int n, m, L, nleaders, thrdbypb;
    int *leaders;
    int64_t s, q;

    plasma_unpack_args_9(m, n, L, A, leaders, nleaders, thrdbypb, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    myrank = PLASMA_RANK;
    locrnk = myrank % thrdbypb;
    iprob  = myrank / thrdbypb;
    q = m * n - 1;

    Al = &(A[iprob*m*n*L]);

    W = (double*)plasma_private_alloc(plasma, L, PlasmaRealDouble);

    /* shift cycles in parallel. */
    /* each thread shifts the cycles it owns. */
    for(i=0; i<nleaders; i+=3) {
        if( leaders[i+2] == locrnk ) {
            /* cycle #i belongs to this thread, so shift it */
            memcpy(W, &(Al[leaders[i]*L]), L*sizeof(double));
            CORE_dshiftw(leaders[i], leaders[i+1], m, n, L, Al, W);
        }
        else if( leaders[i+2] == -2 ) {
            /* cycle #i has been split, so shift in parallel */
            x  = leaders[i+1] / thrdbypb;
            cl = x;
            if( locrnk == 0 ) {
                cl = leaders[i+1] - x * (thrdbypb - 1);
            }
            s    = leaders[i];
            snix = (s * modpow(n, locrnk*x, m * n - 1)) % q;
            /* copy the block at s*n^(thid*x) (snix) */
            memcpy(W, &(Al[snix*L]), L*sizeof(double));
            /* wait for peers to finish copying their block. */
            plasma_barrier(plasma);
            /* shift the linear array. */
            if( cl > 0 ) {
                CORE_dshiftw(snix, cl, m, n, L, Al, W);
            }
        }
    }
    plasma_private_free(plasma, W);
}
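
In the split-cycle branch above, each thread jumps directly to its share of the cycle: the code computes its starting block as snix = s * n^(locrnk*x) mod (m*n - 1), where x is the number of blocks handled per thread. The modpow() helper referenced here (from primes.h) presumably performs modular exponentiation; the following generic square-and-multiply sketch shows what such a routine typically does, under that assumption (the actual signature in primes.h may differ).

/* Generic modular exponentiation by repeated squaring: computes
 * (base^exp) mod m in O(log exp) multiplications.  Sketch only; not the
 * PLASMA primes.h implementation. */
#include <stdint.h>

static int64_t modpow_sketch(int64_t base, int64_t exp, int64_t mod)
{
    int64_t result = 1 % mod;
    base %= mod;
    while (exp > 0) {
        if (exp & 1)
            result = (result * base) % mod;  /* multiply in the current bit */
        base = (base * base) % mod;          /* square for the next bit */
        exp >>= 1;
    }
    return result;
}
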


void plasma_pdshift_quark (int m,
                           int n,
                           int L,
                           double *A,
                           int *leaders,
                           int nleaders,
                           int nprob,
                           PLASMA_sequence *sequence,
                           PLASMA_request *request)

Definition at line 289 of file pdshift.c.

References CORE_foo_quark(), INOUT, L, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_dshift(), QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, TASK_SEQUENCE, TASKCOLOR, TASKLABEL, and VALUE.

{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    double *Al;
    int i, iprob, size;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    size = m*n*L;

    for(iprob=0; iprob<nprob; iprob++) {
        Al = &(A[iprob*size]);

        /* dummy task declaring an INOUT dependency on the whole block Al */
        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
            sizeof(double)*size, Al, INOUT,
#ifdef TRACE_IPT
            13, "Foo In shift",  VALUE | TASKLABEL,
            4,  "red",           VALUE | TASKCOLOR,
#endif
            0);

        /* shift cycles in parallel. */
        for(i=0; i<nleaders; i+=3) {
            //assert( leaders[i+2] != -2 );
            QUARK_CORE_dshift(plasma->quark, &task_flags,
                              leaders[i], m, n, L, Al);
        }

        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
            sizeof(double)*size, Al, INOUT,
#ifdef TRACE_IPT
            14, "Foo Out shift", VALUE | TASKLABEL,
            4,  "red",           VALUE | TASKCOLOR,
#endif
            0);
    }
}
