PLASMA 2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
pdshift.c File Reference
#include <stdlib.h>
#include <sys/types.h>
#include <assert.h>
#include "common.h"
#include "primes.h"
#include "gkkleader.h"

Functions

int plasma_dshift (plasma_context_t *plasma, int m, int n, double *A, int nprob, int me, int ne, int L, PLASMA_sequence *sequence, PLASMA_request *request)
void plasma_pdshift (plasma_context_t *plasma)
void plasma_pdshift_quark (int m, int n, int L, double *A, int *leaders, int nleaders, int nprob, PLASMA_sequence *sequence, PLASMA_request *request)

Detailed Description

PLASMA InPlaceTransformation module. PLASMA is a software package provided by Univ. of Tennessee, Univ. of California Berkeley and Univ. of Colorado Denver.

This file implements an in-place transformation based on the GKK algorithm by Gustavson, Karlsson and Kagstrom, and on its Fortran implementation.

Version:
2.4.5
Author:
Mathieu Faverge
Date:
2010-11-15

Generated for double precision (d), Tue Nov 22 14:35:42 2011

Definition in file pdshift.c.
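
The core idea of the GKK transformation is to follow the cycles of the transposition permutation and shift their elements in place. The sketch below is illustrative only (it is not the PLASMA code): it transposes an m-by-n row-major matrix of doubles by moving the element at index k = i*n + j to index (k*m) mod (m*n - 1). pdshift.c applies the same kind of permutation to chunks of L contiguous elements and distributes whole cycles (the "leaders") across threads.

/* Minimal illustrative sketch, NOT the PLASMA implementation: in-place
 * transposition of an m-by-n row-major matrix by following permutation
 * cycles.  Element k = i*n + j moves to position (k*m) % (m*n - 1);
 * positions 0 and m*n-1 are fixed points. */
#include <stdio.h>
#include <stdlib.h>

static void transpose_inplace_sketch(double *A, int m, int n)
{
    size_t size = (size_t)m * (size_t)n;
    if (size < 2)
        return;
    size_t q = size - 1;
    char *seen = (char*)calloc(size, 1);   /* marks positions already written */
    if (seen == NULL)
        return;
    for (size_t start = 1; start < q; start++) {
        if (seen[start])
            continue;
        /* follow one cycle, carrying the displaced value along */
        double carry = A[start];
        size_t cur = start;
        do {
            size_t dest = (cur * (size_t)m) % q; /* where A[cur] belongs */
            double tmp = A[dest];
            A[dest] = carry;
            carry = tmp;
            seen[dest] = 1;
            cur = dest;
        } while (cur != start);
    }
    free(seen);
}

int main(void)
{
    double A[6] = { 0, 1, 2, 3, 4, 5 };     /* 2x3 row-major matrix */
    transpose_inplace_sketch(A, 2, 3);
    for (int k = 0; k < 6; k++)
        printf("%g ", A[k]);                /* expected: 0 3 1 4 2 5 */
    printf("\n");
    return 0;
}
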


Function Documentation

int plasma_dshift (plasma_context_t *plasma,
                   int m,
                   int n,
                   double *A,
                   int nprob,
                   int me,
                   int ne,
                   int L,
                   PLASMA_sequence *sequence,
                   PLASMA_request *request)

plasma_dgetmi2: implementation of in-place transposition based on the GKK algorithm by Gustavson, Karlsson and Kagstrom. The algorithm shifts cycles of elements to transpose the matrix.

Parameters:
    [in]     m      Number of rows of matrix A
    [in]     n      Number of columns of matrix A
    [in,out] A      Matrix of size L*m*n
    [in]     nprob  Number of parallel and independent problems
    [in]     me     Number of rows of each problem
    [in]     ne     Number of columns of each problem
    [in]     L      Size of the chunk to use for the transformation

Definition at line 60 of file pdshift.c.

References GKK_BalanceLoad(), GKK_getLeaderNbr(), L, minloc(), plasma_dynamic_call_9, PLASMA_ERR_ILLEGAL_VALUE, plasma_error(), PLASMA_GRPSIZE, plasma_pdshift(), plasma_request_fail(), PLASMA_SCHEDULING, plasma_shared_alloc(), plasma_shared_free(), PLASMA_SIZE, plasma_static_call_9, PLASMA_STATIC_SCHEDULING, PLASMA_SUCCESS, and PlasmaInteger.

{
    int *leaders = NULL;
    int ngrp, thrdbypb, thrdtot, nleaders;

    /* Check Plasma context */
    thrdtot  = PLASMA_SIZE;
    thrdbypb = PLASMA_GRPSIZE;
    ngrp     = thrdtot/thrdbypb;

    /* check input */
    if( (nprob * me * ne * L) != (m * n) ) {
        plasma_error(__func__, "problem size does not match matrix size");
        /*printf("m=%d, n=%d, nprob=%d, me=%d, ne=%d, L=%d\n", m, n, nprob, me, ne, L);*/
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( thrdbypb > thrdtot ) {
        plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( (thrdtot % thrdbypb) != 0 ) {
        plasma_error(__func__, "number of thread per problem must divide the total number of thread");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* quick return */
    if( (me < 2) || (ne < 2) || (nprob < 1) ) {
        return PLASMA_SUCCESS;
    }

    /* compute the cycle leaders: leaders[3*i] is the leader index,
       leaders[3*i+1] the cycle length, leaders[3*i+2] the owner thread
       (assigned below) */
    GKK_getLeaderNbr(me, ne, &nleaders, &leaders);
    nleaders *= 3;

    if ( PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING ) {
        int *Tp = NULL;
        int i, ipb;
        int owner;

        Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger);
        for (i=0; i<thrdtot; i++)
            Tp[i] = 0;

        ipb = 0;

        /* First part with coarse parallelism */
        if (nprob > ngrp) {
            ipb = (nprob / ngrp)*ngrp;

            /* loop over leaders */
            if (thrdbypb > 1) {
                for (i=0; i<nleaders; i+=3) {
                    /* assign this cycle to a thread */
                    owner = minloc(thrdbypb, Tp);
                    /* assign it to owner */
                    Tp[owner] = Tp[owner] + leaders[i+1] * L;
                    leaders[i+2] = owner;
                }
                GKK_BalanceLoad(thrdbypb, Tp, leaders, nleaders, L);
            }
            else {
                for (i=0; i<nleaders; i+=3) {
                    Tp[0] = Tp[0] + leaders[i+1] * L;
                    leaders[i+2] = 0;
                }
            }

            /* shift in parallel */
            for (i=0; i< (nprob/ngrp); i++) {
                plasma_static_call_9(plasma_pdshift,
                    int,              me,
                    int,              ne,
                    int,              L,
                    double*,          &(A[i*ngrp*me*ne*L]),
                    int *,            leaders,
                    int,              nleaders,
                    int,              thrdbypb,
                    PLASMA_sequence*, sequence,
                    PLASMA_request*,  request);
            }
        }

        /* Second part with fine parallelism */
        if (ipb < nprob) {
            for (i=0; i<thrdtot; i++)
                Tp[i] = 0;

            if (thrdtot > 1) {
                /* loop over leaders */
                for (i=0; i<nleaders; i+=3) {
                    /* assign this cycle to a thread */
                    owner = minloc(thrdtot, Tp);
                    /* assign it to owner */
                    Tp[owner] = Tp[owner] + leaders[i+1] * L;
                    leaders[i+2] = owner;
                }
                GKK_BalanceLoad(thrdtot, Tp, leaders, nleaders, L);
            }
            else {
                for (i=0; i<nleaders; i+=3) {
                    Tp[0] = Tp[0] + leaders[i+1] * L;
                    leaders[i+2] = 0;
                }
            }

            /* shift in parallel */
            for (i=ipb; i<nprob; i++) {
                plasma_static_call_9(plasma_pdshift,
                    int,              me,
                    int,              ne,
                    int,              L,
                    double*,          &(A[i*me*ne*L]),
                    int *,            leaders,
                    int,              nleaders,
                    int,              thrdtot,
                    PLASMA_sequence*, sequence,
                    PLASMA_request*,  request);
            }
        }
        plasma_shared_free(plasma, Tp);
    }
    /* Dynamic scheduling */
    else {
        plasma_dynamic_call_9(plasma_pdshift_quark,
            int,              me,
            int,              ne,
            int,              L,
            double*,          A,
            int *,            leaders,
            int,              nleaders,
            int,              nprob,
            PLASMA_sequence*, sequence,
            PLASMA_request*,  request);
    }

    free(leaders);
    return PLASMA_SUCCESS;
}
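
The static-scheduling branch above assigns cycles to threads with a simple greedy rule: each cycle goes to the thread whose accumulated work is currently smallest, after which GKK_BalanceLoad() refines the result (a cycle whose owner is recorded as -2 is handled as a split cycle in plasma_pdshift below). The following stand-alone sketch restates that greedy step; minloc_sketch and assign_cycles_sketch are hypothetical stand-ins, not PLASMA helpers.

/* Illustrative sketch of the greedy cycle-to-thread assignment used above.
 * leaders[3*i] is the cycle leader, leaders[3*i+1] the cycle length,
 * leaders[3*i+2] receives the owning thread. */
static int minloc_sketch(int nthreads, const int *Tp)
{
    int best = 0;
    for (int t = 1; t < nthreads; t++)
        if (Tp[t] < Tp[best])
            best = t;                      /* least-loaded thread so far */
    return best;
}

static void assign_cycles_sketch(int nthreads, int *Tp,
                                 int *leaders, int nleaders, int L)
{
    for (int i = 0; i < nleaders; i += 3) {
        int owner = minloc_sketch(nthreads, Tp);
        Tp[owner] += leaders[i+1] * L;     /* add this cycle's work (in elements) */
        leaders[i+2] = owner;              /* record ownership for plasma_pdshift */
    }
}
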


void plasma_pdshift (plasma_context_t *plasma)

plasma_pdshift shifts a batch of cycles in parallel.

Parameters:
    [in]     plasma    Plasma context
    [in]     m         Number of rows of the panel to shift
    [in]     n         Number of columns of the panel to shift
    [in]     L         Size of each chunk to shift (usually mb)
    [in,out] A         Panel to shift; each problem works on a block of m*n*L elements
    [in]     leaders   Array describing the cycles to shift (leader index, cycle length, owner thread)
    [in]     nleaders  Size of the leaders array (three entries per cycle)
    [in]     thrdbypb  Number of threads working on each problem

Definition at line 231 of file pdshift.c.

References A, CORE_dshiftw(), L, modpow(), plasma_barrier(), plasma_private_alloc(), plasma_private_free(), PLASMA_RANK, PLASMA_SUCCESS, plasma_unpack_args_9, PlasmaRealDouble, plasma_sequence_t::status, and W.

{
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    double *A, *Al, *W;
    int locrnk, myrank;
    int i, x, snix, cl, iprob;
    int n, m, L, nleaders, thrdbypb;
    int *leaders;
    int64_t s, q;

    plasma_unpack_args_9(m, n, L, A, leaders, nleaders, thrdbypb, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    myrank = PLASMA_RANK;
    locrnk = myrank % thrdbypb;
    iprob  = myrank / thrdbypb;
    q = m * n - 1;

    Al = &(A[iprob*m*n*L]);

    W = (double*)plasma_private_alloc(plasma, L, PlasmaRealDouble);

    /* shift cycles in parallel. */
    /* each thread shifts the cycles it owns. */
    for(i=0; i<nleaders; i+=3) {
        if( leaders[i+2] == locrnk ) {
            /* cycle #i belongs to this thread, so shift it */
            memcpy(W, &(Al[leaders[i]*L]), L*sizeof(double));
            CORE_dshiftw(leaders[i], leaders[i+1], m, n, L, Al, W);
        }
        else if( leaders[i+2] == -2 ) {
            /* cycle #i has been split, so shift in parallel */
            x  = leaders[i+1] / thrdbypb;
            cl = x;
            if( locrnk == 0 ) {
                cl = leaders[i+1] - x * (thrdbypb - 1);
            }
            s    = leaders[i];
            snix = (s * modpow(n, locrnk*x, m * n - 1)) % q;
            /* copy the block at s*n^(thid*x) (snix) */
            memcpy(W, &(Al[snix*L]), L*sizeof(double));
            /* wait for peers to finish copying their block. */
            plasma_barrier(plasma);
            /* shift the linear array. */
            if( cl > 0 ) {
                CORE_dshiftw(snix, cl, m, n, L, Al, W);
            }
        }
    }
    plasma_private_free(plasma, W);
}
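
In the split-cycle branch above, each thread jumps directly to its share of the cycle: the code computes its starting block as snix = s * n^(locrnk*x) mod (m*n - 1), where x is the number of blocks handled per thread. The modpow() helper referenced here (from primes.h) presumably performs modular exponentiation; the following generic square-and-multiply sketch shows what such a routine typically does, under that assumption (the actual signature in primes.h may differ).

/* Generic modular exponentiation by repeated squaring: computes
 * (base^exp) mod m in O(log exp) multiplications.  Sketch only; not the
 * PLASMA primes.h implementation. */
#include <stdint.h>

static int64_t modpow_sketch(int64_t base, int64_t exp, int64_t mod)
{
    int64_t result = 1 % mod;
    base %= mod;
    while (exp > 0) {
        if (exp & 1)
            result = (result * base) % mod;  /* multiply in the current bit */
        base = (base * base) % mod;          /* square for the next bit */
        exp >>= 1;
    }
    return result;
}
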


void plasma_pdshift_quark (int m,
                           int n,
                           int L,
                           double *A,
                           int *leaders,
                           int nleaders,
                           int nprob,
                           PLASMA_sequence *sequence,
                           PLASMA_request *request)

Definition at line 289 of file pdshift.c.

References CORE_foo_quark(), INOUT, L, plasma_context_self(), PLASMA_SUCCESS, plasma_context_struct::quark, QUARK_CORE_dshift(), QUARK_Insert_Task(), plasma_sequence_t::quark_sequence, QUARK_Task_Flag_Set(), Quark_Task_Flags_Initializer, plasma_sequence_t::status, TASK_SEQUENCE, TASKCOLOR, TASKLABEL, and VALUE.

{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;
    double *Al;
    int i, iprob, size;

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;
    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    size = m*n*L;

    for(iprob=0; iprob<nprob; iprob++) {
        Al = &(A[iprob*size]);

        /* dummy task declaring an INOUT dependency on the whole block Al */
        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
            sizeof(double)*size, Al, INOUT,
#ifdef TRACE_IPT
            13, "Foo In shift",  VALUE | TASKLABEL,
            4,  "red",           VALUE | TASKCOLOR,
#endif
            0);

        /* shift cycles in parallel. */
        for(i=0; i<nleaders; i+=3) {
            //assert( leaders[i+2] != -2 );
            QUARK_CORE_dshift(plasma->quark, &task_flags,
                              leaders[i], m, n, L, Al);
        }

        QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
            sizeof(double)*size, Al, INOUT,
#ifdef TRACE_IPT
            14, "Foo Out shift", VALUE | TASKLABEL,
            4,  "red",           VALUE | TASKCOLOR,
#endif
            0);
    }
}
