001: /* ///////////////////////////// P /// L /// A /// S /// M /// A /////////////////////////////// */
002: /* ///                    PLASMA computational routines (version 2.1.0)                      ///
003:  * ///                    Author: Jakub Kurzak                                               ///
004:  * ///                    Release Date: November, 15th 2009                                  ///
005:  * ///                    PLASMA is a software package provided by Univ. of Tennessee,       ///
006:  * ///                    Univ. of California Berkeley and Univ. of Colorado Denver          /// */
007: /* ///////////////////////////////////////////////////////////////////////////////////////////// */
008: #include "common.h"
009: 
010: /* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
011: // PLASMA_strsmpl - Performs the forward substitution step of solving a system of linear equations
012: // after the tile LU factorization of the matrix.
013: 
014: /* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
015: // N        int (IN)
016: //          The order of the matrix A. N >= 0.
017: //
018: // NRHS     int (IN)
019: //          The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0.
020: //
021: // A        float* (IN)
022: //          The tile factor L from the factorization, computed by PLASMA_sgetrf.
023: //
024: // LDA      int (IN)
025: //          The leading dimension of the array A. LDA >= max(1,N).
026: //
027: // L        float* (IN)
028: //          Auxiliary factorization data, related to the tile L factor, computed by PLASMA_sgetrf.
029: //
030: // IPIV     int* (IN)
031: //          The pivot indices from PLASMA_sgetrf (not equivalent to LAPACK).
032: //
033: // B        float* (INOUT)
034: //          On entry, the N-by-NRHS right hand side matrix B.
035: //          On exit, if return value = 0, the N-by-NRHS solution matrix X.
036: //
037: // LDB      float* (IN)
038: //          The leading dimension of the array B. LDB >= max(1,N).
039: 
040: /* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
041: //          = 0: successful exit
042: //          < 0: if -i, the i-th argument had an illegal value
043: 
044: /* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
045: int PLASMA_strsmpl(int N, int NRHS, float *A, int LDA, float *L,
046:                    int *IPIV, float *B, int LDB)
047: {
048:     int NB, NT, NTRHS;
049:     int status;
050:     float *Abdl;
051:     float *Bbdl;
052:     float *Lbdl;
053:     plasma_context_t *plasma;
054: 
055:     plasma = plasma_context_self();
056:     if (plasma == NULL) {
057:         plasma_fatal_error("PLASMA_strsmpl", "PLASMA not initialized");
058:         return PLASMA_ERR_NOT_INITIALIZED;
059:     }
060:     /* Check input arguments */
061:     if (N < 0) {
062:         plasma_error("PLASMA_strsmpl", "illegal value of N");
063:         return -1;
064:     }
065:     if (NRHS < 0) {
066:         plasma_error("PLASMA_strsmpl", "illegal value of NRHS");
067:         return -2;
068:     }
069:     if (LDA < max(1, N)) {
070:         plasma_error("PLASMA_strsmpl", "illegal value of LDA");
071:         return -4;
072:     }
073:     if (LDB < max(1, N)) {
074:         plasma_error("PLASMA_strsmpl", "illegal value of LDB");
075:         return -8;
076:     }
077:     /* Quick return */
078:     if (min(N, NRHS) == 0)
079:         return PLASMA_SUCCESS;
080: 
081:     /* Tune NB & IB depending on N & NRHS; Set NBNBSIZE */
082:     status = plasma_tune(PLASMA_FUNC_SGESV, N, N, NRHS);
083:     if (status != PLASMA_SUCCESS) {
084:         plasma_error("PLASMA_strsmpl", "plasma_tune() failed");
085:         return status;
086:     }
087: 
088:     /* Set Mt, NT & NTRHS */
089:     NB = PLASMA_NB;
090:     NT = (N%NB==0) ? (N/NB) : (N/NB+1);
091:     NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
092: 
093:     /* Allocate memory for matrices in block layout */
094:     Abdl = (float *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaRealFloat);
095:     Lbdl = (float *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaRealFloat);
096:     Bbdl = (float *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaRealFloat);
097:     if (Abdl == NULL || Lbdl == NULL || Bbdl == NULL) {
098:         plasma_error("PLASMA_strsmpl", "plasma_shared_alloc() failed");
099:         plasma_shared_free(plasma, Abdl);
100:         plasma_shared_free(plasma, Lbdl);
101:         plasma_shared_free(plasma, Bbdl);
102:         return PLASMA_ERR_OUT_OF_RESOURCES;
103:     }
104: 
105:     PLASMA_desc descA = plasma_desc_init(
106:         Abdl, PlasmaRealFloat,
107:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
108:         N, N, 0, 0, N, N);
109: 
110:     PLASMA_desc descB = plasma_desc_init(
111:         Bbdl, PlasmaRealFloat,
112:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
113:         N, NRHS, 0, 0, N, NRHS);
114: 
115:     PLASMA_desc descL = plasma_desc_init(
116:         Lbdl, PlasmaRealFloat,
117:         PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
118:         N, N, 0, 0, N, N);
119: 
120:     plasma_parallel_call_3(plasma_lapack_to_tile,
121:         float*, A,
122:         int, LDA,
123:         PLASMA_desc, descA);
124: 
125:     plasma_parallel_call_3(plasma_lapack_to_tile,
126:         float*, B,
127:         int, LDB,
128:         PLASMA_desc, descB);
129: 
130:     /* Receive L from the user */
131:     plasma_memcpy(Lbdl, L, NT*NT*PLASMA_IBNBSIZE, PlasmaRealFloat);
132: 
133:     /* Call the native interface */
134:     status = PLASMA_strsmpl_Tile(&descA, &descL, IPIV, &descB);
135: 
136:     if (status == PLASMA_SUCCESS)
137:         plasma_parallel_call_3(plasma_tile_to_lapack,
138:             PLASMA_desc, descB,
139:             float*, B,
140:             int, LDB);
141: 
142:     plasma_shared_free(plasma, Abdl);
143:     plasma_shared_free(plasma, Lbdl);
144:     plasma_shared_free(plasma, Bbdl);
145:     return status;
146: }
147: 
148: /* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
149: // PLASMA_strsmpl_Tile - Performs the forward substitution step of solving a system of linear
150: // equations after the tile LU factorization of the matrix.
151: // All matrices are passed through descriptors. All dimensions are taken from the descriptors.
152: 
153: /* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
154: // A        float* (IN)
155: //          The tile factor L from the factorization, computed by PLASMA_sgetrf.
156: //
157: // L        float* (IN)
158: //          Auxiliary factorization data, related to the tile L factor, computed by PLASMA_sgetrf.
159: //
160: // IPIV     int* (IN)
161: //          The pivot indices from PLASMA_sgetrf (not equivalent to LAPACK).
162: //
163: // B        float* (INOUT)
164: //          On entry, the N-by-NRHS right hand side matrix B.
165: //          On exit, if return value = 0, the N-by-NRHS solution matrix X.
166: 
167: /* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
168: //          = 0: successful exit
169: 
170: /* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
171: int PLASMA_strsmpl_Tile(PLASMA_desc *A, PLASMA_desc *L, int *IPIV, PLASMA_desc *B)
172: {
173:     PLASMA_desc descA = *A;
174:     PLASMA_desc descL = *L;
175:     PLASMA_desc descB = *B;
176:     plasma_context_t *plasma;
177: 
178:     plasma = plasma_context_self();
179:     if (plasma == NULL) {
180:         plasma_fatal_error("PLASMA_strsmpl_Tile", "PLASMA not initialized");
181:         return PLASMA_ERR_NOT_INITIALIZED;
182:     }
183:     /* Check descriptors for correctness */
184:     if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
185:         plasma_error("PLASMA_strsmpl_Tile", "invalid first descriptor");
186:         return PLASMA_ERR_ILLEGAL_VALUE;
187:     }
188:     if (plasma_desc_check(&descL) != PLASMA_SUCCESS) {
189:         plasma_error("PLASMA_strsmpl_Tile", "invalid second descriptor");
190:         return PLASMA_ERR_ILLEGAL_VALUE;
191:     }
192:     if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
193:         plasma_error("PLASMA_strsmpl_Tile", "invalid third descriptor");
194:         return PLASMA_ERR_ILLEGAL_VALUE;
195:     }
196:     /* Check input arguments */
197:     if (descA.nb != descA.mb || descB.nb != descB.mb) {
198:         plasma_error("PLASMA_strsmpl_Tile", "only square tiles supported");
199:         return PLASMA_ERR_ILLEGAL_VALUE;
200:     }
201:     /* Quick return */
202: /*
203:     if (min(N, NRHS) == 0)
204:         return PLASMA_SUCCESS;
205: */
206:     plasma_parallel_call_4(plasma_pstrsmpl,
207:         PLASMA_desc, descA,
208:         PLASMA_desc, descB,
209:         PLASMA_desc, descL,
210:         int*, IPIV);
211: 
212:     return PLASMA_SUCCESS;
213: }
214: