001: /* ///////////////////////////// P /// L /// A /// S /// M /// A /////////////////////////////// */
002: /* ///                    PLASMA computational routines (version 2.1.0)                      ///
003:  * ///                    Author: Jakub Kurzak                                               ///
004:  * ///                    Release Date: November, 15th 2009                                  ///
005:  * ///                    PLASMA is a software package provided by Univ. of Tennessee,       ///
006:  * ///                    Univ. of California Berkeley and Univ. of Colorado Denver          /// */
007: /* ///////////////////////////////////////////////////////////////////////////////////////////// */
008: #include "common.h"
009: 
010: /* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
011: // PLASMA_cgesv - Computes the solution to a system of linear equations A * X = B,
012: // where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
013: // The tile LU decomposition with partial tile pivoting and row interchanges is used to factor A.
014: // The factored form of A is then used to solve the system of equations A * X = B.
015: 
016: /* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
017: // N        int (IN)
018: //          The number of linear equations, i.e., the order of the matrix A. N >= 0.
019: //
020: // NRHS     int (IN)
021: //          The number of right hand sides, i.e., the number of columns of the matrix B.
022: //          NRHS >= 0.
023: //
024: // A        PLASMA_Complex32_t* (INOUT)
025: //          On entry, the N-by-N coefficient matrix A.
026: //          On exit, the tile L and U factors from the factorization (not equivalent to LAPACK).
027: //
028: // LDA      int (IN)
029: //          The leading dimension of the array A. LDA >= max(1,N).
030: //
031: // L        PLASMA_Complex32_t* (OUT)
032: //          On exit, auxiliary factorization data, related to the tile L factor,
033: //          necessary to solve the system of equations.
034: //
035: // IPIV     int* (OUT)
036: //          On exit, the pivot indices that define the permutations (not equivalent to LAPACK).
037: //
038: // B        PLASMA_Complex32_t* (INOUT)
//          On entry, the N-by-NRHS right hand side matrix B.
040: //          On exit, if return value = 0, the N-by-NRHS solution matrix X.
041: //
042: // LDB      int (IN)
043: //          The leading dimension of the array B. LDB >= max(1,N).
044: 
045: /* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
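//          = 0: successful exit
//          < 0: if -i, the i-th argument had an illegal value
//          > 0: if i, U(i,i) is exactly zero. The factorization has been completed,
//               but the factor U is exactly singular, so the solution could not be computed.
//          PLASMA_ERR_NOT_INITIALIZED:  PLASMA has not been initialized
//          PLASMA_ERR_OUT_OF_RESOURCES: the block-layout work buffers could not be allocated
//          A failure status reported by plasma_tune() is returned unchanged.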
050: 
051: /* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
052: int PLASMA_cgesv(int N, int NRHS, PLASMA_Complex32_t *A, int LDA, PLASMA_Complex32_t *L, int *IPIV,
053:                  PLASMA_Complex32_t *B, int LDB)
054: {
055:     int NB, NT, NTRHS;
056:     int status;
057:     PLASMA_Complex32_t *Abdl;
058:     PLASMA_Complex32_t *Bbdl;
059:     PLASMA_Complex32_t *Lbdl;
060:     plasma_context_t *plasma;
061: 
062:     plasma = plasma_context_self();
063:     if (plasma == NULL) {
064:         plasma_error("PLASMA_cgesv", "PLASMA not initialized");
065:         return PLASMA_ERR_NOT_INITIALIZED;
066:     }
067:     /* Check input arguments */
068:     if (N < 0) {
069:         plasma_error("PLASMA_cgesv", "illegal value of N");
070:         return -1;
071:     }
072:     if (NRHS < 0) {
073:         plasma_error("PLASMA_cgesv", "illegal value of NRHS");
074:         return -2;
075:     }
076:     if (LDA < max(1, N)) {
077:         plasma_error("PLASMA_cgesv", "illegal value of LDA");
078:         return -4;
079:     }
080:     if (LDB < max(1, N)) {
081:         plasma_error("PLASMA_cgesv", "illegal value of LDB");
082:         return -8;
083:     }
084:     /* Quick return */
085:     if (min(N, NRHS) == 0)
086:         return PLASMA_SUCCESS;
087: 
088:     /* Tune NB & IB depending on M, N & NRHS; Set NBNBSIZE */
089:     status = plasma_tune(PLASMA_FUNC_CGESV, N, N, NRHS);
090:     if (status != PLASMA_SUCCESS) {
091:         plasma_error("PLASMA_cgesv", "plasma_tune() failed");
092:         return status;
093:     }
094: 
095:     /* Set NT & NTRHS */
096:     NB = PLASMA_NB;
097:     NT = (N%NB==0) ? (N/NB) : (N/NB+1);
098:     NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
099: 
100:     /* Allocate memory for matrices in block layout */
101:     Abdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaComplexFloat);
102:     Lbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexFloat);
103:     Bbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexFloat);
104:     if (Abdl == NULL || Lbdl == NULL || Bbdl == NULL) {
105:         plasma_error("PLASMA_cgesv", "plasma_shared_alloc() failed");
106:         plasma_shared_free(plasma, Abdl);
107:         plasma_shared_free(plasma, Lbdl);
108:         plasma_shared_free(plasma, Bbdl);
109:         return PLASMA_ERR_OUT_OF_RESOURCES;
110:     }
111: 
112:     PLASMA_desc descA = plasma_desc_init(
113:         Abdl, PlasmaComplexFloat,
114:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
115:         N, N, 0, 0, N, N);
116: 
117:     PLASMA_desc descB = plasma_desc_init(
118:         Bbdl, PlasmaComplexFloat,
119:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
120:         N, NRHS, 0, 0, N, NRHS);
121: 
122:     PLASMA_desc descL = plasma_desc_init(
123:         Lbdl, PlasmaComplexFloat,
124:         PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
125:         N, N, 0, 0, N, N);
126: 
127:     plasma_parallel_call_3(plasma_lapack_to_tile,
128:         PLASMA_Complex32_t*, A,
129:         int, LDA,
130:         PLASMA_desc, descA);
131: 
132:     plasma_parallel_call_3(plasma_lapack_to_tile,
133:         PLASMA_Complex32_t*, B,
134:         int, LDB,
135:         PLASMA_desc, descB);
136: 
137:     /* Call the native interface */
138:     status = PLASMA_cgesv_Tile(&descA, &descL, IPIV, &descB);
139: 
140:     if (status == PLASMA_SUCCESS) {
141:         /* Return L to the user */
142:         plasma_memcpy(L, Lbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexFloat);
143: 
144:         plasma_parallel_call_3(plasma_tile_to_lapack,
145:             PLASMA_desc, descA,
146:             PLASMA_Complex32_t*, A,
147:             int, LDA);
148: 
149:         plasma_parallel_call_3(plasma_tile_to_lapack,
150:             PLASMA_desc, descB,
151:             PLASMA_Complex32_t*, B,
152:             int, LDB);
153:     }
154:     plasma_shared_free(plasma, Abdl);
155:     plasma_shared_free(plasma, Lbdl);
156:     plasma_shared_free(plasma, Bbdl);
157:     return status;
158: }
159: 
160: /* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
161: // PLASMA_cgesv_Tile - Computes the solution to a system of linear equations A * X = B,
162: // where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
163: // The tile LU decomposition with partial tile pivoting and row interchanges is used to factor A.
164: // The factored form of A is then used to solve the system of equations A * X = B.
165: // All matrices are passed through descriptors. All dimensions are taken from the descriptors.
166: 
167: /* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
// A        PLASMA_desc* (INOUT)
//          On entry, descriptor of the N-by-N coefficient matrix A.
//          On exit, the described matrix holds the tile L and U factors from the factorization
//          (not equivalent to LAPACK).
//
// L        PLASMA_desc* (INOUT)
//          On entry, descriptor of the storage for the auxiliary factorization data.
//          On exit, that storage holds the auxiliary factorization data, related to the
//          tile L factor, necessary to solve the system of equations.
//
// IPIV     int* (OUT)
//          On exit, the pivot indices that define the permutations (not equivalent to LAPACK).
//
// B        PLASMA_desc* (INOUT)
//          On entry, descriptor of the N-by-NRHS right hand side matrix B.
//          On exit, if return value = 0, the described matrix holds the N-by-NRHS solution matrix X.
182: 
183: /* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
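//          = 0: successful exit
//          > 0: if i, U(i,i) is exactly zero. The factorization has been completed,
//               but the factor U is exactly singular, so the solution could not be computed.
//          PLASMA_ERR_NOT_INITIALIZED: PLASMA has not been initialized
//          PLASMA_ERR_ILLEGAL_VALUE:   a descriptor is invalid, or the tiles of A or B are not square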
187: 
188: /* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
189: int PLASMA_cgesv_Tile(PLASMA_desc *A, PLASMA_desc *L, int *IPIV, PLASMA_desc *B)
190: {
191:     PLASMA_desc descA = *A;
192:     PLASMA_desc descL = *L;
193:     PLASMA_desc descB = *B;
194:     plasma_context_t *plasma;
195: 
196:     plasma = plasma_context_self();
197:     if (plasma == NULL) {
198:         plasma_fatal_error("PLASMA_cgesv_Tile", "PLASMA not initialized");
199:         return PLASMA_ERR_NOT_INITIALIZED;
200:     }
201:     /* Check descriptors for correctness */
202:     if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
203:         plasma_error("PLASMA_cgesv_Tile", "invalid first descriptor");
204:         return PLASMA_ERR_ILLEGAL_VALUE;
205:     }
206:     if (plasma_desc_check(&descL) != PLASMA_SUCCESS) {
207:         plasma_error("PLASMA_cgesv_Tile", "invalid second descriptor");
208:         return PLASMA_ERR_ILLEGAL_VALUE;
209:     }
210:     if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
211:         plasma_error("PLASMA_cgesv_Tile", "invalid third descriptor");
212:         return PLASMA_ERR_ILLEGAL_VALUE;
213:     }
214:     /* Check input arguments */
215:     if (descA.nb != descA.mb || descB.nb != descB.mb) {
216:         plasma_error("PLASMA_cgesv_Tile", "only square tiles supported");
217:         return PLASMA_ERR_ILLEGAL_VALUE;
218:     }
219:     /* Quick return */
220: /*
221:     if (min(N, NRHS) == 0)
222:         return PLASMA_SUCCESS;
223: */
224:     /* Clear IPIV and Lbdl */
225:     plasma_memzero(IPIV, descA.mt*descA.nt*PLASMA_NB, PlasmaInteger);
226:     plasma_memzero(descL.mat, descL.mt*descL.nt*PLASMA_IBNBSIZE, PlasmaComplexFloat);
227: 
228:     /* Set INFO to ZERO */
229:     PLASMA_INFO = PLASMA_SUCCESS;
230: 
231:     plasma_parallel_call_3(plasma_pcgetrf,
232:         PLASMA_desc, descA,
233:         PLASMA_desc, descL,
234:         int*, IPIV);
235: 
236:     if (PLASMA_INFO == PLASMA_SUCCESS) {
237:         plasma_parallel_call_4(plasma_pctrsmpl,
238:             PLASMA_desc, descA,
239:             PLASMA_desc, descB,
240:             PLASMA_desc, descL,
241:             int*, IPIV);
242: 
243:         plasma_parallel_call_7(plasma_pctrsm,
244:             PLASMA_enum, PlasmaLeft,
245:             PLASMA_enum, PlasmaUpper,
246:             PLASMA_enum, PlasmaNoTrans,
247:             PLASMA_enum, PlasmaNonUnit,
248:             PLASMA_Complex32_t, 1.0,
249:             PLASMA_desc, descA,
250:             PLASMA_desc, descB);
251:     }
252:     return PLASMA_INFO;
253: }
254: