001: /* ///////////////////////////////////////////////////////////////////////////////////////
002:  *  -- PLASMA --
003:  *     University of Tennessee
004:  */
005: #include "common.h"
006: 
007: #include <stdio.h>
008: #include <math.h>
009: #include <cblas.h>
010: #include "../src/lapack.h"
011: 
012: /* ///////////////////////////// P /// L /// A /// S /// M /// A /////////////////////////////// */
013: /* ///                    PLASMA computational routine (version 2.0.0)                       ///
014:  * ///                    Release Date: July, 4th 2009                                       ///
015:  * ///                    PLASMA is a software package provided by Univ. of Tennessee,       ///
016:  * ///                    Univ. of California Berkeley and Univ. of Colorado Denver          /// */
017: 
018: /* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
019: // PLASMA_zcgesv - Computes the solution to a system of linear equations A * X = B,
020: // where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
021: // The tile LU decomposition with partial tile pivoting and row interchanges is used to factor A.
022: // The factored form of A is then used to solve the system of equations A * X = B.
023: // All matrices are passed through descriptors. All dimensions are taken from the descriptors.
024: //
025: // IMPORTANT NOTICE: in its current state, this routine only intends to be a proof-of-concept. 
026: // There are still some costly serial parts and one may NOT expect to achieve high performance.
027: 
028: // PLASMA_zcgesv first attempts to factorize the matrix in COMPLEX and use this
029: // factorization within an iterative refinement procedure to produce a
030: // solution with COMPLEX*16 normwise backward error quality (see below).
031: // If the approach fails the method switches to a COMPLEX*16
032: // factorization and solve.
033: //
034: // The iterative refinement is not going to be a winning strategy if
035: // the ratio COMPLEX performance over COMPLEX*16 performance is too
036: // small. A reasonable strategy should take the number of right-hand
037: // sides and the size of the matrix into account. This might be done
038: // with a call to ILAENV in the future. Up to now, we always try
039: // iterative refinement.
040: //
041: // The iterative refinement process is stopped if ITER > ITERMAX or
042: // for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
043: // where
044: // o ITER is the number of the current iteration in the
045: //   iterative refinement process
046: // o RNRM is the infinity-norm of the residual
047: // o XNRM is the infinity-norm of the solution
048: // o ANRM is the infinity-operator-norm of the matrix A
049: // o EPS is the machine epsilon returned by DLAMCH('Epsilon').
050: // The values ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.
051: 
052: /* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
053: // N        int (IN)
054: //          The number of linear equations, i.e., the order of the matrix A. N >= 0.
055: //
056: // NRHS     int (IN)
057: //          The number of right hand sides, i.e., the number of columns of the matrix B.
058: //          NRHS >= 0.
059: //
060: // A        PLASMA_Complex64_t* (INOUT)
061: //          On entry, the N-by-N coefficient matrix A.
062: //          On exit, the tile L and U factors from the factorization (not equivalent to LAPACK).
063: //
064: // LDA      int (IN)
065: //          The leading dimension of the array A. LDA >= max(1,N).
066: //
067: // L        PLASMA_Complex64_t* (OUT)
068: //          On exit, auxiliary factorization data, related to the tile L factor,
069: //          necessary to solve the system of equations.
070: //
071: // IPIV     int* (OUT)
072: //          On exit, the pivot indices that define the permutations (not equivalent to LAPACK).
073: //
074: // B        PLASMA_Complex64_t* (IN)
075: //          The N-by-NRHS matrix of right hand side matrix B.
076: //
077: // LDB      int (IN)
078: //          The leading dimension of the array B. LDB >= max(1,N).
079: //
080: // X        PLASMA_Complex64_t* (OUT)
081: //          If return value = 0, the N-by-NRHS solution matrix X.
082: //
083: // LDX      int (IN)
084: //          The leading dimension of the array B. LDX >= max(1,N).
085: //
086: // ITER     int* (OUT)is the number of the current iteration in the iterative refinement process
087: 
088: 
089: /* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
090: //          = 0: successful exit
091: //          < 0: if -i, the i-th argument had an illegal value
092: //          > 0: if i, U(i,i) is exactly zero. The factorization has been completed,
093: //               but the factor U is exactly singular, so the solution could not be computed.
094: 
095: /* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
096: 
097:   int PLASMA_zcgesv_Tile(PLASMA_desc *Atile, PLASMA_desc *Ltile, int *IPIV, PLASMA_desc *Btile, PLASMA_desc *Xtile, int *ITER)
098: {
099: 
100:     PLASMA_desc descA = *Atile;
101:     PLASMA_desc descL = *Ltile;
102:     PLASMA_desc descX = *Xtile;
103:     PLASMA_desc descB = *Btile;
104: 
105:     int N, LDA, LDB, LDX, NRHS, NTRHS, NB, NT;
106:     N=LDA=LDB=LDX=descA.n;
107:     NRHS=descX.n;
108: 
109:     PLASMA_Complex32_t *SAbdl;
110:     PLASMA_Complex32_t *SXbdl;
111:     PLASMA_Complex32_t *SLbdl;
112:     PLASMA_Complex64_t *work;
113:     PLASMA_Complex32_t *swork;
114:     double *rwork;
115:     PLASMA_Complex64_t *Abdl;
116:     PLASMA_Complex64_t *Xbdl;
117:     PLASMA_Complex64_t *Lbdl;
118:     plasma_context_t *plasma;
119: 
120:     const int itermax = 30;
121:     const double bwdmax = 1.0;
122:     const PLASMA_Complex64_t negone = -1.0;
123:     const PLASMA_Complex64_t one = 1.0;
124: 
125:     char norm='I';
126:     char all='A';
127: 
128:     int info;
129:     int i, iiter, ptsa, ptsx;
130:     double Anorm, cte, eps, Rnorm, Xnorm;
131: 
132:     *ITER=0;
133: 
134:     plasma = plasma_context_self();
135: 
136:     if (plasma == NULL) {
137:         plasma_fatal_error("PLASMA_dgesv_Tile", "PLASMA not initialized");
138:         return PLASMA_ERR_NOT_INITIALIZED;
139:     }
140:     /* Check descriptors for correctness */
141:     if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
142:         plasma_error("PLASMA_dgesv_Tile", "invalid first descriptor");
143:         return PLASMA_ERR_ILLEGAL_VALUE;
144:     }
145:     if (plasma_desc_check(&descL) != PLASMA_SUCCESS) {
146:         plasma_error("PLASMA_dgesv_Tile", "invalid second descriptor");
147:         return PLASMA_ERR_ILLEGAL_VALUE;
148:     }
149:     if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
150:         plasma_error("PLASMA_dgesv_Tile", "invalid third descriptor");
151:         return PLASMA_ERR_ILLEGAL_VALUE;
152:     }
153:     if (plasma_desc_check(&descX) != PLASMA_SUCCESS) {
154:         plasma_error("PLASMA_dgesv_Tile", "invalid fourth descriptor");
155:         return PLASMA_ERR_ILLEGAL_VALUE;
156:     }
157: 
158:     /* Set NT & NTRHS */
159:     NB = PLASMA_NB;
160:     NT = (N%NB==0) ? (N/NB) : (N/NB+1);
161:     NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
162: 
163:     /* Allocate memory for working arrays */
164:     work = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, N*NRHS, PlasmaComplexDouble);
165:     swork = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, N*(N+NRHS), PlasmaComplexFloat);
166:     rwork = (double *)plasma_shared_alloc(plasma, N, PlasmaRealDouble);
167:     /* Allocate memory for matrices in Lapack layout */
168:     PLASMA_Complex64_t *A = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaComplexDouble);
169:     PLASMA_Complex64_t *L = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexDouble);
170:     PLASMA_Complex64_t *B = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexDouble);
171:     PLASMA_Complex64_t *X = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexDouble);
172:     /* Allocate memory for matrices in block layout */
173:     SAbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaComplexFloat);
174:     SLbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexFloat);
175:     SXbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexFloat);
176:     if (work == NULL || swork == NULL || rwork == NULL || SAbdl == NULL || SLbdl == NULL || SXbdl == NULL || A == NULL || L == NULL || B == NULL || X == NULL) {
177:         plasma_error("PLASMA_zcgesv", "plasma_shared_alloc() failed");
178:         plasma_shared_free(plasma, work);
179:         plasma_shared_free(plasma, swork);
180:         plasma_shared_free(plasma, rwork);
181:         plasma_shared_free(plasma, A);
182:         plasma_shared_free(plasma, L);
183:         plasma_shared_free(plasma, B);
184:         plasma_shared_free(plasma, X);
185:         plasma_shared_free(plasma, SAbdl);
186:         plasma_shared_free(plasma, SLbdl);
187:         plasma_shared_free(plasma, SXbdl);
188:         return PLASMA_ERR_OUT_OF_RESOURCES;
189:     }
190: 
191:     plasma_parallel_call_3(plasma_tile_to_lapack,
192:         PLASMA_desc, descA,
193:         PLASMA_Complex64_t*, A,
194:         int, LDA);
195: 
196:     plasma_parallel_call_3(plasma_tile_to_lapack,
197:         PLASMA_desc, descB,
198:         PLASMA_Complex64_t*, B,
199:         int, LDB);
200: 
201:     /* Compute some constants */
202:     Anorm = zlange(&norm, &N, &N, A, &LDA, rwork);
203:     eps = dlamch("Epsilon");
204:     cte = Anorm*eps*sqrt((double) N)*bwdmax;
205: 
206:     /* Set the indices ptsa, ptsx for referencing sa and sx in swork. */
207: 
208:     ptsa = 0;
209:     ptsx = ptsa + N*N;
210: 
211:     /* Convert B from double precision to single precision and store
212:        the result in SX. */
213: 
214:     zlag2c(&N, &NRHS, B, &LDB, swork+ptsx, &N, &info);
215: 
216:     /* Convert A from double precision to single precision and store
217:        the result in SA. */
218: 
219:     zlag2c(&N, &N, A, &LDA, swork+ptsa, &N, &info);
220: 
221:     PLASMA_desc descSA = plasma_desc_init(
222:         SAbdl, PlasmaComplexFloat,
223:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
224:         N, N, 0, 0, N, N);
225: 
226:     PLASMA_desc descSX = plasma_desc_init(
227:         SXbdl, PlasmaComplexFloat,
228:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
229:         N, NRHS, 0, 0, N, NRHS);
230: 
231:     PLASMA_desc descSL = plasma_desc_init(
232:         SLbdl, PlasmaComplexFloat,
233:         PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
234:         N, N, 0, 0, N, N);
235: 
236:     plasma_parallel_call_3(plasma_lapack_to_tile,
237:         PLASMA_Complex32_t*, swork+ptsa,
238:         int, N,
239:         PLASMA_desc, descSA);
240: 
241:     plasma_parallel_call_3(plasma_lapack_to_tile,
242:         PLASMA_Complex32_t*, swork+ptsx,
243:         int, N,
244:         PLASMA_desc, descSX);
245: 
246:     /* Clear IPIV and Lbdl */
247:     plasma_memzero(IPIV, NT*NT*PLASMA_NB, PlasmaInteger);
248:     plasma_memzero(SLbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexFloat);
249: 
250:     /* Set INFO to ZERO */
251:     PLASMA_INFO = PLASMA_SUCCESS;
252: 
253:     /* Compute the LU factorization of SA */
254:     plasma_parallel_call_3(plasma_pcgetrf,
255:         PLASMA_desc, descSA,
256:         PLASMA_desc, descSL,
257:         int*, IPIV);
258: 
259:     if (PLASMA_INFO == PLASMA_SUCCESS)
260:     {
261:         /* Solve the system SA*SX = SB */
262: 
263:         /* Forward substitution */
264:         plasma_parallel_call_4(plasma_pctrsmpl,
265:             PLASMA_desc, descSA,
266:             PLASMA_desc, descSX,
267:             PLASMA_desc, descSL,
268:             int*, IPIV);
269: 
270:         /* Backward substitution */
271:         plasma_parallel_call_7(plasma_pctrsm,
272:             PLASMA_enum, PlasmaLeft,
273:             PLASMA_enum, PlasmaUpper,
274:             PLASMA_enum, PlasmaNoTrans,
275:             PLASMA_enum, PlasmaNonUnit,
276:             PLASMA_Complex32_t, 1.0,
277:             PLASMA_desc, descSA,
278:             PLASMA_desc, descSX);
279: 
280:         /* Come back to lapack layout for A */
281:         plasma_parallel_call_3(plasma_tile_to_lapack,
282:             PLASMA_desc, descSA,
283:             PLASMA_Complex32_t*, swork+ptsa,
284:             int, N);
285: 
286:         /* Come back to lapack layout for X */
287:         plasma_parallel_call_3(plasma_tile_to_lapack,
288:             PLASMA_desc, descSX,
289:             PLASMA_Complex32_t*, swork+ptsx,
290:             int, N);
291:     }
292: 
293:     /* Convert SX back to double precision */
294:     clag2z( &N, &NRHS, swork+ptsx, &N, X, &LDX, &info );
295: 
296:     /* Compute R = B - AX (R is WORK). */
297:     zlacpy( &all, &N, &NRHS, B, &LDB, work, &N);
298:     cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, NRHS, N, CBLAS_SADDR(negone), A, LDA, X, LDX, CBLAS_SADDR(one), work, N);
299: 
300:     /* Check whether the NRHS normwise backward errors satisfy the
301:        stopping criterion. If yes return. Note that ITER=0 (already set). */
302:     Xnorm = zlange(&norm, &N, &NRHS, X, &LDX, rwork);
303:     Rnorm = zlange(&norm, &N, &NRHS, work, &N, rwork);
304:     if (Rnorm < Xnorm * cte){
305:       /* The NRHS normwise backward errors satisfy the
306:          stopping criterion. We are good to exit. */
307:       plasma_parallel_call_3(plasma_lapack_to_tile,
308:           PLASMA_Complex64_t*, X,
309:           int, N,
310:           PLASMA_desc, descX);
311:       return PLASMA_INFO;;
312:     }
313: 
314:     for (iiter = 0; iiter < itermax; iiter++){
315: 
316:       /* Convert R (in WORK) from double precision to single precision
317:          and store the result in SX. */
318:       zlag2c( &N, &NRHS, work, &N, swork+ptsx, &N, &info );
319: 
320:       /* Set X to bdl layout again */
321:       plasma_parallel_call_3(plasma_lapack_to_tile,
322:         PLASMA_Complex32_t*, swork+ptsx,
323:         int, N,
324:         PLASMA_desc, descSX);
325: 
326:       /* Solve the system SA*SX = SB */
327: 
328:       /* Forward substitution */
329:       plasma_parallel_call_4(plasma_pctrsmpl,
330:             PLASMA_desc, descSA,
331:             PLASMA_desc, descSX,
332:             PLASMA_desc, descSL,
333:             int*, IPIV);
334: 
335:       /* Backward substitution */
336:       plasma_parallel_call_7(plasma_pctrsm,
337:             PLASMA_enum, PlasmaLeft,
338:             PLASMA_enum, PlasmaUpper,
339:             PLASMA_enum, PlasmaNoTrans,
340:             PLASMA_enum, PlasmaNonUnit,
341:             PLASMA_Complex32_t, 1.0,
342:             PLASMA_desc, descSA,
343:             PLASMA_desc, descSX);
344: 
345:       /* Come back to lapack layout for X */
346:       plasma_parallel_call_3(plasma_tile_to_lapack,
347:             PLASMA_desc, descSX,
348:             PLASMA_Complex32_t*, swork+ptsx,
349:             int, N);
350: 
351:       /* Convert SX back to double precision and update the current
352:          iterate. */
353:       clag2z( &N, &NRHS, swork+ptsx, &N, work, &N, &info );
354: 
355:       for (i = 0; i < NRHS; i++){
356:         cblas_zaxpy(N, CBLAS_SADDR(one), work+i*N, 1, X+i*LDX, 1);
357:       }
358: 
359:       /* Compute R = B - AX (R is WORK). */
360:       zlacpy( &all, &N, &NRHS, B, &LDB, work, &N);
361:       cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, NRHS, N, CBLAS_SADDR(negone), A, LDA, X, LDX, CBLAS_SADDR(one), work, N);
362: 
363:       /* Check whether the NRHS normwise backward errors satisfy the
364:          stopping criterion. If yes, set ITER=IITER>0 and return. */
365:       Xnorm = zlange(&norm, &N, &NRHS, X, &LDX, rwork);
366:       Rnorm = zlange(&norm, &N, &NRHS, work, &N, rwork);
367:       if (Rnorm < Xnorm * cte){
368:         /* The NRHS normwise backward errors satisfy the
369:            stopping criterion. We are good to exit. */
370:         *ITER = iiter;
371:         plasma_parallel_call_3(plasma_lapack_to_tile,
372:             PLASMA_Complex64_t*, X,
373:             int, N,
374:             PLASMA_desc, descX);
375:         plasma_shared_free(plasma, SAbdl);
376:         plasma_shared_free(plasma, SLbdl);
377:         plasma_shared_free(plasma, SXbdl);
378:         plasma_shared_free(plasma, work);
379:         plasma_shared_free(plasma, swork);
380:         plasma_shared_free(plasma, rwork);
381:         return PLASMA_INFO;
382:       }
383: 
384:     }
385: 
386:     /* We have performed ITER=itermax iterations and never satisified
387:        the stopping criterion, set up the ITER flag accordingly and
388:        follow up on double precision routine. */
389: 
390:     *ITER = -itermax - 1;
391: 
392:     plasma_shared_free(plasma, SAbdl);
393:     plasma_shared_free(plasma, SLbdl);
394:     plasma_shared_free(plasma, SXbdl);
395:     plasma_shared_free(plasma, work);
396:     plasma_shared_free(plasma, swork);
397:     plasma_shared_free(plasma, rwork);
398: 
399:     /* Single-precision iterative refinement failed to converge to a
400:        satisfactory solution, so we resort to double precision. */
401: 
402:     /* Allocate memory for matrices in block layout */
403:     Abdl = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaComplexDouble);
404:     Lbdl = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexDouble);
405:     Xbdl = (PLASMA_Complex64_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexDouble);
406:     if (Abdl == NULL || Lbdl == NULL || Xbdl == NULL) {
407:         plasma_error("PLASMA_zcgesv", "plasma_shared_alloc() failed");
408:         plasma_shared_free(plasma, Abdl);
409:         plasma_shared_free(plasma, Lbdl);
410:         plasma_shared_free(plasma, Xbdl);
411:         return PLASMA_ERR_OUT_OF_RESOURCES;
412:     }
413: 
414:     zlacpy( &all, &N, &NRHS, B, &LDB, X, &LDX );
415: 
416:     descA = plasma_desc_init(
417:         Abdl, PlasmaComplexDouble,
418:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
419:         N, N, 0, 0, N, N);
420: 
421:     descX = plasma_desc_init(
422:         Xbdl, PlasmaComplexDouble,
423:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
424:         N, NRHS, 0, 0, N, NRHS);
425: 
426:     descL = plasma_desc_init(
427:         Lbdl, PlasmaComplexDouble,
428:         PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
429:         N, N, 0, 0, N, N);
430: 
431:     plasma_parallel_call_3(plasma_lapack_to_tile,
432:         PLASMA_Complex64_t*, A,
433:         int, LDA,
434:         PLASMA_desc, descA);
435: 
436:     plasma_parallel_call_3(plasma_lapack_to_tile,
437:         PLASMA_Complex64_t*, X,
438:         int, LDX,
439:         PLASMA_desc, descX);
440: 
441:     /* Clear IPIV and Lbdl */
442:     plasma_memzero(IPIV, NT*NT*PLASMA_NB, PlasmaInteger);
443:     plasma_memzero(Lbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexDouble);
444: 
445:     /* Set INFO to ZERO */
446:     PLASMA_INFO = PLASMA_SUCCESS;
447: 
448:     plasma_parallel_call_3(plasma_pzgetrf,
449:         PLASMA_desc, descA,
450:         PLASMA_desc, descL,
451:         int*, IPIV);
452: 
453:     /* Return L to the user */
454:     plasma_memcpy(L, Lbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexDouble);
455: 
456:     if (PLASMA_INFO == PLASMA_SUCCESS)
457:     {
458:         plasma_parallel_call_4(plasma_pztrsmpl,
459:             PLASMA_desc, descA,
460:             PLASMA_desc, descX,
461:             PLASMA_desc, descL,
462:             int*, IPIV);
463: 
464:         plasma_parallel_call_7(plasma_pztrsm,
465:             PLASMA_enum, PlasmaLeft,
466:             PLASMA_enum, PlasmaUpper,
467:             PLASMA_enum, PlasmaNoTrans,
468:             PLASMA_enum, PlasmaNonUnit,
469:             PLASMA_Complex64_t, 1.0,
470:             PLASMA_desc, descA,
471:             PLASMA_desc, descX);
472: 
473:         plasma_parallel_call_3(plasma_tile_to_lapack,
474:             PLASMA_desc, descA,
475:             PLASMA_Complex64_t*, A,
476:             int, LDA);
477: 
478:         plasma_parallel_call_3(plasma_tile_to_lapack,
479:             PLASMA_desc, descX,
480:             PLASMA_Complex64_t*, X,
481:             int, LDX);
482:     }
483: 
484:     return PLASMA_INFO;
485: 
486: }