001: /* ///////////////////////////// P /// L /// A /// S /// M /// A /////////////////////////////// */
002: /* ///                    PLASMA computational routines (version 2.1.0)                      ///
003:  * ///                    Author: Jakub Kurzak                                               ///
004:  * ///                    Release Date: November, 15th 2009                                  ///
005:  * ///                    PLASMA is a software package provided by Univ. of Tennessee,       ///
006:  * ///                    Univ. of California Berkeley and Univ. of Colorado Denver          /// */
007: /* ///////////////////////////////////////////////////////////////////////////////////////////// */
008: #include "common.h"
009: 
010: /* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
011: // PLASMA_dgels - solves overdetermined or underdetermined linear systems involving an M-by-N
012: // matrix A using the QR or the LQ factorization of A.  It is assumed that A has full rank.
013: // The following options are provided:
014: //
015: // 1. trans = PlasmaNoTrans and M >= N: find the least squares solution of an overdetermined
016: //    system, i.e., solve the least squares problem: minimize || B - A*X ||.
017: //
018: // 2. trans = PlasmaNoTrans and M < N:  find the minimum norm solution of an underdetermined
019: //    system A * X = B.
020: //
021: // Several right hand side vectors B and solution vectors X can be handled in a single call;
022: // they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS
023: // solution
024: // matrix X.
025: 
026: /* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
027: // trans    PLASMA_enum (IN)
028: //          Intended usage:
029: //          = PlasmaNoTrans:   the linear system involves A;
030: //          = PlasmaTrans: the linear system involves A**T.
031: //          Currently only PlasmaNoTrans is supported.
032: //
033: // M        int (IN)
034: //          The number of rows of the matrix A. M >= 0.
035: //
036: // N        int (IN)
037: //          The number of columns of the matrix A. N >= 0.
038: //
039: // NRHS     int (IN)
040: //          The number of right hand sides, i.e., the number of columns of the matrices B and X.
041: //          NRHS >= 0.
042: //
043: // A        double* (INOUT)
044: //          On entry, the M-by-N matrix A.
045: //          On exit,
046: //          if M >= N, A is overwritten by details of its QR factorization as returned by
047: //                     PLASMA_dgeqrf;
048: //          if M < N, A is overwritten by details of its LQ factorization as returned by
049: //                      PLASMA_dgelqf.
050: //
051: // LDA      int (IN)
052: //          The leading dimension of the array A. LDA >= max(1,M).
053: //
054: // T        double* (OUT)
055: //          On exit, auxiliary factorization data.
056: //
057: // B        double* (INOUT)
058: //          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
059: //          On exit, if return value = 0, B is overwritten by the solution vectors, stored
060: //          columnwise:
061: //          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
062: //          sum of squares for the solution in each column is given by the sum of squares of the
063: //          modulus of elements N+1 to M in that column;
064: //          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
065: //
066: // LDB      int (IN)
067: //          The leading dimension of the array B. LDB >= MAX(1,M,N).
068: 
069: /* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
070: //          = 0: successful exit
071: //          < 0: if -i, the i-th argument had an illegal value
072: 
073: /* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
074: int PLASMA_dgels(PLASMA_enum trans, int M, int N, int NRHS, double *A, int LDA,
075:                  double *T, double *B, int LDB)
076: {
077:     int i, j;
078:     int NB, MT, NT, NTRHS;
079:     int status;
080:     double *Abdl;
081:     double *Bbdl;
082:     double *Tbdl;
083:     plasma_context_t *plasma;
084:     PLASMA_desc descA;
085:     PLASMA_desc descB;
086:     PLASMA_desc descT;
087: 
088:     plasma = plasma_context_self();
089:     if (plasma == NULL) {
090:         plasma_fatal_error("PLASMA_dgels", "PLASMA not initialized");
091:         return PLASMA_ERR_NOT_INITIALIZED;
092:     }
093:     /* Check input arguments */
094:     if (trans != PlasmaNoTrans) {
095:         plasma_error("PLASMA_dgels", "only PlasmaNoTrans supported");
096:         return PLASMA_ERR_NOT_SUPPORTED;
097:     }
098:     if (M < 0) {
099:         plasma_error("PLASMA_dgels", "illegal value of M");
100:         return -2;
101:     }
102:     if (N < 0) {
103:         plasma_error("PLASMA_dgels", "illegal value of N");
104:         return -3;
105:     }
106:     if (NRHS < 0) {
107:         plasma_error("PLASMA_dgels", "illegal value of NRHS");
108:         return -4;
109:     }
110:     if (LDA < max(1, M)) {
111:         plasma_error("PLASMA_dgels", "illegal value of LDA");
112:         return -6;
113:     }
114:     if (LDB < max(1, max(M, N))) {
115:         plasma_error("PLASMA_dgels", "illegal value of LDB");
116:         return -9;
117:     }
118:     /* Quick return */
119:     if (min(M, min(N, NRHS)) == 0) {
120:         for (i = 0; i < max(M, N); i++)
121:             for (j = 0; j < NRHS; j++)
122:                 B[j*LDB+i] = 0.0;
123:         return PLASMA_SUCCESS;
124:     }
125: 
126:     /* Tune NB & IB depending on M, N & NRHS; Set NBNBSIZE */
127:     status = plasma_tune(PLASMA_FUNC_DGELS, M, N, NRHS);
128:     if (status != PLASMA_SUCCESS) {
129:         plasma_error("PLASMA_dgels", "plasma_tune() failed");
130:         return status;
131:     }
132: 
133:     /* Set MT, NT & NTRHS */
134:     NB = PLASMA_NB;
135:     NT = (N%NB==0) ? (N/NB) : (N/NB+1);
136:     MT = (M%NB==0) ? (M/NB) : (M/NB+1);
137:     NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
138: 
139:     /* Allocate memory for matrices in block layout */
140:     Abdl = (double *)plasma_shared_alloc(plasma, MT*NT*PLASMA_NBNBSIZE, PlasmaRealDouble);
141:     Tbdl = (double *)plasma_shared_alloc(plasma, MT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);
142:     Bbdl = (double *)plasma_shared_alloc(plasma, max(MT, NT)*NTRHS*PLASMA_NBNBSIZE, PlasmaRealDouble);
143:     if (Abdl == NULL || Tbdl == NULL || Bbdl == NULL) {
144:         plasma_error("PLASMA_dgels", "plasma_shared_alloc() failed");
145:         plasma_shared_free(plasma, Abdl);
146:         plasma_shared_free(plasma, Tbdl);
147:         plasma_shared_free(plasma, Bbdl);
148:         return PLASMA_ERR_OUT_OF_RESOURCES;
149:     }
150: 
151:     if (M >= N) {
152:         descA = plasma_desc_init(
153:             Abdl, PlasmaRealDouble,
154:             PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
155:             M, N, 0, 0, M, N);
156: 
157:         descB = plasma_desc_init(
158:             Bbdl, PlasmaRealDouble,
159:             PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
160:             M, NRHS, 0, 0, M, NRHS);
161: 
162:         descT = plasma_desc_init(
163:             Tbdl, PlasmaRealDouble,
164:             PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
165:             M, N, 0, 0, M, N);
166: 
167:         plasma_parallel_call_3(plasma_lapack_to_tile,
168:             double*, A,
169:             int, LDA,
170:             PLASMA_desc, descA);
171: 
172:         plasma_parallel_call_3(plasma_lapack_to_tile,
173:             double*, B,
174:             int, LDB,
175:             PLASMA_desc, descB);
176:     }
177:     else {
178:         descA = plasma_desc_init(
179:             Abdl, PlasmaRealDouble,
180:             PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
181:             M, N, 0, 0, M, N);
182: 
183:         descB = plasma_desc_init(
184:             Bbdl, PlasmaRealDouble,
185:             PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
186:             N, NRHS, 0, 0, N, NRHS);
187: 
188:         descT = plasma_desc_init(
189:             Tbdl, PlasmaRealDouble,
190:             PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
191:             M, N, 0, 0, M, N);
192: 
193:         plasma_parallel_call_3(plasma_lapack_to_tile,
194:             double*, A,
195:             int, LDA,
196:             PLASMA_desc, descA);
197: 
198:         plasma_parallel_call_3(plasma_lapack_to_tile,
199:             double*, B,
200:             int, LDB,
201:             PLASMA_desc, descB);
202:     }
203: 
204:     /* Call the native interface */
205:     status = PLASMA_dgels_Tile(PlasmaNoTrans, &descA, &descT, &descB);
206: 
207:     if (status == PLASMA_SUCCESS) {
208:         if (M >= N) {
209:             /* Return T to the user */
210:             plasma_memcpy(T, Tbdl, MT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);
211: 
212:             plasma_parallel_call_3(plasma_tile_to_lapack,
213:                 PLASMA_desc, descA,
214:                 double*, A,
215:                 int, LDA);
216: 
217:             plasma_parallel_call_3(plasma_tile_to_lapack,
218:                 PLASMA_desc, descB,
219:                 double*, B,
220:                 int, LDB);
221:         }
222:         else {
223:             /* Return T to the user */
224:             plasma_memcpy(T, Tbdl, MT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);
225: 
226:             plasma_parallel_call_3(plasma_tile_to_lapack,
227:                 PLASMA_desc, descA,
228:                 double*, A,
229:                 int, LDA);
230: 
231:             plasma_parallel_call_3(plasma_tile_to_lapack,
232:                 PLASMA_desc, descB,
233:                 double*, B,
234:                 int, LDB);
235:         }
236:     }
237:     plasma_shared_free(plasma, Abdl);
238:     plasma_shared_free(plasma, Tbdl);
239:     plasma_shared_free(plasma, Bbdl);
240:     return status;
241: }
242: 
243: /* /////////////////////////// P /// U /// R /// P /// O /// S /// E /////////////////////////// */
244: // PLASMA_dgels_Tile - solves overdetermined or underdetermined linear systems involving an M-by-N
245: // matrix A using the QR or the LQ factorization of A.  It is assumed that A has full rank.
246: // The following options are provided:
247: //
248: // 1. trans = PlasmaNoTrans and M >= N: find the least squares solution of an overdetermined
249: //    system, i.e., solve the least squares problem: minimize || B - A*X ||.
250: //
251: // 2. trans = PlasmaNoTrans and M < N:  find the minimum norm solution of an underdetermined
252: //    system A * X = B.
253: //
254: // Several right hand side vectors B and solution vectors X can be handled in a single call;
255: // they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS
256: // solution
257: // matrix X.
258: // All matrices are passed through descriptors. All dimensions are taken from the descriptors.
259: 
260: /* ///////////////////// A /// R /// G /// U /// M /// E /// N /// T /// S ///////////////////// */
261: // trans    PLASMA_enum (IN)
262: //          Intended usage:
263: //          = PlasmaNoTrans:   the linear system involves A;
264: //          = PlasmaTrans: the linear system involves A**T.
265: //          Currently only PlasmaNoTrans is supported.
266: //
267: // A        double* (INOUT)
268: //          On entry, the M-by-N matrix A.
269: //          On exit,
270: //          if M >= N, A is overwritten by details of its QR factorization as returned by
271: //                     PLASMA_dgeqrf;
272: //          if M < N, A is overwritten by details of its LQ factorization as returned by
273: //                      PLASMA_dgelqf.
274: //
275: // T        double* (OUT)
276: //          On exit, auxiliary factorization data.
277: //
278: // B        double* (INOUT)
279: //          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
280: //          On exit, if return value = 0, B is overwritten by the solution vectors, stored
281: //          columnwise:
282: //          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
283: //          sum of squares for the solution in each column is given by the sum of squares of the
284: //          modulus of elements N+1 to M in that column;
285: //          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
286: 
287: /* ///////////// R /// E /// T /// U /// R /// N /////// V /// A /// L /// U /// E ///////////// */
288: //          = 0: successful exit
289: 
290: /* //////////////////////////////////// C /// O /// D /// E //////////////////////////////////// */
291: int PLASMA_dgels_Tile(PLASMA_enum trans, PLASMA_desc *A, PLASMA_desc *T, PLASMA_desc *B)
292: {
293:     PLASMA_desc descA = *A;
294:     PLASMA_desc descT = *T;
295:     PLASMA_desc descB = *B;
296:     plasma_context_t *plasma;
297: 
298:     plasma = plasma_context_self();
299:     if (plasma == NULL) {
300:         plasma_fatal_error("PLASMA_dgels_Tile", "PLASMA not initialized");
301:         return PLASMA_ERR_NOT_INITIALIZED;
302:     }
303:     /* Check descriptors for correctness */
304:     if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
305:         plasma_error("PLASMA_dgels_Tile", "invalid first descriptor");
306:         return PLASMA_ERR_ILLEGAL_VALUE;
307:     }
308:     if (plasma_desc_check(&descT) != PLASMA_SUCCESS) {
309:         plasma_error("PLASMA_dgels_Tile", "invalid second descriptor");
310:         return PLASMA_ERR_ILLEGAL_VALUE;
311:     }
312:     if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
313:         plasma_error("PLASMA_dgels_Tile", "invalid third descriptor");
314:         return PLASMA_ERR_ILLEGAL_VALUE;
315:     }
316:     /* Check input arguments */
317:     if (descA.nb != descA.mb || descB.nb != descB.mb) {
318:         plasma_error("PLASMA_dgels_Tile", "only square tiles supported");
319:         return PLASMA_ERR_ILLEGAL_VALUE;
320:     }
321:     if (trans != PlasmaNoTrans) {
322:         plasma_error("PLASMA_dgels_Tile", "only PlasmaNoTrans supported");
323:         return PLASMA_ERR_NOT_SUPPORTED;
324:     }
325:     /* Quick return  - currently NOT equivalent to LAPACK's:
326:     if (min(M, min(N, NRHS)) == 0) {
327:         for (i = 0; i < max(M, N); i++)
328:             for (j = 0; j < NRHS; j++)
329:                 B[j*LDB+i] = 0.0;
330:         return PLASMA_SUCCESS;
331:     }
332: */
333:     if (descA.m >= descA.n) {
334:         plasma_parallel_call_2(plasma_pdgeqrf,
335:             PLASMA_desc, descA,
336:             PLASMA_desc, descT);
337: 
338:         plasma_parallel_call_3(plasma_pdormqr,
339:             PLASMA_desc, descA,
340:             PLASMA_desc, descB,
341:             PLASMA_desc, descT);
342: 
343:         plasma_parallel_call_7(plasma_pdtrsm,
344:             PLASMA_enum, PlasmaLeft,
345:             PLASMA_enum, PlasmaUpper,
346:             PLASMA_enum, PlasmaNoTrans,
347:             PLASMA_enum, PlasmaNonUnit,
348:             double, 1.0,
349:             PLASMA_desc, plasma_desc_submatrix(descA, 0, 0, descA.n, descA.n),
350:             PLASMA_desc, plasma_desc_submatrix(descB, 0, 0, descA.n, descB.n));
351:     }
352:     else {
353:         plasma_parallel_call_1(plasma_tile_zero,
354:             PLASMA_desc, plasma_desc_submatrix(descB, descA.m, 0, descA.n-descA.m, descB.n));
355: 
356:         plasma_parallel_call_2(plasma_pdgelqf,
357:             PLASMA_desc, descA,
358:             PLASMA_desc, descT);
359: 
360:         plasma_parallel_call_7(plasma_pdtrsm,
361:             PLASMA_enum, PlasmaLeft,
362:             PLASMA_enum, PlasmaLower,
363:             PLASMA_enum, PlasmaNoTrans,
364:             PLASMA_enum, PlasmaNonUnit,
365:             double, 1.0,
366:             PLASMA_desc, plasma_desc_submatrix(descA, 0, 0, descA.m, descA.m),
367:             PLASMA_desc, plasma_desc_submatrix(descB, 0, 0, descA.m, descB.n));
368: 
369:         plasma_parallel_call_3(plasma_pdormlq,
370:             PLASMA_desc, descA,
371:             PLASMA_desc, descB,
372:             PLASMA_desc, descT);
373:     }
374:     return PLASMA_SUCCESS;
375: }
376: