PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
dsungesv.c
Go to the documentation of this file.
1 
15 #include <stdlib.h>
16 #include <stdio.h>
17 #include <math.h>
18 #include <lapacke.h>
19 #include "common.h"
20 
21 #define PLASMA_dlag2s(_descA, _descSB) \
22  plasma_parallel_call_4(plasma_pdlag2s, \
23  PLASMA_desc, (_descA), \
24  PLASMA_desc, (_descSB), \
25  PLASMA_sequence*, sequence, \
26  PLASMA_request*, request)
27 
28 #define PLASMA_slag2d(_descSA, _descB) \
29  plasma_parallel_call_4(plasma_pslag2d, \
30  PLASMA_desc, (_descSA), \
31  PLASMA_desc, (_descB), \
32  PLASMA_sequence*, sequence, \
33  PLASMA_request*, request)
34 
35 #define PLASMA_dlange(_norm, _descA, _result, _work) \
36  _result = 0; \
37  plasma_parallel_call_6(plasma_pdlange, \
38  PLASMA_enum, (_norm), \
39  PLASMA_desc, (_descA), \
40  double*, (_work), \
41  double*, &(_result), \
42  PLASMA_sequence*, sequence, \
43  PLASMA_request*, request);
44 
45 #define PLASMA_dlacpy(_descA, _descB) \
46  plasma_parallel_call_5(plasma_pdlacpy, \
47  PLASMA_enum, PlasmaUpperLower, \
48  PLASMA_desc, (_descA), \
49  PLASMA_desc, (_descB), \
50  PLASMA_sequence*, sequence, \
51  PLASMA_request*, request)
52 
53 #define PLASMA_dgeadd(_alpha, _descA, _descB) \
54  plasma_parallel_call_5(plasma_pdgeadd, \
55  double, (_alpha), \
56  PLASMA_desc, (_descA), \
57  PLASMA_desc, (_descB), \
58  PLASMA_sequence*, sequence, \
59  PLASMA_request*, request)
60 
61 /***************************************************************************/
163 int PLASMA_dsungesv(PLASMA_enum trans, int N, int NRHS,
164  double *A, int LDA,
165  double *B, int LDB,
166  double *X, int LDX, int *ITER)
167 {
168  int NB;
169  int status;
170  PLASMA_desc descA;
171  PLASMA_desc descB;
172  PLASMA_desc *descT;
173  PLASMA_desc descX;
175  PLASMA_sequence *sequence = NULL;
177 
178  plasma = plasma_context_self();
179  if (plasma == NULL) {
180  plasma_fatal_error("PLASMA_dsungesv", "PLASMA not initialized");
182  }
183  /* Check input arguments */
184  if (trans != PlasmaNoTrans) {
185  plasma_error("PLASMA_dsungesv", "only PlasmaNoTrans supported");
187  }
188  if (N < 0) {
189  plasma_error("PLASMA_dsungesv", "illegal value of N");
190  return -2;
191  }
192  if (NRHS < 0) {
193  plasma_error("PLASMA_dsungesv", "illegal value of NRHS");
194  return -3;
195  }
196  if (LDA < max(1, N)) {
197  plasma_error("PLASMA_dsungesv", "illegal value of LDA");
198  return -5;
199  }
200  if (LDB < max(1, N)) {
201  plasma_error("PLASMA_dsungesv", "illegal value of LDB");
202  return -8;
203  }
204  if (LDX < max(1, N)) {
205  plasma_error("PLASMA_dsungesv", "illegal value of LDX");
206  return -9;
207  }
208 
209  /* Quick return */
210  if ( N == 0 )
211  return PLASMA_SUCCESS;
212 
213  /* Tune NB & IB depending on M, N & NRHS; Set NBNB */
214  status = plasma_tune(PLASMA_FUNC_DSGELS, N, N, NRHS);
215  if (status != PLASMA_SUCCESS) {
216  plasma_error("PLASMA_dsungesv", "plasma_tune() failed");
217  return status;
218  }
219 
220  NB = PLASMA_NB;
221 
222  plasma_sequence_create(plasma, &sequence);
223 
224  /* DOUBLE PRECISION INITIALIZATION */
226  plasma_dooplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N , plasma_desc_mat_free(&(descA)) );
227  plasma_dooplap2tile( descB, B, NB, NB, LDB, NRHS, 0, 0, N, NRHS, plasma_desc_mat_free(&(descA)); plasma_desc_mat_free(&(descB)) );
228  plasma_ddesc_alloc( descX, NB, NB, N, NRHS, 0, 0, N, NRHS, plasma_desc_mat_free(&(descA)); plasma_desc_mat_free(&(descB)); plasma_desc_mat_free(&(descX)) );
229  } else {
230  plasma_diplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N );
231  plasma_diplap2tile( descB, B, NB, NB, LDB, NRHS, 0, 0, N, NRHS);
232 
233  descX = plasma_desc_init(
234  PlasmaRealDouble, NB, NB, (NB*NB),
235  LDX, NRHS, 0, 0, N, NRHS);
236  descX.mat = X;
237  }
238 
239  /* Allocate workspace */
240  PLASMA_Alloc_Workspace_dgels_Tile(N, N, &descT);
241 
242  /* Call the native interface */
243  status = PLASMA_dsungesv_Tile_Async(PlasmaNoTrans, &descA, descT, &descB, &descX, ITER,
244  sequence, &request);
245 
246  if (status == PLASMA_SUCCESS) {
248  plasma_dooptile2lap( descX, X, NB, NB, LDX, NRHS );
250  plasma_desc_mat_free(&descA);
251  plasma_desc_mat_free(&descB);
252  plasma_desc_mat_free(&descX);
253  } else {
254  plasma_diptile2lap( descA, A, NB, NB, LDA, N );
255  plasma_diptile2lap( descB, B, NB, NB, LDB, NRHS );
256  plasma_diptile2lap( descX, X, NB, NB, LDX, NRHS );
258  }
259  }
260 
262  plasma_sequence_destroy(plasma, sequence);
263  return status;
264 }
265 
266 /***************************************************************************/
329  PLASMA_desc *B, PLASMA_desc *X, int *ITER)
330 {
332  PLASMA_sequence *sequence = NULL;
334  int status;
335 
336  plasma = plasma_context_self();
337  if (plasma == NULL) {
338  plasma_fatal_error("PLASMA_dsungesv_Tile", "PLASMA not initialized");
340  }
341  plasma_sequence_create(plasma, &sequence);
342  status = PLASMA_dsungesv_Tile_Async(trans, A, T, B, X, ITER, sequence, &request);
343  if (status != PLASMA_SUCCESS)
344  return status;
346  status = sequence->status;
347  plasma_sequence_destroy(plasma, sequence);
348  return status;
349 }
350 
351 /***************************************************************************/
379  PLASMA_desc *B, PLASMA_desc *X, int *ITER,
380  PLASMA_sequence *sequence, PLASMA_request *request)
381 {
382  int N, NB, IB;
383  PLASMA_desc descA = *A;
384  PLASMA_desc descT = *T;
385  PLASMA_desc descB = *B;
386  PLASMA_desc descX = *X;
387  PLASMA_desc descR, descSA, descST, descSX;
389  double *work;
390 
391  const int itermax = 30;
392  const double bwdmax = 1.0;
393  const double negone = -1.0;
394  const double one = 1.0;
395  int iiter;
396  double Anorm, cte, eps, Rnorm, Xnorm;
397  *ITER=0;
398 
399  plasma = plasma_context_self();
400  if (plasma == NULL) {
401  plasma_fatal_error("PLASMA_dsungesv_Tile", "PLASMA not initialized");
403  }
404  if (sequence == NULL) {
405  plasma_fatal_error("PLASMA_dsungesv_Tile", "NULL sequence");
406  return PLASMA_ERR_UNALLOCATED;
407  }
408  if (request == NULL) {
409  plasma_fatal_error("PLASMA_dsungesv_Tile", "NULL request");
410  return PLASMA_ERR_UNALLOCATED;
411  }
412  /* Check sequence status */
413  if (sequence->status == PLASMA_SUCCESS)
414  request->status = PLASMA_SUCCESS;
415  else
416  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
417  /* Check descriptors for correctness */
418  if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
419  plasma_error("PLASMA_dsungesv_Tile", "invalid first descriptor");
421  }
422  if (plasma_desc_check(&descT) != PLASMA_SUCCESS) {
423  plasma_error("PLASMA_dsungesv_Tile", "invalid second descriptor");
425  }
426  if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
427  plasma_error("PLASMA_dsungesv_Tile", "invalid third descriptor");
429  }
430  if (plasma_desc_check(&descX) != PLASMA_SUCCESS) {
431  plasma_error("PLASMA_dsungesv_Tile", "invalid fourth descriptor");
433  }
434  /* Check input arguments */
435  if ( (descA.nb != descA.mb) || (descB.nb != descB.mb) || (descX.nb != descX.mb) ||
436  (descA.mb != descB.mb) || (descB.mb != descX.mb) ) {
437  plasma_error("PLASMA_dsungesv_Tile", "only square tiles of same size are supported");
439  }
440  if (trans != PlasmaNoTrans) {
441  plasma_error("PLASMA_dsungesv_Tile", "only PlasmaNoTrans supported");
443  }
444 
445  /* Set N, NRHS, NB */
446  N = descA.m;
447  NB = descA.nb;
448  IB = descT.mb;
449 
450  work = (double *)plasma_shared_alloc(plasma, PLASMA_SIZE, PlasmaRealDouble);
451  if (work == NULL) {
452  plasma_error("PLASMA_dsungesv", "plasma_shared_alloc() failed");
453  plasma_shared_free(plasma, work);
455  }
456 
457  plasma_ddesc_alloc( descR, NB, NB, descB.m, descB.n, 0, 0, descB.m, descB.n, plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR) );
458  plasma_sdesc_alloc( descSA, NB, NB, descA.m, descA.n, 0, 0, descA.m, descA.n, plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR); plasma_desc_mat_free(&descSA) );
459  plasma_sdesc_alloc( descST, IB, NB, descT.m, descT.n, 0, 0, descT.m, descT.n, plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR); plasma_desc_mat_free(&descSA); plasma_desc_mat_free(&descST) );
460  plasma_sdesc_alloc( descSX, NB, NB, descX.m, descX.n, 0, 0, descX.m, descX.n, plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR); plasma_desc_mat_free(&descSA); plasma_desc_mat_free(&descST); plasma_desc_mat_free(&descSX) );
461 
462  /* Compute some constants */
463  PLASMA_dlange(PlasmaInfNorm, descA, Anorm, work);
464  eps = LAPACKE_dlamch_work('e');
465 
466  /* Convert B from double precision to single precision and store
467  the result in SX. */
468  PLASMA_dlag2s(descB, descSX);
469  if (sequence->status != PLASMA_SUCCESS)
470  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
471 
472  /* Convert A from double precision to single precision and store
473  the result in SA. */
474  PLASMA_dlag2s(descA, descSA);
475  if (sequence->status != PLASMA_SUCCESS)
476  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
477 
478  /* Compute the QR factorization of SA */
480  PLASMA_desc, descSA,
481  PLASMA_desc, descST,
482  PLASMA_sequence*, sequence,
483  PLASMA_request*, request);
484 
485  /* Compute the solve in simple */
489  PLASMA_desc, descSA,
490  PLASMA_desc, descSX,
491  PLASMA_desc, descST,
492  PLASMA_sequence*, sequence,
493  PLASMA_request*, request);
494 
500  float, 1.0,
501  PLASMA_desc, descSA,
502  PLASMA_desc, descSX,
503  PLASMA_sequence*, sequence,
504  PLASMA_request*, request);
505 
506  /* Convert SX back to double precision */
507  PLASMA_slag2d(descSX, descX);
508 
509  /* Compute R = B - AX. */
510  PLASMA_dlacpy(descB, descR);
511 
515  double, negone,
516  PLASMA_desc, descA,
517  PLASMA_desc, descX,
518  double, one,
519  PLASMA_desc, descR,
520  PLASMA_sequence*, sequence,
521  PLASMA_request*, request);
522 
523  /* Check whether the NRHS normwise backward error satisfies the
524  stopping criterion. If yes return. Note that ITER=0 (already set). */
525  PLASMA_dlange(PlasmaInfNorm, descX, Xnorm, work);
526  PLASMA_dlange(PlasmaInfNorm, descR, Rnorm, work);
527 
528  /* Wait the end of Anorm, Xnorm and Bnorm computations */
530 
531  cte = Anorm*eps*((double) N)*bwdmax;
532  if (Rnorm < Xnorm * cte){
533  /* The NRHS normwise backward errors satisfy the
534  stopping criterion. We are good to exit. */
535  plasma_desc_mat_free(&descSA);
536  plasma_desc_mat_free(&descST);
537  plasma_desc_mat_free(&descSX);
538  plasma_desc_mat_free(&descR);
539  plasma_shared_free(plasma, work);
540  return PLASMA_SUCCESS;
541  }
542 
543  /* Iterative refinement */
544  for (iiter = 0; iiter < itermax; iiter++){
545 
546  /* Convert R from double precision to single precision
547  and store the result in SX. */
548  PLASMA_dlag2s(descR, descSX);
549 
553  PLASMA_desc, descSA,
554  PLASMA_desc, descSX,
555  PLASMA_desc, descST,
556  PLASMA_sequence*, sequence,
557  PLASMA_request*, request);
558 
564  float, (float)1.0,
565  PLASMA_desc, descSA,
566  PLASMA_desc, descSX,
567  PLASMA_sequence*, sequence,
568  PLASMA_request*, request);
569 
570  /* Convert SX back to double precision and update the current
571  iterate. */
572  PLASMA_slag2d(descSX, descR);
573  PLASMA_dgeadd(one, descR, descX);
574 
575  /* Compute R = B - AX. */
576  PLASMA_dlacpy(descB,descR);
580  double, negone,
581  PLASMA_desc, descA,
582  PLASMA_desc, descX,
583  double, one,
584  PLASMA_desc, descR,
585  PLASMA_sequence*, sequence,
586  PLASMA_request*, request);
587 
588  /* Check whether the NRHS normwise backward errors satisfy the
589  stopping criterion. If yes, set ITER=IITER>0 and return. */
590  PLASMA_dlange(PlasmaInfNorm, descX, Xnorm, work);
591  PLASMA_dlange(PlasmaInfNorm, descR, Rnorm, work);
592 
593  /* Wait the end of Xnorm and Bnorm computations */
595 
596  if (Rnorm < Xnorm * cte){
597  /* The NRHS normwise backward errors satisfy the
598  stopping criterion. We are good to exit. */
599  *ITER = iiter;
600 
601  plasma_desc_mat_free(&descSA);
602  plasma_desc_mat_free(&descST);
603  plasma_desc_mat_free(&descSX);
604  plasma_desc_mat_free(&descR);
605  plasma_shared_free(plasma, work);
606  return PLASMA_SUCCESS;
607  }
608  }
609 
610  /* We have performed ITER=itermax iterations and never satisified
611  the stopping criterion, set up the ITER flag accordingly and
612  follow up on double precision routine. */
613  *ITER = -itermax - 1;
614 
615  plasma_desc_mat_free(&descSA);
616  plasma_desc_mat_free(&descST);
617  plasma_desc_mat_free(&descSX);
618  plasma_desc_mat_free(&descR);
619  plasma_shared_free(plasma, work);
620 
621  /* Single-precision iterative refinement failed to converge to a
622  satisfactory solution, so we restart to double precision. */
623  PLASMA_dlacpy(descB, descX);
624 
626  PLASMA_desc, descA,
627  PLASMA_desc, descT,
628  PLASMA_sequence*, sequence,
629  PLASMA_request*, request);
630 
634  PLASMA_desc, descA,
635  PLASMA_desc, descX,
636  PLASMA_desc, descT,
637  PLASMA_sequence*, sequence,
638  PLASMA_request*, request);
639 
645  double, (double)1.0,
646  PLASMA_desc, descA,
647  PLASMA_desc, descX,
648  PLASMA_sequence*, sequence,
649  PLASMA_request*, request);
650 
651  return PLASMA_SUCCESS;
652 }