PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zcgesv.c
Go to the documentation of this file.
1 
16 #include <stdlib.h>
17 #include <math.h>
18 #include <lapacke.h>
19 #include "common.h"
20 
21 #define PLASMA_zlag2c(_descA, _descSB) \
22  plasma_parallel_call_4(plasma_pzlag2c, \
23  PLASMA_desc, (_descA), \
24  PLASMA_desc, (_descSB), \
25  PLASMA_sequence*, sequence, \
26  PLASMA_request*, request)
27 
28 #define PLASMA_clag2z(_descSA, _descB) \
29  plasma_parallel_call_4(plasma_pclag2z, \
30  PLASMA_desc, (_descSA), \
31  PLASMA_desc, (_descB), \
32  PLASMA_sequence*, sequence, \
33  PLASMA_request*, request)
34 
35 #define PLASMA_zlange(_norm, _descA, _result, _work) \
36  _result = 0; \
37  plasma_parallel_call_6(plasma_pzlange, \
38  PLASMA_enum, (_norm), \
39  PLASMA_desc, (_descA), \
40  double*, (_work), \
41  double*, &(_result), \
42  PLASMA_sequence*, sequence, \
43  PLASMA_request*, request);
44 
45 #define PLASMA_zlacpy(_descA, _descB) \
46  plasma_parallel_call_5(plasma_pzlacpy, \
47  PLASMA_enum, PlasmaUpperLower, \
48  PLASMA_desc, (_descA), \
49  PLASMA_desc, (_descB), \
50  PLASMA_sequence*, sequence, \
51  PLASMA_request*, request)
52 
53 #define PLASMA_zgeadd(_alpha, _descA, _descB) \
54  plasma_parallel_call_5(plasma_pzgeadd, \
55  PLASMA_Complex64_t, (_alpha), \
56  PLASMA_desc, (_descA), \
57  PLASMA_desc, (_descB), \
58  PLASMA_sequence*, sequence, \
59  PLASMA_request*, request)
60 
61 
62 #define PLASMA_zgetrs(_A, _IPIV, _X) \
63  { \
64  plasma_dynamic_call_3( \
65  plasma_pzbarrier_tl2pnl, \
66  PLASMA_desc, (_X), \
67  PLASMA_sequence*, sequence, \
68  PLASMA_request*, request); \
69  \
70  plasma_dynamic_call_5( \
71  plasma_pzlaswp, \
72  PLASMA_desc, (_X), \
73  int *, (_IPIV), \
74  int, 1, \
75  PLASMA_sequence*, sequence, \
76  PLASMA_request*, request); \
77  \
78  plasma_parallel_call_9( \
79  plasma_pztrsm, \
80  PLASMA_enum, PlasmaLeft, \
81  PLASMA_enum, PlasmaLower, \
82  PLASMA_enum, PlasmaNoTrans, \
83  PLASMA_enum, PlasmaUnit, \
84  PLASMA_Complex64_t, 1.0, \
85  PLASMA_desc, (_A), \
86  PLASMA_desc, (_X), \
87  PLASMA_sequence*, sequence, \
88  PLASMA_request*, request); \
89  \
90  plasma_parallel_call_9( \
91  plasma_pztrsm, \
92  PLASMA_enum, PlasmaLeft, \
93  PLASMA_enum, PlasmaUpper, \
94  PLASMA_enum, PlasmaNoTrans, \
95  PLASMA_enum, PlasmaNonUnit, \
96  PLASMA_Complex64_t, 1.0, \
97  PLASMA_desc, (_A), \
98  PLASMA_desc, (_X), \
99  PLASMA_sequence*, sequence, \
100  PLASMA_request*, request); \
101  }
102 
103 #define PLASMA_cgetrs(_A, _IPIV, _X) \
104  { \
105  plasma_dynamic_call_3( \
106  plasma_pcbarrier_tl2pnl, \
107  PLASMA_desc, (_X), \
108  PLASMA_sequence*, sequence, \
109  PLASMA_request*, request); \
110  \
111  plasma_dynamic_call_5( \
112  plasma_pclaswp, \
113  PLASMA_desc, (_X), \
114  int *, (_IPIV), \
115  int, 1, \
116  PLASMA_sequence*, sequence, \
117  PLASMA_request*, request); \
118  \
119  plasma_parallel_call_9( \
120  plasma_pctrsm, \
121  PLASMA_enum, PlasmaLeft, \
122  PLASMA_enum, PlasmaLower, \
123  PLASMA_enum, PlasmaNoTrans, \
124  PLASMA_enum, PlasmaUnit, \
125  PLASMA_Complex32_t, 1.0, \
126  PLASMA_desc, (_A), \
127  PLASMA_desc, (_X), \
128  PLASMA_sequence*, sequence, \
129  PLASMA_request*, request); \
130  \
131  plasma_parallel_call_9( \
132  plasma_pctrsm, \
133  PLASMA_enum, PlasmaLeft, \
134  PLASMA_enum, PlasmaUpper, \
135  PLASMA_enum, PlasmaNoTrans, \
136  PLASMA_enum, PlasmaNonUnit, \
137  PLASMA_Complex32_t, 1.0, \
138  PLASMA_desc, (_A), \
139  PLASMA_desc, (_X), \
140  PLASMA_sequence*, sequence, \
141  PLASMA_request*, request); \
142  }
143 
144 /***************************************************************************/
227 int PLASMA_zcgesv(int N, int NRHS,
228  PLASMA_Complex64_t *A, int LDA, int * IPIV,
229  PLASMA_Complex64_t *B, int LDB,
230  PLASMA_Complex64_t *X, int LDX, int *ITER)
231 {
232  int NB;
233  int status;
234  PLASMA_desc descA;
235  PLASMA_desc descB;
236  PLASMA_desc descX;
238  PLASMA_sequence *sequence = NULL;
240 
241  plasma = plasma_context_self();
242  if (plasma == NULL) {
243  plasma_fatal_error("PLASMA_zcgesv", "PLASMA not initialized");
245  }
246  /* Check input arguments */
247  if (N < 0) {
248  plasma_error("PLASMA_zcgesv", "illegal value of N");
249  return -1;
250  }
251  if (NRHS < 0) {
252  plasma_error("PLASMA_zcgesv", "illegal value of NRHS");
253  return -2;
254  }
255  if (LDA < max(1, N)) {
256  plasma_error("PLASMA_zcgesv", "illegal value of LDA");
257  return -4;
258  }
259  if (LDB < max(1, N)) {
260  plasma_error("PLASMA_zcgesv", "illegal value of LDB");
261  return -8;
262  }
263  if (LDX < max(1, N)) {
264  plasma_error("PLASMA_zcgesv", "illegal value of LDX");
265  return -10;
266  }
267  /* Quick return */
268  if (min(N, NRHS) == 0)
269  return PLASMA_SUCCESS;
270 
271  /* Tune NB & IB depending on M, N & NRHS; Set NBNB */
272  status = plasma_tune(PLASMA_FUNC_ZCGESV, N, N, NRHS);
273  if (status != PLASMA_SUCCESS) {
274  plasma_error("PLASMA_zcgesv", "plasma_tune() failed");
275  return status;
276  }
277 
278  NB = PLASMA_NB;
279 
280  plasma_sequence_create(plasma, &sequence);
281 
282  /* DOUBLE PRECISION INITIALIZATION */
284  plasma_zooplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N , plasma_desc_mat_free(&(descA)) );
285  plasma_zooplap2tile( descB, B, NB, NB, LDB, NRHS, 0, 0, N, NRHS, plasma_desc_mat_free(&(descA)); plasma_desc_mat_free(&(descB)) );
286  plasma_zdesc_alloc( descX, NB, NB, N, NRHS, 0, 0, N, NRHS, plasma_desc_mat_free(&(descA)); plasma_desc_mat_free(&(descB)); plasma_desc_mat_free(&(descX)) );
287  } else {
288  plasma_ziplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N );
289  plasma_ziplap2tile( descB, B, NB, NB, LDB, NRHS, 0, 0, N, NRHS);
290 
291  descX = plasma_desc_init(
292  PlasmaComplexDouble, NB, NB, (NB*NB),
293  LDX, NRHS, 0, 0, N, NRHS);
294  descX.mat = X;
295  }
296 
297  /* Call the native interface */
298  status = PLASMA_zcgesv_Tile_Async(&descA, IPIV, &descB, &descX, ITER, sequence, &request);
299 
300  if (status == PLASMA_SUCCESS) {
302  plasma_zooptile2lap( descX, X, NB, NB, LDX, NRHS );
304  plasma_desc_mat_free(&descA);
305  plasma_desc_mat_free(&descB);
306  plasma_desc_mat_free(&descX);
307  } else {
308  plasma_ziptile2lap( descA, A, NB, NB, LDA, N );
309  plasma_ziptile2lap( descB, B, NB, NB, LDB, NRHS );
310  plasma_ziptile2lap( descX, X, NB, NB, LDX, NRHS );
312  }
313  }
314 
315  status = sequence->status;
316  plasma_sequence_destroy(plasma, sequence);
317  return status;
318 }
319 
320 
321 /***************************************************************************/
375  PLASMA_desc *B, PLASMA_desc *X, int *ITER)
376 {
378  PLASMA_sequence *sequence = NULL;
380  int status;
381 
382  plasma = plasma_context_self();
383  if (plasma == NULL) {
384  plasma_fatal_error("PLASMA_zcgesv_Tile", "PLASMA not initialized");
386  }
387  plasma_sequence_create(plasma, &sequence);
388  status = PLASMA_zcgesv_Tile_Async(A, IPIV, B, X, ITER, sequence, &request);
389  if (status != PLASMA_SUCCESS)
390  return status;
392  status = sequence->status;
393  plasma_sequence_destroy(plasma, sequence);
394  return status;
395 }
396 
397 /***************************************************************************/
425  PLASMA_desc *B, PLASMA_desc *X, int *ITER,
426  PLASMA_sequence *sequence, PLASMA_request *request)
427 {
428  int N, NB;
429  PLASMA_desc descA = *A;
430  PLASMA_desc descB = *B;
431  PLASMA_desc descX = *X;
432  PLASMA_desc descR, descSA, descSX;
434  double *work;
435 
436  const int itermax = 30;
437  const double bwdmax = 1.0;
438  const PLASMA_Complex64_t negone = -1.0;
439  const PLASMA_Complex64_t one = 1.0;
440  int iiter;
441  double Anorm, cte, eps, Rnorm, Xnorm;
442  *ITER=0;
443 
444  plasma = plasma_context_self();
445  if (plasma == NULL) {
446  plasma_fatal_error("PLASMA_zcgesv_Tile", "PLASMA not initialized");
448  }
449  if (sequence == NULL) {
450  plasma_fatal_error("PLASMA_zcgesv_Tile", "NULL sequence");
451  return PLASMA_ERR_UNALLOCATED;
452  }
453  if (request == NULL) {
454  plasma_fatal_error("PLASMA_zcgesv_Tile", "NULL request");
455  return PLASMA_ERR_UNALLOCATED;
456  }
457  /* Check sequence status */
458  if (sequence->status == PLASMA_SUCCESS)
459  request->status = PLASMA_SUCCESS;
460  else
461  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
462 
463  /* Check descriptors for correctness */
464  if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
465  plasma_error("PLASMA_zcgesv_Tile", "invalid first descriptor");
467  }
468  if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
469  plasma_error("PLASMA_zcgesv_Tile", "invalid third descriptor");
471  }
472  if (plasma_desc_check(&descX) != PLASMA_SUCCESS) {
473  plasma_error("PLASMA_zcgesv_Tile", "invalid fourth descriptor");
475  }
476  /* Check input arguments */
477  if (descA.nb != descA.mb || descB.nb != descB.mb || descX.nb != descX.mb) {
478  plasma_error("PLASMA_zcgesv_Tile", "only square tiles supported");
480  }
481 
482  /* Set N, NRHS, NT */
483  N = descA.m;
484  NB = descA.nb;
485 
486  work = (double *)plasma_shared_alloc(plasma, PLASMA_SIZE, PlasmaRealDouble);
487  if (work == NULL) {
488  plasma_error("PLASMA_zcgesv", "plasma_shared_alloc() failed");
489  plasma_shared_free(plasma, work);
491  }
492 
493  plasma_zdesc_alloc( descR, NB, NB, descB.m, descB.n, 0, 0, descB.m, descB.n,
494  plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR) );
495  plasma_cdesc_alloc( descSA, NB, NB, descA.m, descA.n, 0, 0, descA.m, descA.n,
496  plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR);
497  plasma_desc_mat_free(&descSA) );
498  plasma_cdesc_alloc( descSX, NB, NB, descX.m, descX.n, 0, 0, descX.m, descX.n,
499  plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR);
500  plasma_desc_mat_free(&descSA); plasma_desc_mat_free(&descSX) );
501 
502  /* Compute some constants */
503  PLASMA_zlange(PlasmaInfNorm, descA, Anorm, work);
504  eps = LAPACKE_dlamch_work('e');
505 
506  /* Convert B from double precision to single precision and store
507  the result in SX. */
508  PLASMA_zlag2c(descB, descSX);
509  if (sequence->status != PLASMA_SUCCESS)
510  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
511 
512  /* Convert A from double precision to single precision and store
513  the result in SA. */
514  PLASMA_zlag2c(descA, descSA);
515  if (sequence->status != PLASMA_SUCCESS)
516  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
517 
518  /* Clear IPIV and Lbdl */
519  plasma_memzero(IPIV, N, PlasmaInteger);
520 
521  /* Compute the LU factorization of SA */
523  plasma_pcbarrier_tl2pnl,
524  PLASMA_desc, descSA,
525  PLASMA_sequence*, sequence,
526  PLASMA_request*, request);
527 
528  plasma_dynamic_call_4(plasma_pcgetrf_rectil,
529  PLASMA_desc, descSA,
530  int*, IPIV,
531  PLASMA_sequence*, sequence,
532  PLASMA_request*, request);
533 
534  /* Solve the system SA*SX = SB */
535  PLASMA_cgetrs(descSA, IPIV, descSX);
536 
537  /* Convert SX back to double precision */
538  PLASMA_clag2z(descSX, descX);
539 
540  /* Compute R = B - AX. */
541  PLASMA_zlacpy(descB, descR);
545  PLASMA_Complex64_t, negone,
546  PLASMA_desc, descA,
547  PLASMA_desc, descX,
548  PLASMA_Complex64_t, one,
549  PLASMA_desc, descR,
550  PLASMA_sequence*, sequence,
551  PLASMA_request*, request);
552 
553  /* Check whether the NRHS normwise backward error satisfies the
554  stopping criterion. If yes return. Note that ITER=0 (already set). */
555  PLASMA_zlange(PlasmaMaxNorm, descX, Xnorm, work);
556  PLASMA_zlange(PlasmaMaxNorm, descR, Rnorm, work);
557 
558  /* Wait the end of Anorm, Xnorm and Bnorm computations */
560 
561  cte = Anorm*eps*sqrt((double) N)*bwdmax;
562  if (Rnorm < Xnorm * cte){
563  /* The NRHS normwise backward errors satisfy the
564  stopping criterion. We are good to exit. */
565  plasma_desc_mat_free(&descSA);
566  plasma_desc_mat_free(&descSX);
567  plasma_desc_mat_free(&descR);
568  plasma_shared_free(plasma, work);
569  return PLASMA_SUCCESS;
570  }
571 
572  /* Iterative refinement */
573  for (iiter = 0; iiter < itermax; iiter++){
574 
575  /* Convert R from double precision to single precision
576  and store the result in SX. */
577  PLASMA_zlag2c(descR, descSX);
578 
579  /* Solve the system SA*SX = SB */
580  PLASMA_cgetrs(descSA, IPIV, descSX);
581 
582  /* Convert SX back to double precision and update the current
583  iterate. */
584  PLASMA_clag2z(descSX, descR);
585  PLASMA_zgeadd(one, descR, descX);
586 
587  /* Compute R = B - AX. */
588  PLASMA_zlacpy(descB,descR);
592  PLASMA_Complex64_t, negone,
593  PLASMA_desc, descA,
594  PLASMA_desc, descX,
595  PLASMA_Complex64_t, one,
596  PLASMA_desc, descR,
597  PLASMA_sequence*, sequence,
598  PLASMA_request*, request);
599 
600  /* Check whether the NRHS normwise backward errors satisfy the
601  stopping criterion. If yes, set ITER=IITER>0 and return. */
602  PLASMA_zlange(PlasmaInfNorm, descX, Xnorm, work);
603  PLASMA_zlange(PlasmaInfNorm, descR, Rnorm, work);
604 
605  /* Wait the end of Xnorm and Bnorm computations */
607 
608  if (Rnorm < Xnorm * cte){
609  /* The NRHS normwise backward errors satisfy the
610  stopping criterion. We are good to exit. */
611  *ITER = iiter;
612 
613  plasma_desc_mat_free(&descSA);
614  plasma_desc_mat_free(&descSX);
615  plasma_desc_mat_free(&descR);
616  plasma_shared_free(plasma, work);
617  return PLASMA_SUCCESS;
618  }
619  }
620 
621  /* We have performed ITER=itermax iterations and never satisified
622  the stopping criterion, set up the ITER flag accordingly and
623  follow up on double precision routine. */
624  *ITER = -itermax - 1;
625 
626  plasma_desc_mat_free(&descSA);
627  plasma_desc_mat_free(&descSX);
628  plasma_desc_mat_free(&descR);
629  plasma_shared_free(plasma, work);
630 
631  /* Single-precision iterative refinement failed to converge to a
632  satisfactory solution, so we resort to double precision. */
633 
634  /* Clear IPIV and Lbdl */
635  plasma_memzero(IPIV, N, PlasmaInteger);
636 
637  plasma_dynamic_call_4(plasma_pzgetrf_rectil,
638  PLASMA_desc, descA,
639  int*, IPIV,
640  PLASMA_sequence*, sequence,
641  PLASMA_request*, request);
642 
643  PLASMA_zlacpy(descB, descX);
644 
645  PLASMA_zgetrs(descA, IPIV, descX);
646 
647  return PLASMA_SUCCESS;
648 }