PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zcposv.c
Go to the documentation of this file.
1 
15 #include <stdlib.h>
16 #include <math.h>
17 #include <lapacke.h>
18 #include "common.h"
19 
20 #define PLASMA_zlag2c(_descA, _descSB) \
21  plasma_parallel_call_4(plasma_pzlag2c, \
22  PLASMA_desc, (_descA), \
23  PLASMA_desc, (_descSB), \
24  PLASMA_sequence*, sequence, \
25  PLASMA_request*, request)
26 
27 #define PLASMA_clag2z(_descSA, _descB) \
28  plasma_parallel_call_4(plasma_pclag2z, \
29  PLASMA_desc, (_descSA), \
30  PLASMA_desc, (_descB), \
31  PLASMA_sequence*, sequence, \
32  PLASMA_request*, request)
33 
34 #define PLASMA_zlange(_norm, _descA, _result, _work) \
35  _result = 0; \
36  plasma_parallel_call_6(plasma_pzlange, \
37  PLASMA_enum, (_norm), \
38  PLASMA_desc, (_descA), \
39  double*, (_work), \
40  double*, &(_result), \
41  PLASMA_sequence*, sequence, \
42  PLASMA_request*, request);
43 
44 #define PLASMA_zlanhe(_norm, _uplo, _descA, _result, _work) \
45  _result = 0; \
46  plasma_parallel_call_7(plasma_pzlanhe, \
47  PLASMA_enum, (_norm), \
48  PLASMA_enum, (_uplo), \
49  PLASMA_desc, (_descA), \
50  double*, (_work), \
51  double*, &(_result), \
52  PLASMA_sequence*, sequence, \
53  PLASMA_request*, request);
54 
55 #define PLASMA_zlacpy(_descA, _descB) \
56  plasma_parallel_call_5(plasma_pzlacpy, \
57  PLASMA_enum, PlasmaUpperLower, \
58  PLASMA_desc, (_descA), \
59  PLASMA_desc, (_descB), \
60  PLASMA_sequence*, sequence, \
61  PLASMA_request*, request)
62 
63 #define PLASMA_zgeadd(_alpha, _descA, _descB) \
64  plasma_parallel_call_5(plasma_pzgeadd, \
65  PLASMA_Complex64_t, (_alpha), \
66  PLASMA_desc, (_descA), \
67  PLASMA_desc, (_descB), \
68  PLASMA_sequence*, sequence, \
69  PLASMA_request*, request)
70 
71 /***************************************************************************/
171 int PLASMA_zcposv(PLASMA_enum uplo, int N, int NRHS,
172  PLASMA_Complex64_t *A, int LDA,
173  PLASMA_Complex64_t *B, int LDB,
174  PLASMA_Complex64_t *X, int LDX, int *ITER)
175 {
176  int NB;
177  int status;
178  PLASMA_desc descA;
179  PLASMA_desc descB;
180  PLASMA_desc descX;
182  PLASMA_sequence *sequence = NULL;
184 
185  plasma = plasma_context_self();
186  if (plasma == NULL) {
187  plasma_fatal_error("PLASMA_zcposv", "PLASMA not initialized");
189  }
190  /* Check input arguments */
191  if (uplo != PlasmaUpper && uplo != PlasmaLower) {
192  plasma_error("PLASMA_zcposv", "illegal value of uplo");
193  return -1;
194  }
195  if (N < 0) {
196  plasma_error("PLASMA_zcposv", "illegal value of N");
197  return -2;
198  }
199  if (NRHS < 0) {
200  plasma_error("PLASMA_zcposv", "illegal value of NRHS");
201  return -3;
202  }
203  if (LDA < max(1, N)) {
204  plasma_error("PLASMA_zcposv", "illegal value of LDA");
205  return -5;
206  }
207  if (LDB < max(1, N)) {
208  plasma_error("PLASMA_zcposv", "illegal value of LDB");
209  return -7;
210  }
211  if (LDX < max(1, N)) {
212  plasma_error("PLASMA_zcposv", "illegal value of LDX");
213  return -10;
214  }
215  /* Quick return - currently NOT equivalent to LAPACK's
216  * LAPACK does not have such check for ZCPOSV */
217  if (min(N, NRHS) == 0)
218  return PLASMA_SUCCESS;
219 
220  /* Tune NB depending on M, N & NRHS; Set NBNBSIZE */
221  status = plasma_tune(PLASMA_FUNC_ZCPOSV, N, N, NRHS);
222  if (status != PLASMA_SUCCESS) {
223  plasma_error("PLASMA_zcposv", "plasma_tune() failed");
224  return status;
225  }
226 
227  NB = PLASMA_NB;
228 
229  plasma_sequence_create(plasma, &sequence);
230 
231  /* DOUBLE PRECISION INITIALIZATION */
233  plasma_zooplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N , plasma_desc_mat_free(&(descA)) );
234  plasma_zooplap2tile( descB, B, NB, NB, LDB, NRHS, 0, 0, N, NRHS, plasma_desc_mat_free(&(descA)); plasma_desc_mat_free(&(descB)) );
235  plasma_zdesc_alloc( descX, NB, NB, N, NRHS, 0, 0, N, NRHS, plasma_desc_mat_free(&(descA)); plasma_desc_mat_free(&(descB)); plasma_desc_mat_free(&(descX)) );
236  } else {
237  plasma_ziplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N );
238  plasma_ziplap2tile( descB, B, NB, NB, LDB, NRHS, 0, 0, N, NRHS);
239 
240  descX = plasma_desc_init(
241  PlasmaComplexDouble, NB, NB, (NB*NB),
242  LDX, NRHS, 0, 0, N, NRHS);
243  descX.mat = X;
244  }
245 
246  /* Call the native interface */
247  status = PLASMA_zcposv_Tile_Async(uplo, &descA, &descB, &descX, ITER, sequence, &request);
248 
249  if (status == PLASMA_SUCCESS) {
251  plasma_zooptile2lap( descX, X, NB, NB, LDX, NRHS );
253  plasma_desc_mat_free(&descA);
254  plasma_desc_mat_free(&descB);
255  plasma_desc_mat_free(&descX);
256  } else {
257  plasma_ziptile2lap( descA, A, NB, NB, LDA, N );
258  plasma_ziptile2lap( descB, B, NB, NB, LDB, NRHS );
259  plasma_ziptile2lap( descX, X, NB, NB, LDX, NRHS );
261  }
262  }
263 
264  status = sequence->status;
265  plasma_sequence_destroy(plasma, sequence);
266  return status;
267 }
268 
269 
270 /***************************************************************************/
324  PLASMA_desc *X, int *ITER)
325 {
327  PLASMA_sequence *sequence = NULL;
329  int status;
330 
331  plasma = plasma_context_self();
332  if (plasma == NULL) {
333  plasma_fatal_error("PLASMA_zcposv_Tile", "PLASMA not initialized");
335  }
336  plasma_sequence_create(plasma, &sequence);
337  status = PLASMA_zcposv_Tile_Async(uplo, A, B, X, ITER, sequence, &request);
338  if (status != PLASMA_SUCCESS)
339  return status;
341  status = sequence->status;
342  plasma_sequence_destroy(plasma, sequence);
343  return status;
344 }
345 
346 /***************************************************************************/
375  PLASMA_desc *X, int *ITER,
376  PLASMA_sequence *sequence, PLASMA_request *request)
377 {
378  int N, NB;
379  PLASMA_desc descA = *A;
380  PLASMA_desc descB = *B;
381  PLASMA_desc descX = *X;
383  double *work;
384  PLASMA_desc descR, descSA, descSX;
385 
386  const int itermax = 30;
387  const double bwdmax = 1.0;
388  const PLASMA_Complex64_t negone = -1.0;
389  const PLASMA_Complex64_t one = 1.0;
390  int iiter;
391  double Anorm, cte, eps, Rnorm, Xnorm;
392  *ITER=0;
393 
394  plasma = plasma_context_self();
395  if (plasma == NULL) {
396  plasma_fatal_error("PLASMA_zcposv_Tile_Async", "PLASMA not initialized");
398  }
399  if (sequence == NULL) {
400  plasma_fatal_error("PLASMA_zcposv_Tile_Async", "NULL sequence");
401  return PLASMA_ERR_UNALLOCATED;
402  }
403  if (request == NULL) {
404  plasma_fatal_error("PLASMA_zcposv_Tile_Async", "NULL request");
405  return PLASMA_ERR_UNALLOCATED;
406  }
407  /* Check sequence status */
408  if (sequence->status == PLASMA_SUCCESS)
409  request->status = PLASMA_SUCCESS;
410  else
411  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
412 
413  /* Check descriptors for correctness */
414  if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
415  plasma_error("PLASMA_zcposv_Tile_Async", "invalid first descriptor");
417  }
418  if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
419  plasma_error("PLASMA_zcposv_Tile_Async", "invalid second descriptor");
421  }
422  if (plasma_desc_check(&descX) != PLASMA_SUCCESS) {
423  plasma_error("PLASMA_zcposv_Tile_Async", "invalid third descriptor");
425  }
426  /* Check input arguments */
427  if (descA.nb != descA.mb || descB.nb != descB.mb || descX.nb != descX.mb) {
428  plasma_error("PLASMA_zcposv_Tile_Async", "only square tiles supported");
430  }
431  if (uplo != PlasmaUpper && uplo != PlasmaLower) {
432  plasma_error("PLASMA_zcposv_Tile_Async", "illegal value of uplo");
433  return -1;
434  }
435  /* Quick return - currently NOT equivalent to LAPACK's
436  * LAPACK does not have such check for DPOSV */
437 
438 /*
439  if (min(N, NRHS) == 0)
440  return PLASMA_SUCCESS;
441 */
442 
443  /* Set N, NRHS */
444  N = descA.m;
445  NB = descA.nb;
446 
447  work = (double *)plasma_shared_alloc(plasma, PLASMA_SIZE, PlasmaRealDouble);
448  if (work == NULL) {
449  plasma_error("PLASMA_zcposv_Tile_Async", "plasma_shared_alloc() failed");
450  plasma_shared_free(plasma, work);
452  }
453 
454  plasma_zdesc_alloc( descR, NB, NB, descB.m, descB.n, 0, 0, descB.m, descB.n, plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR) );
455  plasma_cdesc_alloc( descSA, NB, NB, descA.m, descA.n, 0, 0, descA.m, descA.n, plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR); plasma_desc_mat_free(&descSA) );
456  plasma_cdesc_alloc( descSX, NB, NB, descX.m, descX.n, 0, 0, descX.m, descX.n, plasma_shared_free( plasma, work ); plasma_desc_mat_free(&descR); plasma_desc_mat_free(&descSA); plasma_desc_mat_free(&descSX) );
457 
458  /* Compute some constants */
459  PLASMA_zlanhe(PlasmaInfNorm, uplo, descA, Anorm, work);
460  eps = LAPACKE_dlamch_work('e');
461 
462  /* Convert B from double precision to single precision and store
463  the result in SX. */
464  PLASMA_zlag2c(descB, descSX);
465  if (sequence->status != PLASMA_SUCCESS)
466  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
467 
468  /* Convert A from double precision to single precision and store
469  the result in SA. */
470  PLASMA_zlag2c(descA, descSA);
471  if (sequence->status != PLASMA_SUCCESS)
472  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
473 
474  /* Compute the Cholesky factorization of SA */
476  PLASMA_enum, uplo,
477  PLASMA_desc, descSA,
478  PLASMA_sequence*, sequence,
479  PLASMA_request*, request);
480 
481  /* Solve the system SA*SX = SB */
482  /* Forward substitution */
485  PLASMA_enum, uplo,
488  PLASMA_Complex32_t, 1.0,
489  PLASMA_desc, descSA,
490  PLASMA_desc, descSX,
491  PLASMA_sequence*, sequence,
492  PLASMA_request*, request);
493 
494  /* Backward substitution */
497  PLASMA_enum, uplo,
498  PLASMA_enum, uplo == PlasmaUpper ? PlasmaNoTrans : PlasmaConjTrans,
500  PLASMA_Complex32_t, 1.0,
501  PLASMA_desc, descSA,
502  PLASMA_desc, descSX,
503  PLASMA_sequence*, sequence,
504  PLASMA_request*, request);
505 
506  /* Convert SX back to double precision */
507  PLASMA_clag2z(descSX, descX);
508 
509  /* Compute R = B - AX. */
510  PLASMA_zlacpy(descB,descR);
513  PLASMA_enum, uplo,
514  PLASMA_Complex64_t, negone,
515  PLASMA_desc, descA,
516  PLASMA_desc, descX,
517  PLASMA_Complex64_t, one,
518  PLASMA_desc, descR,
519  PLASMA_sequence*, sequence,
520  PLASMA_request*, request);
521 
522  /* Check whether the NRHS normwise backward error satisfies the
523  stopping criterion. If yes return. Note that ITER=0 (already set). */
524  PLASMA_zlange(PlasmaInfNorm, descX, Xnorm, work);
525  PLASMA_zlange(PlasmaInfNorm, descR, Rnorm, work);
526 
527  /* Wait the end of Anorm, Xnorm and Bnorm computations */
529 
530  cte = Anorm*eps*((double) N)*bwdmax;
531  if (Rnorm < Xnorm * cte){
532  /* The NRHS normwise backward errors satisfy the
533  stopping criterion. We are good to exit. */
534  plasma_desc_mat_free(&descSA);
535  plasma_desc_mat_free(&descSX);
536  plasma_desc_mat_free(&descR);
537  plasma_shared_free(plasma, work);
538  return PLASMA_SUCCESS;
539  }
540 
541  /* Iterative refinement */
542  for (iiter = 0; iiter < itermax; iiter++){
543 
544  /* Convert R from double precision to single precision
545  and store the result in SX. */
546  PLASMA_zlag2c(descR, descSX);
547 
548  /* Solve the system SA*SX = SR */
549  /* Forward substitution */
552  PLASMA_enum, uplo,
553  PLASMA_enum, uplo == PlasmaUpper ? PlasmaConjTrans : PlasmaNoTrans,
555  PLASMA_Complex32_t, 1.0,
556  PLASMA_desc, descSA,
557  PLASMA_desc, descSX,
558  PLASMA_sequence*, sequence,
559  PLASMA_request*, request);
560 
561  /* Backward substitution */
564  PLASMA_enum, uplo,
565  PLASMA_enum, uplo == PlasmaUpper ? PlasmaNoTrans : PlasmaConjTrans,
567  PLASMA_Complex32_t, 1.0,
568  PLASMA_desc, descSA,
569  PLASMA_desc, descSX,
570  PLASMA_sequence*, sequence,
571  PLASMA_request*, request);
572 
573  /* Convert SX back to double precision and update the current
574  iterate. */
575  PLASMA_clag2z(descSX, descR);
576  PLASMA_zgeadd(one, descR, descX);
577 
578  /* Compute R = B - AX. */
579  PLASMA_zlacpy(descB,descR);
582  PLASMA_enum, uplo,
583  PLASMA_Complex64_t, negone,
584  PLASMA_desc, descA,
585  PLASMA_desc, descX,
586  PLASMA_Complex64_t, one,
587  PLASMA_desc, descR,
588  PLASMA_sequence*, sequence,
589  PLASMA_request*, request);
590 
591  /* Check whether the NRHS normwise backward errors satisfy the
592  stopping criterion. If yes, set ITER=IITER>0 and return. */
593  PLASMA_zlange(PlasmaInfNorm, descX, Xnorm, work);
594  PLASMA_zlange(PlasmaInfNorm, descR, Rnorm, work);
595 
596  /* Wait the end of Xnorm and Bnorm computations */
598 
599  if (Rnorm < Xnorm * cte){
600  /* The NRHS normwise backward errors satisfy the
601  stopping criterion. We are good to exit. */
602  *ITER = iiter;
603 
604  plasma_desc_mat_free(&descSA);
605  plasma_desc_mat_free(&descSX);
606  plasma_desc_mat_free(&descR);
607  plasma_shared_free(plasma, work);
608  return PLASMA_SUCCESS;
609  }
610  }
611 
612  /* We have performed ITER=itermax iterations and never satisified
613  the stopping criterion, set up the ITER flag accordingly and
614  follow up on double precision routine. */
615  *ITER = -itermax - 1;
616 
617  plasma_desc_mat_free(&descSA);
618  plasma_desc_mat_free(&descSX);
619  plasma_desc_mat_free(&descR);
620  plasma_shared_free(plasma, work);
621 
622  /* Single-precision iterative refinement failed to converge to a
623  satisfactory solution, so we resort to double precision. */
624 
626  PLASMA_enum, uplo,
627  PLASMA_desc, descA,
628  PLASMA_sequence*, sequence,
629  PLASMA_request*, request);
630 
631  PLASMA_zlacpy(descB,descX);
632 
635  PLASMA_enum, uplo,
636  PLASMA_enum, uplo == PlasmaUpper ? PlasmaConjTrans : PlasmaNoTrans,
638  PLASMA_Complex64_t, 1.0,
639  PLASMA_desc, descA,
640  PLASMA_desc, descX,
641  PLASMA_sequence*, sequence,
642  PLASMA_request*, request);
643 
646  PLASMA_enum, uplo,
647  PLASMA_enum, uplo == PlasmaUpper ? PlasmaNoTrans : PlasmaConjTrans,
649  PLASMA_Complex64_t, 1.0,
650  PLASMA_desc, descA,
651  PLASMA_desc, descX,
652  PLASMA_sequence*, sequence,
653  PLASMA_request*, request);
654 
655  return PLASMA_SUCCESS;
656 }