PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zcgels.c
Go to the documentation of this file.
1 
15 #include <stdlib.h>
16 #include <stdio.h>
17 #include <math.h>
18 #include <lapacke.h>
19 #include "common.h"
20 
21 #define PLASMA_zlag2c(_descA, _descSB) \
22  plasma_parallel_call_4(plasma_pzlag2c, \
23  PLASMA_desc, (_descA), \
24  PLASMA_desc, (_descSB), \
25  PLASMA_sequence*, sequence, \
26  PLASMA_request*, request)
27 
28 #define PLASMA_clag2z(_descSA, _descB) \
29  plasma_parallel_call_4(plasma_pclag2z, \
30  PLASMA_desc, (_descSA), \
31  PLASMA_desc, (_descB), \
32  PLASMA_sequence*, sequence, \
33  PLASMA_request*, request)
34 
35 #define PLASMA_zlange(_norm, _descA, _result, _work) \
36  _result = 0; \
37  plasma_parallel_call_6(plasma_pzlange, \
38  PLASMA_enum, (_norm), \
39  PLASMA_desc, (_descA), \
40  double*, (_work), \
41  double*, &(_result), \
42  PLASMA_sequence*, sequence, \
43  PLASMA_request*, request);
44 
45 #define PLASMA_zlacpy(_descA, _descB) \
46  plasma_parallel_call_5(plasma_pzlacpy, \
47  PLASMA_enum, PlasmaUpperLower, \
48  PLASMA_desc, (_descA), \
49  PLASMA_desc, (_descB), \
50  PLASMA_sequence*, sequence, \
51  PLASMA_request*, request)
52 
53 #define PLASMA_zgeadd(_alpha, _descA, _descB) \
54  plasma_parallel_call_5(plasma_pzgeadd, \
55  PLASMA_Complex64_t, (_alpha), \
56  PLASMA_desc, (_descA), \
57  PLASMA_desc, (_descB), \
58  PLASMA_sequence*, sequence, \
59  PLASMA_request*, request)
60 
61 /***************************************************************************/
166 int PLASMA_zcgels(PLASMA_enum trans, int M, int N, int NRHS,
167  PLASMA_Complex64_t *A, int LDA,
168  PLASMA_Complex64_t *B, int LDB,
169  PLASMA_Complex64_t *X, int LDX, int *ITER)
170 {
171  int i, j;
172  int NB, NBNB, MT, NT, NTRHS;
173  int status;
174  PLASMA_desc descA;
175  PLASMA_desc descB;
176  PLASMA_desc *descT;
177  PLASMA_desc descX;
179  PLASMA_sequence *sequence = NULL;
181 
182  plasma = plasma_context_self();
183  if (plasma == NULL) {
184  plasma_fatal_error("PLASMA_zcgels", "PLASMA not initialized");
186  }
187  /* Check input arguments */
188  if (trans != PlasmaNoTrans) {
189  plasma_error("PLASMA_zcgels", "only PlasmaNoTrans supported");
191  }
192  if (M < 0) {
193  plasma_error("PLASMA_zcgels", "illegal value of M");
194  return -2;
195  }
196  if (N < 0) {
197  plasma_error("PLASMA_zcgels", "illegal value of N");
198  return -3;
199  }
200  if (NRHS < 0) {
201  plasma_error("PLASMA_zcgels", "illegal value of NRHS");
202  return -4;
203  }
204  if (LDA < max(1, M)) {
205  plasma_error("PLASMA_zcgels", "illegal value of LDA");
206  return -6;
207  }
208  if (LDB < max(1, max(M, N))) {
209  plasma_error("PLASMA_zcgels", "illegal value of LDB");
210  return -9;
211  }
212  if (LDX < max(1, max(M, N))) {
213  plasma_error("PLASMA_zcgels", "illegal value of LDX");
214  return -10;
215  }
216  /* Quick return */
217  if (min(M, min(N, NRHS)) == 0) {
218  for (i = 0; i < max(M, N); i++)
219  for (j = 0; j < NRHS; j++)
220  B[j*LDB+i] = 0.0;
221  return PLASMA_SUCCESS;
222  }
223 
224  /* Tune NB & IB depending on M, N & NRHS; Set NBNB */
225  status = plasma_tune(PLASMA_FUNC_ZCGELS, M, N, NRHS);
226  if (status != PLASMA_SUCCESS) {
227  plasma_error("PLASMA_zcgels", "plasma_tune() failed");
228  return status;
229  }
230 
231  /* Set MT, NT & NTRHS */
232  NB = PLASMA_NB;
233  NBNB = NB*NB;
234  NT = (N%NB==0) ? (N/NB) : (N/NB+1);
235  MT = (M%NB==0) ? (M/NB) : (M/NB+1);
236  NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
237  printf("M %d, N %d, NRHS %d, NB %d, MT %d, NT %d, NTRHS %d\n", M, N, NRHS, NB, MT, NT, NTRHS);
238 
239  plasma_sequence_create(plasma, &sequence);
240 
241  descA = plasma_desc_init(
243  NB, NB, NBNB,
244  M, N, 0, 0, M, N);
245 
246  if (M >= N) {
247  descB = plasma_desc_init(
249  NB, NB, NBNB,
250  M, NRHS, 0, 0, M, NRHS);
251 
252  descX = plasma_desc_init(
254  NB, NB, NBNB,
255  M, NRHS, 0, 0, M, NRHS);
256 
257  }
258  else {
259  descB = plasma_desc_init(
261  NB, NB, NBNB,
262  N, NRHS, 0, 0, N, NRHS);
263 
264  descX = plasma_desc_init(
266  NB, NB, NBNB,
267  N, NRHS, 0, 0, N, NRHS);
268  }
269 
270  /* DOUBLE PRECISION INITIALIZATION */
271  /* Allocate memory for matrices in block layout */
272  if (plasma_desc_mat_alloc(&descA) || plasma_desc_mat_alloc(&descB) || plasma_desc_mat_alloc(&descX)) {
273  plasma_error("PLASMA_zcgels", "plasma_shared_alloc() failed");
274  plasma_desc_mat_free(&descA);
275  plasma_desc_mat_free(&descB);
276  plasma_desc_mat_free(&descX);
278  }
279 
281  PLASMA_Complex64_t*, A,
282  int, LDA,
283  PLASMA_desc, descA,
284  PLASMA_sequence*, sequence,
285  PLASMA_request*, &request);
286 
288  PLASMA_Complex64_t*, B,
289  int, LDB,
290  PLASMA_desc, descB,
291  PLASMA_sequence*, sequence,
292  PLASMA_request*, &request);
293 
294  /* Allocate workspace */
295  PLASMA_Alloc_Workspace_zgels_Tile(M, N, &descT);
296 
297  /* Call the native interface */
298  status = PLASMA_zcgels_Tile_Async(PlasmaNoTrans, &descA, descT, &descB, &descX, ITER,
299  sequence, &request);
300 
301  if (status == PLASMA_SUCCESS) {
303  PLASMA_desc, descX,
304  PLASMA_Complex64_t*, X,
305  int, LDX,
306  PLASMA_sequence*, sequence,
307  PLASMA_request*, &request);
308  }
310 
312  plasma_sequence_destroy(plasma, sequence);
313  plasma_desc_mat_free(&descA);
314  plasma_desc_mat_free(&descB);
315  plasma_desc_mat_free(&descX);
316  return status;
317 }
318 
319 /***************************************************************************/
391  PLASMA_desc *B, PLASMA_desc *X, int *ITER)
392 {
394  PLASMA_sequence *sequence = NULL;
396  int status;
397 
398  plasma = plasma_context_self();
399  if (plasma == NULL) {
400  plasma_fatal_error("PLASMA_zcgels_Tile", "PLASMA not initialized");
402  }
403  plasma_sequence_create(plasma, &sequence);
404  status = PLASMA_zcgels_Tile_Async(trans, A, T, B, X, ITER, sequence, &request);
405  if (status != PLASMA_SUCCESS)
406  return status;
408  status = sequence->status;
409  plasma_sequence_destroy(plasma, sequence);
410  return status;
411 }
412 
413 /***************************************************************************/
442  PLASMA_desc *B, PLASMA_desc *X, int *ITER,
443  PLASMA_sequence *sequence, PLASMA_request *request)
444 {
445  int M, N, NRHS, NB, NBNB, MT, NT, NTRHS;
446  PLASMA_desc descA = *A;
447  PLASMA_desc descT = *T;
448  PLASMA_desc descB = *B;
449  PLASMA_desc descX = *X;
451  double *work;
452 
453  const int itermax = 30;
454  const double bwdmax = 1.0;
455  const PLASMA_Complex64_t negone = -1.0;
456  const PLASMA_Complex64_t one = 1.0;
457  int iiter;
458  double Anorm, cte, eps, Rnorm, Xnorm;
459  *ITER=0;
460 
461  plasma = plasma_context_self();
462  if (plasma == NULL) {
463  plasma_fatal_error("PLASMA_zcgels_Tile", "PLASMA not initialized");
465  }
466  if (sequence == NULL) {
467  plasma_fatal_error("PLASMA_zcgels_Tile", "NULL sequence");
468  return PLASMA_ERR_UNALLOCATED;
469  }
470  if (request == NULL) {
471  plasma_fatal_error("PLASMA_zcgels_Tile", "NULL request");
472  return PLASMA_ERR_UNALLOCATED;
473  }
474  /* Check sequence status */
475  if (sequence->status == PLASMA_SUCCESS)
476  request->status = PLASMA_SUCCESS;
477  else
478  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
479 
480  /* Check descriptors for correctness */
481  if (plasma_desc_check(&descA) != PLASMA_SUCCESS) {
482  plasma_error("PLASMA_zcgels_Tile", "invalid first descriptor");
484  }
485  if (plasma_desc_check(&descT) != PLASMA_SUCCESS) {
486  plasma_error("PLASMA_zcgels_Tile", "invalid second descriptor");
488  }
489  if (plasma_desc_check(&descB) != PLASMA_SUCCESS) {
490  plasma_error("PLASMA_zcgels_Tile", "invalid third descriptor");
492  }
493  if (plasma_desc_check(&descX) != PLASMA_SUCCESS) {
494  plasma_error("PLASMA_zcgels_Tile", "invalid fourth descriptor");
496  }
497  /* Check input arguments */
498  if (descA.nb != descA.mb || descB.nb != descB.mb || descX.nb != descX.mb) {
499  plasma_error("PLASMA_zcgels_Tile", "only square tiles supported");
501  }
502  if (trans != PlasmaNoTrans) {
503  plasma_error("PLASMA_zcgels_Tile", "only PlasmaNoTrans supported");
505  }
506  /* Quick return - currently NOT equivalent to LAPACK's:
507  if (min(M, min(N, NRHS)) == 0) {
508  for (i = 0; i < max(M, N); i++)
509  for (j = 0; j < NRHS; j++)
510  B[j*LDB+i] = 0.0;
511  return PLASMA_SUCCESS;
512  }
513  */
514 
515  if (0 == 0) {
516  // START SPECIFIC
517 
518  /* Set M, M, NRHS, NB, MT, NT & NTRHS */
519  M = descA.lm;
520  N = descA.ln;
521  NRHS = descB.ln;
522  NB = descA.nb;
523  NBNB = NB*NB;
524 
525  MT = (M%NB==0) ? (M/NB) : (M/NB+1);
526  NT = (N%NB==0) ? (N/NB) : (N/NB+1);
527  NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
528  printf("M %d, N %d, NRHS %d, NB %d, MT %d, NT %d, NTRHS %d\n", M, N, NRHS, NB, MT, NT, NTRHS);
529 
530  work = (double *)plasma_shared_alloc(plasma, PLASMA_SIZE, PlasmaRealDouble);
531  if (work == NULL) {
532  plasma_error("PLASMA_zcgesv", "plasma_shared_alloc() failed");
533  plasma_shared_free(plasma, work);
535  }
536 
539  NB, NB, NBNB,
540  M, NRHS, 0, 0, M, NRHS);
541 
542  if (plasma_desc_mat_alloc(&descR)) {
543  plasma_error("PLASMA_zcgesv", "plasma_shared_alloc() failed");
544  plasma_desc_mat_free(&descR);
545  plasma_shared_free(plasma, work);
547  }
548 
549  PLASMA_desc descSA = plasma_desc_init(
551  NB, NB, NBNB,
552  M, N, 0, 0, M, N);
553 
554  PLASMA_desc descST = plasma_desc_init(
556  IB, NB, IBNB,
557  M, N, 0, 0, M, N);
558 
559  PLASMA_desc descSX = plasma_desc_init(
561  NB, NB, NBNB,
562  M, NRHS, 0, 0, M, NRHS);
563 
564  /* Allocate memory for single precision matrices in block layout */
565  if (plasma_desc_mat_alloc(&descSA) || plasma_desc_mat_alloc(&descST) || plasma_desc_mat_alloc(&descSX)) {
566  plasma_error("PLASMA_zcgesv", "plasma_shared_alloc() failed");
567  plasma_desc_mat_free(&descSA);
568  plasma_desc_mat_free(&descST);
569  plasma_desc_mat_free(&descSX);
570  plasma_desc_mat_free(&descR);
571  plasma_shared_free(plasma, work);
573  }
574 
575  /* Compute some constants */
576  PLASMA_zlange(PlasmaInfNorm, descA, Anorm, work);
577  eps = LAPACKE_dlamch_work('e');
578 
579  printf("Anorm=%e, cte=%e\n", Anorm, cte);
580 
581  /* Convert B from double precision to single precision and store
582  the result in SX. */
583  PLASMA_zlag2c(descB, descSX);
584  if (sequence->status != PLASMA_SUCCESS)
585  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
586 
587  /* Convert A from double precision to single precision and store
588  the result in SA. */
589  PLASMA_zlag2c(descA, descSA);
590  if (sequence->status != PLASMA_SUCCESS)
591  return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);
592 
593  if (descSA.m >= descSA.n) {
594 
595  /* Compute the QR factorization of SA */
596  printf("Facto\n"); fflush(stdout);
598  PLASMA_desc, descSA,
599  PLASMA_desc, descST,
600  PLASMA_sequence*, sequence,
601  PLASMA_request*, request);
602 
603  printf("Solve\n"); fflush(stdout);
605  PLASMA_desc, descSA,
606  PLASMA_desc, descSX,
607  PLASMA_desc, descST,
608  PLASMA_sequence*, sequence,
609  PLASMA_request*, request);
610 
616  PLASMA_Complex32_t, 1.0,
617  PLASMA_desc, plasma_desc_submatrix(descSA, 0, 0, descSA.n, descSA.n),
618  PLASMA_desc, plasma_desc_submatrix(descSX, 0, 0, descSA.n, descSX.n),
619  PLASMA_sequence*, sequence,
620  PLASMA_request*, request);
621  }
622  else {
624  PLASMA_desc, plasma_desc_submatrix(descSX, descSA.m, 0, descSA.n-descSA.m, descSX.n),
625  PLASMA_sequence*, sequence,
626  PLASMA_request*, request);
627 
629  PLASMA_desc, descSA,
630  PLASMA_desc, descST,
631  PLASMA_sequence*, sequence,
632  PLASMA_request*, request);
633 
639  PLASMA_Complex32_t, 1.0,
640  PLASMA_desc, plasma_desc_submatrix(descSA, 0, 0, descSA.m, descSA.m),
641  PLASMA_desc, plasma_desc_submatrix(descSX, 0, 0, descSA.m, descSX.n),
642  PLASMA_sequence*, sequence,
643  PLASMA_request*, request);
644 
646  PLASMA_desc, descSA,
647  PLASMA_desc, descSX,
648  PLASMA_desc, descST,
649  PLASMA_sequence*, sequence,
650  PLASMA_request*, request);
651  }
652 
653  /* Convert SX back to double precision */
654  PLASMA_clag2z(descSX, descX);
655 
656  /* Compute R = B - AX. */
657  printf("R = B - Ax\n"); fflush(stdout);
658  printf("R = B - Ax ... cpy\n"); fflush(stdout);
659  PLASMA_zlacpy(descB,descR);
660  printf("R = B - Ax ... gemm\n"); fflush(stdout);
661 
665  PLASMA_Complex64_t, negone,
666  PLASMA_desc, descA,
667  PLASMA_desc, descX,
668  PLASMA_Complex64_t, one,
669  PLASMA_desc, descR,
670  PLASMA_sequence*, sequence,
671  PLASMA_request*, request);
672 
673  /* Check whether the NRHS normwise backward error satisfies the
674  stopping criterion. If yes return. Note that ITER=0 (already set). */
675  printf("Norm of X and R\n"); fflush(stdout);
676  PLASMA_zlange(PlasmaInfNorm, descX, Xnorm, work);
677  PLASMA_zlange(PlasmaInfNorm, descR, Rnorm, work);
678 
679  /* Wait the end of Anorm, Xnorm and Bnorm computations */
681 
682  cte = Anorm*eps*((double) N)*bwdmax;
683  if (Rnorm < Xnorm * cte){
684  /* The NRHS normwise backward errors satisfy the
685  stopping criterion. We are good to exit. */
686  plasma_desc_mat_free(&descSA);
687  plasma_desc_mat_free(&descST);
688  plasma_desc_mat_free(&descSX);
689  plasma_desc_mat_free(&descR);
690  plasma_shared_free(plasma, work);
691  return PLASMA_SUCCESS;
692  }
693 
694  printf("Rnorm=%e, Xnorm * cte=%e, Rnorm=%e, cte=%e\n", Rnorm, Xnorm * cte, Rnorm, cte);
695 
696  /* Iterative refinement */
697  for (iiter = 0; iiter < itermax; iiter++){
698 
699  /* Convert R from double precision to single precision
700  and store the result in SX. */
701  PLASMA_zlag2c(descR, descSX);
702 
703  /* Solve the system SA*SX = SR */
704  if (descSA.m >= descSA.n) {
705 
707  PLASMA_desc, descSA,
708  PLASMA_desc, descSX,
709  PLASMA_desc, descST,
710  PLASMA_sequence*, sequence,
711  PLASMA_request*, request);
712 
718  PLASMA_Complex32_t, 1.0,
719  PLASMA_desc, plasma_desc_submatrix(descSA, 0, 0, descSA.n, descSA.n),
720  PLASMA_desc, plasma_desc_submatrix(descSX, 0, 0, descSA.n, descSX.n),
721  PLASMA_sequence*, sequence,
722  PLASMA_request*, request);
723  } else {
729  PLASMA_Complex32_t, 1.0,
730  PLASMA_desc, plasma_desc_submatrix(descSA, 0, 0, descSA.m, descSA.m),
731  PLASMA_desc, plasma_desc_submatrix(descSX, 0, 0, descSA.m, descSX.n),
732  PLASMA_sequence*, sequence,
733  PLASMA_request*, request);
734 
736  PLASMA_desc, descSA,
737  PLASMA_desc, descSX,
738  PLASMA_desc, descST,
739  PLASMA_sequence*, sequence,
740  PLASMA_request*, request);
741  }
742 
743 
744  /* Convert SX back to double precision and update the current
745  iterate. */
746  PLASMA_clag2z(descSX, descR);
747  PLASMA_zgeadd(one, descR, descX);
748 
749  /* Compute R = B - AX. */
750  PLASMA_zlacpy(descB,descR);
754  PLASMA_Complex64_t, negone,
755  PLASMA_desc, descA,
756  PLASMA_desc, descX,
757  PLASMA_Complex64_t, one,
758  PLASMA_desc, descR,
759  PLASMA_sequence*, sequence,
760  PLASMA_request*, request);
761 
762  /* Check whether the NRHS normwise backward errors satisfy the
763  stopping criterion. If yes, set ITER=IITER>0 and return. */
764  PLASMA_zlange(PlasmaInfNorm, descX, Xnorm, work);
765  PLASMA_zlange(PlasmaInfNorm, descR, Rnorm, work);
766 
767  /* Wait the end of Xnorm and Bnorm computations */
769 
770  printf("Rnorm=%e, Xnorm * cte=%e, Rnorm=%e, cte=%e\n", Rnorm, Xnorm * cte, Rnorm, cte);
771 
772  if (Rnorm < Xnorm * cte){
773  /* The NRHS normwise backward errors satisfy the
774  stopping criterion. We are good to exit. */
775  *ITER = iiter;
776 
777  plasma_desc_mat_free(&descSA);
778  plasma_desc_mat_free(&descST);
779  plasma_desc_mat_free(&descSX);
780  plasma_desc_mat_free(&descR);
781  plasma_shared_free(plasma, work);
782  return PLASMA_SUCCESS;
783  }
784  }
785 
786  /* We have performed ITER=itermax iterations and never satisified
787  the stopping criterion, set up the ITER flag accordingly and
788  follow up on double precision routine. */
789  *ITER = -itermax - 1;
790 
791  plasma_desc_mat_free(&descSA);
792  plasma_desc_mat_free(&descST);
793  plasma_desc_mat_free(&descSX);
794  plasma_desc_mat_free(&descR);
795  plasma_shared_free(plasma, work);
796 
797  printf("Go back DOUBLE\n");
798  // END SPECIFIC
799  }
800 
801  /* Single-precision iterative refinement failed to converge to a
802  satisfactory solution, so we resort to double precision. */
803  PLASMA_zlacpy(descB, descX);
804 
805  if (descA.m >= descA.n) {
807  PLASMA_desc, descA,
808  PLASMA_desc, descT,
809  PLASMA_sequence*, sequence,
810  PLASMA_request*, request);
811 
813  PLASMA_desc, descA,
814  PLASMA_desc, descX,
815  PLASMA_desc, descT,
816  PLASMA_sequence*, sequence,
817  PLASMA_request*, request);
818 
824  PLASMA_Complex64_t, 1.0,
825  PLASMA_desc, plasma_desc_submatrix(descA, 0, 0, descA.n, descA.n),
826  PLASMA_desc, plasma_desc_submatrix(descX, 0, 0, descA.n, descX.n),
827  PLASMA_sequence*, sequence,
828  PLASMA_request*, request);
829  }
830  else {
832  PLASMA_desc, plasma_desc_submatrix(descX, descA.m, 0, descA.n-descA.m, descX.n),
833  PLASMA_sequence*, sequence,
834  PLASMA_request*, request);
835 
837  PLASMA_desc, descA,
838  PLASMA_desc, descT,
839  PLASMA_sequence*, sequence,
840  PLASMA_request*, request);
841 
847  PLASMA_Complex64_t, 1.0,
848  PLASMA_desc, plasma_desc_submatrix(descA, 0, 0, descA.m, descA.m),
849  PLASMA_desc, plasma_desc_submatrix(descX, 0, 0, descA.m, descX.n),
850  PLASMA_sequence*, sequence,
851  PLASMA_request*, request);
852 
854  PLASMA_desc, descA,
855  PLASMA_desc, descX,
856  PLASMA_desc, descT,
857  PLASMA_sequence*, sequence,
858  PLASMA_request*, request);
859  }
860  return PLASMA_SUCCESS;
861 }