/*
 * PLASMA_zlag2c(descA, descSB)
 *
 * Expands to a plasma_parallel_call_4() launch of plasma_pzlag2c, handing it
 * two PLASMA_desc values followed by `sequence` and `request`.
 *
 * `sequence` (PLASMA_sequence*) and `request` (PLASMA_request*) are NOT macro
 * parameters: they are taken from whatever is in scope where the macro is
 * expanded, so the caller must have variables with exactly those names.
 *
 * NOTE(review): going by the routine name, this is presumably the
 * double-complex -> single-complex conversion of descA into descSB; confirm
 * against plasma_pzlag2c.
 */
#define PLASMA_zlag2c(descA, descSB)            \
    plasma_parallel_call_4(                     \
        plasma_pzlag2c,                         \
        PLASMA_desc,      (descA),              \
        PLASMA_desc,      (descSB),             \
        PLASMA_sequence*, sequence,             \
        PLASMA_request*,  request)
/*
 * PLASMA_clag2z(descSA, descB)
 *
 * Expands to a plasma_parallel_call_4() launch of plasma_pclag2z, handing it
 * two PLASMA_desc values followed by `sequence` and `request`.
 *
 * `sequence` (PLASMA_sequence*) and `request` (PLASMA_request*) are NOT macro
 * parameters: they are taken from whatever is in scope where the macro is
 * expanded, so the caller must have variables with exactly those names.
 *
 * NOTE(review): going by the routine name, this is presumably the
 * single-complex -> double-complex conversion of descSA into descB; confirm
 * against plasma_pclag2z.
 */
#define PLASMA_clag2z(descSA, descB)            \
    plasma_parallel_call_4(                     \
        plasma_pclag2z,                         \
        PLASMA_desc,      (descSA),             \
        PLASMA_desc,      (descB),              \
        PLASMA_sequence*, sequence,             \
        PLASMA_request*,  request)
/*
 * PLASMA_zlange(_norm, _descA, _result, _work)
 *
 * Expands to a plasma_parallel_call_6() launch of plasma_pzlange with six
 * type/value pairs:
 *   PLASMA_enum       _norm     - norm selector
 *   PLASMA_desc       _descA    - matrix descriptor
 *   double*           _work     - workspace pointer supplied by the caller
 *   double*           &_result  - address of the lvalue receiving the norm
 *   PLASMA_sequence*  sequence  - taken from the expansion site
 *   PLASMA_request*   request   - taken from the expansion site
 *
 * `_result` must be an lvalue of type double because its address is taken.
 * `sequence` and `request` are not macro parameters; the caller must have
 * variables with exactly those names in scope.
 *
 * Fix: the _work parameter was accepted but never forwarded, leaving the
 * six-pair call one pair short. It is now passed through.
 * NOTE(review): _work is placed before &_result on the assumption that this
 * matches the order in which plasma_pzlange unpacks its arguments - confirm.
 * NOTE(review): the macro does not initialise _result before the call.
 *
 * The expansion keeps its historical trailing ';' so existing call sites
 * behave as before; do not use it as the unbraced body of an if/else.
 */
#define PLASMA_zlange(_norm, _descA, _result, _work)    \
    plasma_parallel_call_6(plasma_pzlange,              \
        PLASMA_enum, (_norm),                           \
        PLASMA_desc, (_descA),                          \
        double*, (_work),                               \
        double*, &(_result),                            \
        PLASMA_sequence*, sequence,                     \
        PLASMA_request*, request);
/*
 * PLASMA_zlacpy(descA, descB)
 *
 * Expands to a plasma_parallel_call_5() launch of plasma_pzlacpy.  The first
 * argument is fixed to PlasmaUpperLower (a PLASMA_enum); the two PLASMA_desc
 * values follow, then `sequence` and `request`.
 *
 * `sequence` (PLASMA_sequence*) and `request` (PLASMA_request*) are NOT macro
 * parameters: they are taken from whatever is in scope where the macro is
 * expanded, so the caller must have variables with exactly those names.
 *
 * NOTE(review): presumably copies the whole of descA into descB; confirm
 * against plasma_pzlacpy.
 */
#define PLASMA_zlacpy(descA, descB)             \
    plasma_parallel_call_5(                     \
        plasma_pzlacpy,                         \
        PLASMA_enum,      PlasmaUpperLower,     \
        PLASMA_desc,      (descA),              \
        PLASMA_desc,      (descB),              \
        PLASMA_sequence*, sequence,             \
        PLASMA_request*,  request)
/*
 * PLASMA_zgeadd(alpha, descA, descB)
 *
 * Expands to a plasma_parallel_call_5() launch of plasma_pzgeadd, handing it
 * a PLASMA_Complex64_t scalar, two PLASMA_desc values, then `sequence` and
 * `request`.
 *
 * `sequence` (PLASMA_sequence*) and `request` (PLASMA_request*) are NOT macro
 * parameters: they are taken from whatever is in scope where the macro is
 * expanded, so the caller must have variables with exactly those names.
 *
 * NOTE(review): presumably computes descB := descB + alpha * descA; confirm
 * against plasma_pzgeadd.
 */
#define PLASMA_zgeadd(alpha, descA, descB)          \
    plasma_parallel_call_5(                         \
        plasma_pzgeadd,                             \
        PLASMA_Complex64_t, (alpha),                \
        PLASMA_desc,        (descA),                \
        PLASMA_desc,        (descB),                \
        PLASMA_sequence*,   sequence,               \
        PLASMA_request*,    request)
172 int NB, NBNB, MT, NT, NTRHS;
183 if (plasma == NULL) {
189 plasma_error(
"PLASMA_zcgels",
"only PlasmaNoTrans supported");
204 if (LDA <
max(1, M)) {
208 if (LDB <
max(1,
max(M, N))) {
212 if (LDX <
max(1,
max(M, N))) {
217 if (
min(M,
min(N, NRHS)) == 0) {
218 for (i = 0; i <
max(M, N); i++)
219 for (j = 0; j < NRHS; j++)
234 NT = (N%NB==0) ? (N/NB) : (N/NB+1);
235 MT = (M%NB==0) ? (M/NB) : (M/NB+1);
236 NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
237 printf(
"M %d, N %d, NRHS %d, NB %d, MT %d, NT %d, NTRHS %d\n", M, N, NRHS, NB, MT, NT, NTRHS);
250 M, NRHS, 0, 0, M, NRHS);
255 M, NRHS, 0, 0, M, NRHS);
262 N, NRHS, 0, 0, N, NRHS);
267 N, NRHS, 0, 0, N, NRHS);
273 plasma_error(
"PLASMA_zcgels",
"plasma_shared_alloc() failed");
399 if (plasma == NULL) {
408 status = sequence->
status;
445 int M, N, NRHS, NB, NBNB, MT, NT, NTRHS;
453 const int itermax = 30;
454 const double bwdmax = 1.0;
458 double Anorm, cte, eps, Rnorm, Xnorm;
462 if (plasma == NULL) {
466 if (sequence == NULL) {
470 if (request == NULL) {
482 plasma_error(
"PLASMA_zcgels_Tile",
"invalid first descriptor");
486 plasma_error(
"PLASMA_zcgels_Tile",
"invalid second descriptor");
490 plasma_error(
"PLASMA_zcgels_Tile",
"invalid third descriptor");
494 plasma_error(
"PLASMA_zcgels_Tile",
"invalid fourth descriptor");
498 if (descA.
nb != descA.
mb || descB.
nb != descB.
mb || descX.
nb != descX.
mb) {
499 plasma_error(
"PLASMA_zcgels_Tile",
"only square tiles supported");
503 plasma_error(
"PLASMA_zcgels_Tile",
"only PlasmaNoTrans supported");
525 MT = (M%NB==0) ? (M/NB) : (M/NB+1);
526 NT = (N%NB==0) ? (N/NB) : (N/NB+1);
527 NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
528 printf(
"M %d, N %d, NRHS %d, NB %d, MT %d, NT %d, NTRHS %d\n", M, N, NRHS, NB, MT, NT, NTRHS);
532 plasma_error(
"PLASMA_zcgesv",
"plasma_shared_alloc() failed");
540 M, NRHS, 0, 0, M, NRHS);
543 plasma_error(
"PLASMA_zcgesv",
"plasma_shared_alloc() failed");
562 M, NRHS, 0, 0, M, NRHS);
566 plasma_error(
"PLASMA_zcgesv",
"plasma_shared_alloc() failed");
577 eps = LAPACKE_dlamch_work(
'e');
579 printf(
"Anorm=%e, cte=%e\n", Anorm, cte);
593 if (descSA.
m >= descSA.
n) {
596 printf(
"Facto\n"); fflush(stdout);
603 printf(
"Solve\n"); fflush(stdout);
657 printf(
"R = B - Ax\n"); fflush(stdout);
658 printf(
"R = B - Ax ... cpy\n"); fflush(stdout);
660 printf(
"R = B - Ax ... gemm\n"); fflush(stdout);
675 printf(
"Norm of X and R\n"); fflush(stdout);
682 cte = Anorm*eps*((double) N)*bwdmax;
683 if (Rnorm < Xnorm * cte){
694 printf(
"Rnorm=%e, Xnorm * cte=%e, Rnorm=%e, cte=%e\n", Rnorm, Xnorm * cte, Rnorm, cte);
697 for (iiter = 0; iiter < itermax; iiter++){
704 if (descSA.
m >= descSA.
n) {
770 printf(
"Rnorm=%e, Xnorm * cte=%e, Rnorm=%e, cte=%e\n", Rnorm, Xnorm * cte, Rnorm, cte);
772 if (Rnorm < Xnorm * cte){
789 *ITER = -itermax - 1;
797 printf(
"Go back DOUBLE\n");
805 if (descA.
m >= descA.
n) {