PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
pztrsm.c
Go to the documentation of this file.
1 
17 #include "common.h"
18 
19 #define A(m,n) BLKADDR(A, PLASMA_Complex64_t, m, n)
20 #define B(m,n) BLKADDR(B, PLASMA_Complex64_t, m, n)
21 /***************************************************************************/
25 {
30  PLASMA_Complex64_t alpha;
31  PLASMA_desc A;
32  PLASMA_desc B;
33  PLASMA_sequence *sequence;
34  PLASMA_request *request;
35 
36  int k, m, n;
37  int next_k;
38  int next_m;
39  int next_n;
40  int lda, ldb;
41  int tempkm, tempnn, tempmm, tempkn;
42 
45  PLASMA_Complex64_t lalpha;
46  PLASMA_Complex64_t minvalpha;
47 
48  plasma_unpack_args_9(side, uplo, trans, diag, alpha, A, B, sequence, request);
49  minvalpha = mzone / alpha;
50  if (sequence->status != PLASMA_SUCCESS)
51  return;
52  ss_init(B.mt, B.nt, -1);
53  /*
54  * PlasmaLeft
55  */
56  if (side == PlasmaLeft) {
57  k = 0;
58  m = PLASMA_RANK;
59  while (m >= B.mt) {
60  k++;
61  m = m - B.mt + k;
62  }
63  n = 0;
64 
65  while (k < B.mt && m < B.mt) {
66  next_n = n;
67  next_m = m;
68  next_k = k;
69 
70  next_n++;
71  if (next_n >= B.nt) {
72  next_m += PLASMA_SIZE;
73  while (next_m >= B.mt && next_k < B.mt) {
74  next_k++;
75  next_m = next_m - B.mt + next_k;
76  }
77  next_n = 0;
78  }
79 
80  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
81  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
82 
83  lalpha = k == 0 ? alpha : zone;
84  if (m == k) {
85  ss_cond_wait(m, n, k-1);
86  /*
87  * PlasmaLeft / PlasmaLower / PlasmaNoTrans
88  * PlasmaLeft / PlasmaUpper / Plasma[Conj]Trans
89  */
90  if ((uplo == PlasmaLower && trans == PlasmaNoTrans)
91  || (uplo == PlasmaUpper && trans != PlasmaNoTrans)) {
92  tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb;
93  lda = BLKLDD(A, k);
94  ldb = BLKLDD(B, k);
95  CORE_ztrsm(
96  side, uplo, trans, diag,
97  tempkm, tempnn,
98  lalpha, A(k, k), lda,
99  B(k, n), ldb);
100  }
101  /*
102  * PlasmaLeft / PlasmaLower / Plasma[Conj]Trans
103  * PlasmaLeft / PlasmaUpper / PlasmaNoTrans
104  */
105  else {
106  tempkm = k == 0 ? B.m-(B.mt-1)*B.mb : B.mb;
107  lda = BLKLDD(A, B.mt-1-k);
108  ldb = BLKLDD(B, B.mt-1-k);
109  CORE_ztrsm(
110  side, uplo, trans, diag,
111  tempkm, tempnn,
112  lalpha, A(B.mt-1-k, B.mt-1-k), lda,
113  B(B.mt-1-k, n ), ldb);
114  }
115  ss_cond_set(k, n, k);
116  }
117  else {
118  ss_cond_wait(k, n, k);
119  ss_cond_wait(m, n, k-1);
120  /*
121  * PlasmaLeft / PlasmaLower / PlasmaNoTrans
122  */
123  if (uplo == PlasmaLower) {
124  if (trans == PlasmaNoTrans) {
125  lda = BLKLDD(A, m);
126  ldb = BLKLDD(B, m);
127  CORE_zgemm(
129  tempmm, tempnn, B.mb,
130  mzone, A(m, k), lda,
131  B(k, n), B.mb,
132  lalpha, B(m, n), ldb);
133  }
134  /*
135  * PlasmaLeft / PlasmaLower / Plasma[Conj]Trans
136  */
137  else {
138  tempkm = k == 0 ? A.m-(A.mt-1)*A.mb : A.mb;
139  lda = BLKLDD(A, B.mt-1-k);
140  ldb = BLKLDD(B, B.mt-1-k);
141  CORE_zgemm(
142  trans, PlasmaNoTrans,
143  B.mb, tempnn, tempkm,
144  mzone, A(A.mt-1-k, A.mt-1-m), lda,
145  B(B.mt-1-k, n ), ldb,
146  lalpha, B(B.mt-1-m, n ), B.mb);
147  }
148  }
149  else {
150  /*
151  * PlasmaLeft / PlasmaUpper / PlasmaNoTrans
152  */
153  if (trans == PlasmaNoTrans) {
154  tempkm = k == 0 ? A.m-(A.mt-1)*A.mb : A.mb;
155  ldb = BLKLDD(B, B.mt-1-k);
156  CORE_zgemm(
158  B.mb, tempnn, tempkm,
159  mzone, A(A.mt-1-m, A.mt-1-k), A.mb,
160  B(B.mt-1-k, n ), ldb,
161  lalpha, B(B.mt-1-m, n ), B.mb);
162  }
163  /*
164  * PlasmaLeft / PlasmaUpper / Plasma[Conj]Trans
165  */
166  else {
167  ldb = BLKLDD(B, m);
168  CORE_zgemm(
169  trans, PlasmaNoTrans,
170  tempmm, tempnn, B.mb,
171  mzone, A(k, m), A.mb,
172  B(k, n), B.mb,
173  lalpha, B(m, n), ldb);
174  }
175  }
176  ss_cond_set(m, n, k);
177  }
178  n = next_n;
179  m = next_m;
180  k = next_k;
181  }
182  }
183  /*
184  * PlasmaRight
185  */
186  else {
187  k = 0;
188  n = PLASMA_RANK;
189  while (n >= B.nt) {
190  k++;
191  n = n - B.nt + k;
192  }
193  m = 0;
194 
195  while (k < B.nt && n < B.nt) {
196  next_n = n;
197  next_m = m;
198  next_k = k;
199 
200  next_m++;
201  if (next_m >= B.mt) {
202  next_n += PLASMA_SIZE;
203  while (next_n >= B.nt && next_k < B.nt) {
204  next_k++;
205  next_n = next_n - B.nt + next_k;
206  }
207  next_m = 0;
208  }
209 
210  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
211  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
212 
213  lalpha = k == 0 ? alpha : zone;
214  if (n == k) {
215  ss_cond_wait(m, n, k-1);
216  /*
217  * PlasmaRight / PlasmaLower / PlasmaNoTrans
218  */
219  if (uplo == PlasmaLower) {
220  if (trans == PlasmaNoTrans) {
221  tempkn = k == 0 ? B.n-(B.nt-1)*B.nb : B.nb;
222  lda = BLKLDD(A, B.nt-1-k);
223  ldb = BLKLDD(B, m);
224  CORE_ztrsm(
225  side, uplo, trans, diag,
226  tempmm, tempkn,
227  lalpha, A(B.nt-1-k, B.nt-1-k), lda,
228  B(m, B.nt-1-k), ldb);
229  }
230  /*
231  * PlasmaRight / PlasmaLower / Plasma[Conj]Trans
232  */
233  else {
234  tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb;
235  lda = BLKLDD(A, k);
236  ldb = BLKLDD(B, m);
237  CORE_ztrsm(
238  side, uplo, trans, diag,
239  tempmm, tempkn,
240  alpha, A(k, k), lda,
241  B(m, k), ldb);
242  }
243  }
244  else {
245  /*
246  * PlasmaRight / PlasmaUpper / PlasmaNoTrans
247  */
248  if (trans == PlasmaNoTrans) {
249  tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb;
250  lda = BLKLDD(A, k);
251  ldb = BLKLDD(B, m);
252  CORE_ztrsm(
253  side, uplo, trans, diag,
254  tempmm, tempkn,
255  lalpha, A(k, k), lda,
256  B(m, k), ldb);
257  }
258  /*
259  * PlasmaRight / PlasmaUpper / Plasma[Conj]Trans
260  */
261  else {
262  tempkn = k == 0 ? B.n-(B.nt-1)*B.nb : B.nb;
263  lda = BLKLDD(A, B.nt-1-k);
264  ldb = BLKLDD(B, m);
265  CORE_ztrsm(
266  side, uplo, trans, diag,
267  tempmm, tempkn,
268  alpha, A(B.nt-1-k, B.nt-1-k), lda,
269  B(m, B.nt-1-k), ldb);
270  }
271  }
272  ss_cond_set(m, k, k);
273  }
274  else {
275  ss_cond_wait(m, k, k);
276  ss_cond_wait(m, n, k-1);
277  /*
278  * PlasmaRight / PlasmaLower / PlasmaNoTrans
279  */
280  if (uplo == PlasmaLower) {
281  if (trans == PlasmaNoTrans) {
282  tempkn = k == 0 ? B.n-(B.nt-1)*B.nb : B.nb;
283  lda = BLKLDD(A, B.nt-1-k);
284  ldb = BLKLDD(B, m);
285  CORE_zgemm(
287  tempmm, B.mb, tempkn,
288  mzone, B(m, B.nt-1-k), ldb,
289  A(B.nt-1-k, B.nt-1-n), lda,
290  lalpha, B(m, B.nt-1-n), ldb);
291  }
292  /*
293  * PlasmaRight / PlasmaLower / Plasma[Conj]Trans
294  */
295  else {
296  lda = BLKLDD(A, n);
297  ldb = BLKLDD(B, m);
298  CORE_zgemm(
299  PlasmaNoTrans, trans,
300  tempmm, tempnn, B.mb,
301  minvalpha, B(m, k), ldb,
302  A(n, k), lda,
303  zone, B(m, n), ldb);
304  }
305  }
306  else {
307  /*
308  * PlasmaRight / PlasmaUpper / PlasmaNoTrans
309  */
310  if (trans == PlasmaNoTrans) {
311  lda = BLKLDD(A, k);
312  ldb = BLKLDD(B, m);
313  CORE_zgemm(
315  tempmm, tempnn, B.mb,
316  mzone, B(m, k), ldb,
317  A(k, n), lda,
318  lalpha, B(m, n), ldb);
319  }
320  /*
321  * PlasmaRight / PlasmaUpper / Plasma[Conj]Trans
322  */
323  else {
324  tempkn = k == 0 ? B.n-(B.nt-1)*B.nb : B.nb;
325  ldb = BLKLDD(B, m);
326  CORE_zgemm(
327  PlasmaNoTrans, trans,
328  tempmm, B.nb, tempkn,
329  minvalpha, B(m, B.nt-1-k), ldb,
330  A(B.nt-1-n, B.nt-1-k), A.mb,
331  zone, B(m, B.nt-1-n), ldb);
332  }
333  }
334  ss_cond_set(m, n, k);
335  }
336  n = next_n;
337  m = next_m;
338  k = next_k;
339  }
340  }
341  ss_finalize();
342 }
343 
344 /***************************************************************************/
349  PLASMA_sequence *sequence, PLASMA_request *request)
350 {
353 
354  int k, m, n;
355  int lda, ldan, ldb;
356  int tempkm, tempkn, tempmm, tempnn;
357 
360  PLASMA_Complex64_t minvalpha = (PLASMA_Complex64_t)-1.0 / alpha;
361  PLASMA_Complex64_t lalpha;
362 
363  plasma = plasma_context_self();
364  if (sequence->status != PLASMA_SUCCESS)
365  return;
366  QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
367  /*
368  * PlasmaLeft / PlasmaUpper / PlasmaNoTrans
369  */
370  if (side == PlasmaLeft) {
371  if (uplo == PlasmaUpper) {
372  if (trans == PlasmaNoTrans) {
373  for (k = 0; k < B.mt; k++) {
374  tempkm = k == 0 ? B.m-(B.mt-1)*B.mb : B.mb;
375  lda = BLKLDD(A, B.mt-1-k);
376  ldb = BLKLDD(B, B.mt-1-k);
377  lalpha = k == 0 ? alpha : zone;
378  for (n = 0; n < B.nt; n++) {
379  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
381  plasma->quark, &task_flags,
382  side, uplo, trans, diag,
383  tempkm, tempnn, A.mb,
384  lalpha, A(B.mt-1-k, B.mt-1-k), lda, /* lda * tempkm */
385  B(B.mt-1-k, n), ldb); /* ldb * tempnn */
386  }
387  for (m = k+1; m < B.mt; m++) {
388  for (n = 0; n < B.nt; n++) {
389  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
391  plasma->quark, &task_flags,
393  B.mb, tempnn, tempkm, A.mb,
394  mzone, A(B.mt-1-m, B.mt-1-k), A.mb,
395  B(B.mt-1-k, n ), ldb,
396  lalpha, B(B.mt-1-m, n ), B.mb);
397  }
398  }
399  }
400  }
401  /*
402  * PlasmaLeft / PlasmaUpper / Plasma[Conj]Trans
403  */
404  else {
405  for (k = 0; k < B.mt; k++) {
406  tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb;
407  lda = BLKLDD(A, k);
408  ldb = BLKLDD(B, k);
409  lalpha = k == 0 ? alpha : zone;
410  for (n = 0; n < B.nt; n++) {
411  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
413  plasma->quark, &task_flags,
414  side, uplo, trans, diag,
415  tempkm, tempnn, A.mb,
416  lalpha, A(k, k), lda,
417  B(k, n), ldb);
418  }
419  for (m = k+1; m < B.mt; m++) {
420  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
421  ldb = BLKLDD(B, m);
422  for (n = 0; n < B.nt; n++) {
423  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
425  plasma->quark, &task_flags,
426  trans, PlasmaNoTrans,
427  tempmm, tempnn, B.mb, A.mb,
428  mzone, A(k, m), A.mb,
429  B(k, n), B.mb,
430  lalpha, B(m, n), ldb);
431  }
432  }
433  }
434  }
435  }
436  /*
437  * PlasmaLeft / PlasmaLower / PlasmaNoTrans
438  */
439  else {
440  if (trans == PlasmaNoTrans) {
441  for (k = 0; k < B.mt; k++) {
442  tempkm = k == B.mt-1 ? B.m-k*B.mb : B.mb;
443  lda = BLKLDD(A, k);
444  ldb = BLKLDD(B, k);
445  lalpha = k == 0 ? alpha : zone;
446  for (n = 0; n < B.nt; n++) {
447  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
449  plasma->quark, &task_flags,
450  side, uplo, trans, diag,
451  tempkm, tempnn, A.mb,
452  lalpha, A(k, k), lda,
453  B(k, n), ldb);
454  }
455  for (m = k+1; m < B.mt; m++) {
456  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
457  lda = BLKLDD(A, m);
458  ldb = BLKLDD(B, m);
459  for (n = 0; n < B.nt; n++) {
460  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
462  plasma->quark, &task_flags,
464  tempmm, tempnn, B.mb, A.mb,
465  mzone, A(m, k), lda,
466  B(k, n), B.mb,
467  lalpha, B(m, n), ldb);
468  }
469  }
470  }
471  }
472  /*
473  * PlasmaLeft / PlasmaLower / Plasma[Conj]Trans
474  */
475  else {
476  for (k = 0; k < B.mt; k++) {
477  tempkm = k == 0 ? B.m-(B.mt-1)*B.mb : B.mb;
478  lda = BLKLDD(A, B.mt-1-k);
479  ldb = BLKLDD(B, B.mt-1-k);
480  lalpha = k == 0 ? alpha : zone;
481  for (n = 0; n < B.nt; n++) {
482  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
484  plasma->quark, &task_flags,
485  side, uplo, trans, diag,
486  tempkm, tempnn, A.mb,
487  lalpha, A(B.mt-1-k, B.mt-1-k), lda,
488  B(B.mt-1-k, n), ldb);
489  }
490  for (m = k+1; m < B.mt; m++) {
491  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
492  for (n = 0; n < B.nt; n++) {
493  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
495  plasma->quark, &task_flags,
496  trans, PlasmaNoTrans,
497  B.mb, tempnn, tempkm, A.mb,
498  mzone, A(B.mt-1-k, B.mt-1-m), lda,
499  B(B.mt-1-k, n ), ldb,
500  lalpha, B(B.mt-1-m, n ), B.mb);
501  }
502  }
503  }
504  }
505  }
506  }
507  /*
508  * PlasmaRight / PlasmaUpper / PlasmaNoTrans
509  */
510  else {
511  if (uplo == PlasmaUpper) {
512  if (trans == PlasmaNoTrans) {
513  for (k = 0; k < B.nt; k++) {
514  tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb;
515  lda = BLKLDD(A, k);
516  lalpha = k == 0 ? alpha : zone;
517  for (m = 0; m < B.mt; m++) {
518  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
519  ldb = BLKLDD(B, m);
521  plasma->quark, &task_flags,
522  side, uplo, trans, diag,
523  tempmm, tempkn, A.mb,
524  lalpha, A(k, k), lda, /* lda * tempkn */
525  B(m, k), ldb); /* ldb * tempkn */
526  }
527  for (m = 0; m < B.mt; m++) {
528  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
529  ldb = BLKLDD(B, m);
530  for (n = k+1; n < B.nt; n++) {
531  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
533  plasma->quark, &task_flags,
535  tempmm, tempnn, B.mb, A.mb,
536  mzone, B(m, k), ldb, /* ldb * B.mb */
537  A(k, n), lda, /* lda * tempnn */
538  lalpha, B(m, n), ldb); /* ldb * tempnn */
539  }
540  }
541  }
542  }
543  /*
544  * PlasmaRight / PlasmaUpper / Plasma[Conj]Trans
545  */
546  else {
547  for (k = 0; k < B.nt; k++) {
548  tempkn = k == 0 ? B.n-(B.nt-1)*B.nb : B.nb;
549  lda = BLKLDD(A, B.nt-1-k);
550  for (m = 0; m < B.mt; m++) {
551  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
552  ldb = BLKLDD(B, m);
554  plasma->quark, &task_flags,
555  side, uplo, trans, diag,
556  tempmm, tempkn, A.mb,
557  alpha, A(B.nt-1-k, B.nt-1-k), lda, /* lda * tempkn */
558  B( m, B.nt-1-k), ldb); /* ldb * tempkn */
559 
560  for (n = k+1; n < B.nt; n++) {
562  plasma->quark, &task_flags,
563  PlasmaNoTrans, trans,
564  tempmm, B.nb, tempkn, A.mb,
565  minvalpha, B(m, B.nt-1-k), ldb, /* ldb * tempkn */
566  A(B.nt-1-n, B.nt-1-k), A.mb, /* A.mb * tempkn (Never last row) */
567  zone, B(m, B.nt-1-n), ldb); /* ldb * B.nb */
568  }
569  }
570  }
571  }
572  }
573  /*
574  * PlasmaRight / PlasmaLower / PlasmaNoTrans
575  */
576  else {
577  if (trans == PlasmaNoTrans) {
578  for (k = 0; k < B.nt; k++) {
579  tempkn = k == 0 ? B.n-(B.nt-1)*B.nb : B.nb;
580  lda = BLKLDD(A, B.nt-1-k);
581  lalpha = k == 0 ? alpha : zone;
582  for (m = 0; m < B.mt; m++) {
583  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
584  ldb = BLKLDD(B, m);
586  plasma->quark, &task_flags,
587  side, uplo, trans, diag,
588  tempmm, tempkn, A.mb,
589  lalpha, A(B.nt-1-k, B.nt-1-k), lda, /* lda * tempkn */
590  B( m, B.nt-1-k), ldb); /* ldb * tempkn */
591 
592  for (n = k+1; n < B.nt; n++) {
594  plasma->quark, &task_flags,
596  tempmm, B.nb, tempkn, A.mb,
597  mzone, B(m, B.nt-1-k), ldb, /* ldb * tempkn */
598  A(B.nt-1-k, B.nt-1-n), lda, /* lda * B.nb */
599  lalpha, B(m, B.nt-1-n), ldb); /* ldb * B.nb */
600  }
601  }
602  }
603  }
604  /*
605  * PlasmaRight / PlasmaLower / Plasma[Conj]Trans
606  */
607  else {
608  for (k = 0; k < B.nt; k++) {
609  tempkn = k == B.nt-1 ? B.n-k*B.nb : B.nb;
610  lda = BLKLDD(A, k);
611  for (m = 0; m < B.mt; m++) {
612  tempmm = m == B.mt-1 ? B.m-m*B.mb : B.mb;
613  ldb = BLKLDD(B, m);
615  plasma->quark, &task_flags,
616  side, uplo, trans, diag,
617  tempmm, tempkn, A.mb,
618  alpha, A(k, k), lda, /* lda * tempkn */
619  B(m, k), ldb); /* ldb * tempkn */
620 
621  for (n = k+1; n < B.nt; n++) {
622  tempnn = n == B.nt-1 ? B.n-n*B.nb : B.nb;
623  ldan = BLKLDD(A, n);
625  plasma->quark, &task_flags,
626  PlasmaNoTrans, trans,
627  tempmm, tempnn, B.mb, A.mb,
628  minvalpha, B(m, k), ldb, /* ldb * tempkn */
629  A(n, k), ldan, /* ldan * tempkn */
630  zone, B(m, n), ldb); /* ldb * tempnn */
631  }
632  }
633  }
634  }
635  }
636  }
637 }