PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
pzher2k.c
Go to the documentation of this file.
1 
15 #include "common.h"
16 
/*
 * Tile accessors: A(m,n), B(m,n) and C(m,n) expand to BLKADDR on the
 * descriptor of the same name with element type PLASMA_Complex64_t.
 * NOTE(review): BLKADDR is defined elsewhere; presumably it yields the
 * address of tile (m, n) of the descriptor -- confirm against common.h.
 */
17 #define A(m,n) BLKADDR(A, PLASMA_Complex64_t, m, n)
18 #define B(m,n) BLKADDR(B, PLASMA_Complex64_t, m, n)
19 #define C(m,n) BLKADDR(C, PLASMA_Complex64_t, m, n)
20 /***************************************************************************/
24 {
27  PLASMA_Complex64_t alpha;
28  PLASMA_desc A;
29  PLASMA_desc B;
30  double beta;
31  PLASMA_desc C;
32  PLASMA_sequence *sequence;
33  PLASMA_request *request;
34 
35  int m, n, k;
36  int next_m;
37  int next_n;
38  int ldam, ldan, ldak;
39  int ldbm, ldbn, ldbk;
40  int ldcm, ldcn;
41  int tempkn, tempkm, tempmm, tempnn;
42 
44  PLASMA_Complex64_t zbeta;
45  double dbeta;
46 
47  plasma_unpack_args_9(uplo, trans, alpha, A, B, beta, C, sequence, request);
48  if (sequence->status != PLASMA_SUCCESS)
49  return;
50 
51  n = 0;
52  m = PLASMA_RANK;
53  while (m >= C.mt && n < C.nt) {
54  n++;
55  m = m-C.mt+n;
56  }
57 
58  while (n < C.nt) {
59  next_n = n;
60  next_m = m + PLASMA_SIZE;
61  while (next_m >= C.mt && next_n < C.nt) {
62  next_n++;
63  next_m = next_m - C.mt + next_n;
64  }
65 
66  tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
67  tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;
68 
69  ldcn = BLKLDD(C, n);
70  ldcm = BLKLDD(C, m);
71 
72  if (m == n) {
73  /*
74  * PlasmaNoTrans
75  */
76  if (trans == PlasmaNoTrans) {
77  ldam = BLKLDD(A, m);
78  ldbm = BLKLDD(B, m);
79  for (k = 0; k < A.nt; k++) {
80  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
81  dbeta = k == 0 ? beta : 1.0;
83  uplo, trans,
84  tempnn, tempkn,
85  alpha, A(m, k), ldam,
86  B(m, k), ldbm,
87  dbeta, C(m, m), ldcm);
88  }
89  }
90  /*
91  * Plasma[Conj]Trans
92  */
93  else {
94  for (k = 0; k < A.mt; k++) {
95  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
96  ldak = BLKLDD(A, k);
97  ldbk = BLKLDD(B, k);
98  dbeta = k == 0 ? beta : 1.0;
100  uplo, trans,
101  tempnn, tempkm,
102  alpha, A(k, m), ldak,
103  B(k, m), ldbk,
104  dbeta, C(m, m), ldcm);
105  }
106  }
107  }
108  else {
109  if (trans == PlasmaNoTrans) {
110  ldam = BLKLDD(A, m);
111  ldan = BLKLDD(A, n);
112  ldbm = BLKLDD(B, m);
113  ldbn = BLKLDD(B, n);
114  /*
115  * PlasmaNoTrans / PlasmaLower
116  */
117  if (uplo == PlasmaLower) {
118  for (k = 0; k < A.nt; k++) {
119  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
120  zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone;
121  CORE_zgemm(
122  trans, PlasmaConjTrans,
123  tempmm, tempnn, tempkn,
124  alpha, A(m, k), ldam,
125  B(n, k), ldbn,
126  zbeta, C(m, n), ldcm);
127 
128  CORE_zgemm(
129  trans, PlasmaConjTrans,
130  tempmm, tempnn, tempkn,
131  alpha, B(m, k), ldbm,
132  A(n, k), ldan,
133  zone, C(m, n), ldcm);
134  }
135  }
136  /*
137  * PlasmaNoTrans / PlasmaUpper
138  */
139  else {
140  for (k = 0; k < A.nt; k++) {
141  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
142  zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone;
143  CORE_zgemm(
144  trans, PlasmaConjTrans,
145  tempnn, tempmm, tempkn,
146  alpha, A(n, k), ldan,
147  B(m, k), ldbm,
148  zbeta, C(n, m), ldcn);
149 
150  CORE_zgemm(
151  trans, PlasmaConjTrans,
152  tempnn, tempmm, tempkn,
153  alpha, B(n, k), ldbn,
154  A(m, k), ldam,
155  zone, C(n, m), ldcn);
156  }
157  }
158  }
159  else {
160  /*
161  * Plasma[Conj]Trans / PlasmaLower
162  */
163  if (uplo == PlasmaLower) {
164  for (k = 0; k < A.mt; k++) {
165  ldak = BLKLDD(A, k);
166  ldbk = BLKLDD(B, k);
167  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
168  zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone;
169  CORE_zgemm(
170  trans, PlasmaNoTrans,
171  tempmm, tempnn, tempkm,
172  alpha, A(k, m), ldak,
173  B(k, n), ldbk,
174  zbeta, C(m, n), ldcm);
175 
176  CORE_zgemm(
177  trans, PlasmaNoTrans,
178  tempmm, tempnn, tempkm,
179  alpha, B(k, m), ldbk,
180  A(k, n), ldak,
181  zone, C(m, n), ldcm);
182  }
183  }
184  /*
185  * Plasma[Conj]Trans / PlasmaUpper
186  */
187  else {
188  for (k = 0; k < A.mt; k++) {
189  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
190  ldak = BLKLDD(A, k);
191  ldbk = BLKLDD(B, k);
192  zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone;
193  CORE_zgemm(
194  trans, PlasmaNoTrans,
195  tempnn, tempmm, tempkm,
196  alpha, A(k, n), ldak,
197  B(k, m), ldbk,
198  zbeta, C(n, m), ldcm);
199 
200  CORE_zgemm(
201  trans, PlasmaNoTrans,
202  tempnn, tempmm, tempkm,
203  alpha, B(k, n), ldbk,
204  A(k, m), ldak,
205  zone, C(n, m), ldcn);
206  }
207  }
208  }
209  }
210  m = next_m;
211  n = next_n;
212  }
213 }
214 
215 /***************************************************************************/
220  double beta, PLASMA_desc C,
221  PLASMA_sequence *sequence, PLASMA_request *request)
222 {
225 
226  int m, n, k;
227  int ldak, ldam, ldan, ldcm, ldcn;
228  int ldbk, ldbm, ldbn;
229  int tempnn, tempmm, tempkn, tempkm;
230 
232  PLASMA_Complex64_t zbeta;
233  double dbeta;
234 
235  plasma = plasma_context_self();
236  if (sequence->status != PLASMA_SUCCESS)
237  return;
238  QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
239 
240  for (n = 0; n < C.nt; n++) {
241  tempnn = n == C.nt-1 ? C.n-n*C.nb : C.nb;
242  ldan = BLKLDD(A, n);
243  ldbn = BLKLDD(B, n);
244  ldcn = BLKLDD(C, n);
245  /*
246  * PlasmaNoTrans
247  */
248  if (trans == PlasmaNoTrans) {
249  for (k = 0; k < A.nt; k++) {
250  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
251  dbeta = k == 0 ? beta : 1.0;
253  plasma->quark, &task_flags,
254  uplo, trans,
255  tempnn, tempkn, A.mb,
256  alpha, A(n, k), ldan, /* ldan * K */
257  B(n, k), ldbn,
258  dbeta, C(n, n), ldcn); /* ldc * N */
259  }
260  /*
261  * PlasmaNoTrans / PlasmaLower
262  */
263  if (uplo == PlasmaLower) {
264  for (m = n+1; m < C.mt; m++) {
265  tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
266  ldam = BLKLDD(A, m);
267  ldbm = BLKLDD(B, m);
268  ldcm = BLKLDD(C, m);
269  for (k = 0; k < A.nt; k++) {
270  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
271  zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone;
273  plasma->quark, &task_flags,
274  trans, PlasmaConjTrans,
275  tempmm, tempnn, tempkn, A.mb,
276  alpha, A(m, k), ldam, /* ldam * K */
277  B(n, k), ldbn, /* ldan * K */
278  zbeta, C(m, n), ldcm); /* ldc * N */
279 
281  plasma->quark, &task_flags,
282  trans, PlasmaConjTrans,
283  tempmm, tempnn, tempkn, A.mb,
284  alpha, B(m, k), ldbm, /* ldam * K */
285  A(n, k), ldan, /* ldan * K */
286  zone, C(m, n), ldcm); /* ldc * N */
287  }
288  }
289  }
290  /*
291  * PlasmaNoTrans / PlasmaUpper
292  */
293  else {
294  for (m = n+1; m < C.mt; m++) {
295  tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
296  ldam = BLKLDD(A, m);
297  ldbm = BLKLDD(B, m);
298  for (k = 0; k < A.nt; k++) {
299  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
300  zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone;
302  plasma->quark, &task_flags,
303  trans, PlasmaConjTrans,
304  tempnn, tempmm, tempkn, A.mb,
305  alpha, A(n, k), ldan, /* ldan * K */
306  B(m, k), ldbm, /* ldam * M */
307  zbeta, C(n, m), ldcn); /* ldc * M */
308 
310  plasma->quark, &task_flags,
311  trans, PlasmaConjTrans,
312  tempnn, tempmm, tempkn, A.mb,
313  alpha, B(n, k), ldan, /* ldan * K */
314  A(m, k), ldam, /* ldam * M */
315  zone, C(n, m), ldcn); /* ldc * M */
316  }
317  }
318  }
319  }
320  /*
321  * Plasma[Conj]Trans
322  */
323  else {
324  for (k = 0; k < A.mt; k++) {
325  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
326  ldak = BLKLDD(A, k);
327  ldbk = BLKLDD(B, k);
328  dbeta = k == 0 ? beta : 1.0;
330  plasma->quark, &task_flags,
331  uplo, trans,
332  tempnn, tempkm, A.mb,
333  alpha, A(k, n), ldak, /* lda * N */
334  B(k, n), ldbk,
335  dbeta, C(n, n), ldcn); /* ldc * N */
336  }
337  /*
338  * Plasma[Conj]Trans / PlasmaLower
339  */
340  if (uplo == PlasmaLower) {
341  for (m = n+1; m < C.mt; m++) {
342  tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
343  ldcm = BLKLDD(C, m);
344  for (k = 0; k < A.mt; k++) {
345  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
346  ldak = BLKLDD(A, k);
347  ldbk = BLKLDD(B, k);
348  zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone;
350  plasma->quark, &task_flags,
351  trans, PlasmaNoTrans,
352  tempmm, tempnn, tempkm, A.mb,
353  alpha, A(k, m), ldak, /* lda * M */
354  B(k, n), ldbk, /* lda * N */
355  zbeta, C(m, n), ldcm); /* ldc * N */
356 
358  plasma->quark, &task_flags,
359  trans, PlasmaNoTrans,
360  tempmm, tempnn, tempkm, A.mb,
361  alpha, B(k, m), ldbk, /* lda * M */
362  A(k, n), ldak, /* lda * N */
363  zone, C(m, n), ldcm); /* ldc * N */
364  }
365  }
366  }
367  /*
368  * Plasma[Conj]Trans / PlasmaUpper
369  */
370  else {
371  for (m = n+1; m < C.mt; m++) {
372  tempmm = m == C.mt-1 ? C.m-m*C.mb : C.mb;
373  for (k = 0; k < A.mt; k++) {
374  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
375  ldak = BLKLDD(A, k);
376  ldbk = BLKLDD(B, k);
377  zbeta = k == 0 ? (PLASMA_Complex64_t)beta : zone;
379  plasma->quark, &task_flags,
380  trans, PlasmaNoTrans,
381  tempnn, tempmm, tempkm, A.mb,
382  alpha, A(k, n), ldak, /* lda * K */
383  B(k, m), ldbk, /* lda * M */
384  zbeta, C(n, m), ldcn); /* ldc * M */
385 
387  plasma->quark, &task_flags,
388  trans, PlasmaNoTrans,
389  tempnn, tempmm, tempkm, A.mb,
390  alpha, B(k, n), ldbk, /* lda * K */
391  A(k, m), ldak, /* lda * M */
392  zone, C(n, m), ldcn); /* ldc * M */
393  }
394  }
395  }
396  }
397  }
398 }