PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
psgetrf_reclap.c
Go to the documentation of this file.
1 
19 #include "common.h"
20 
21 void CORE_sgetrf_reclap_init(void);
22 
23 #define PARALLEL_KERNEL
24 #define LAPACK_LAYOUT
25 #ifdef LAPACK_LAYOUT
26 #undef BLKLDD
27 #define BLKLDD(A, k) (A).lm
28 #define A(m,n) (&((float*)(A.mat))[(int64_t)(A.lm)*(int64_t)(A.nb)*(int64_t)(n)+(int64_t)(A.mb)*(int64_t)(m)])
29 #else
30 #define A(m,n) BLKADDR(A, float, m, n)
31 #endif
32 
33 #define IPIV(k) &(IPIV[(int64_t)A.mb*(int64_t)(k)])
34 
35 #define plasma_psgetrf_reclap_rl_quark plasma_psgetrf_reclap_quark
36 
37 /***************************************************************************/
41 {
42  int k, m, n, minmnt;
44  int tempkm, tempkn, tempmm, tempnn;
45  int tempm, tempk;
46  int ldak, ldam;
49 
50  float zone = (float)1.0;
51  float mzone = (float)-1.0;
52 
53  void * fakedep;
54  /* How many threads per panel? Probably needs to be adjusted during factorization. */
55  int panel_thread_count;
56 
57  plasma = plasma_context_self();
58  if (sequence->status != PLASMA_SUCCESS)
59  return;
60  QUARK_Task_Flag_Set(&task_flagsP, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
61  QUARK_Task_Flag_Set(&task_flagsU, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
62 
63  /* We start at PLASMA_SIZE-1, to keep the first thread adding tasks to the queue */
64  /* kernel doesn't accept more than 48 cores */
65  panel_thread_count = min(PLASMA_SIZE-1, 48);
66 
67  QUARK_Task_Flag_Set(&task_flagsP, TASK_THREAD_COUNT, panel_thread_count );
68 
70 
71  minmnt = min(A.mt, A.nt);
72  for (k = 0; k < minmnt; k++)
73  {
74  tempk = k * A.mb;
75  tempm = A.m - k * A.mb;
76  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
77  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
78  ldak = BLKLDD(A, k);
79 
81 #ifdef PARALLEL_KERNEL
82  while ( (panel_thread_count * 4 * A.mb) > tempm ) {
83  panel_thread_count--;
84  QUARK_Task_Flag_Set(&task_flagsP, TASK_THREAD_COUNT, panel_thread_count );
85  }
86 
87  if ( panel_thread_count > 1 ) {
89 
91  plasma->quark, &task_flagsP,
92  tempm, tempkn, A.nb,
93  A(k, k), ldak, IPIV(k),
94  sequence, request, 1, tempk,
95  panel_thread_count );
96  }
97  else {
99  plasma->quark, &task_flagsU,
100  tempm, tempkn, A.mb,
101  A(k, k), ldak, IPIV(k),
102  sequence, request, 1, tempk );
103  }
104 #else
106  plasma->quark, &task_flagsU,
107  tempm, tempkn, A.mb,
108  A(k, k), ldak, IPIV(k),
109  sequence, request, 1, tempk );
110 #endif
111 
112  fakedep = (void *)(intptr_t)(k+1);
113  for (n = k+1; n < A.nt; n++)
114  {
115 
117  /*
118  * Apply row interchange after the panel (work on the panel)
119  */
120  tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
122  plasma->quark, &task_flagsU,
123  tempnn, A(k, n), A.lm, 1, tempkm, IPIV(k), 1);
124 
126  plasma->quark, &task_flagsU,
128  tempkm, tempnn, A.mb,
129  zone, A(k, k), ldak,
130  A(k, n), ldak);
131 
132  m = k+1;
133  if ( m < A.mt ) {
134  tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
135  ldam = BLKLDD(A, m);
136 
138  plasma->quark, &task_flagsU,
140  tempmm, tempnn, A.nb, A.mb,
141  mzone, A(m, k), ldam,
142  A(k, n), ldak,
143  zone, A(m, n), ldam);
144 
145  for (m = k+2; m < A.mt; m++)
146  {
147  tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
148  ldam = BLKLDD(A, m);
149 
151  plasma->quark, &task_flagsU,
153  tempmm, tempnn, A.nb, A.mb,
154  mzone, A(m, k), ldam,
155  A(k, n), ldak,
156  zone, A(m, n), ldam,
157  /* Dependency on next swapa (gemm needs to be done before) */
158  A(k+1, n), A.mb*A.nb, INOUT | GATHERV,
159  /* Dependency on next swapb (gemm needs to use panel k before it has to be swapped) */
160  fakedep, 1, INPUT );
161  }
162  }
163  }
164  }
165 
166  for (k = 0; k < minmnt; k++)
167  {
168  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
169  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
170  tempk = min(tempkn, tempkm);
171  ldak = BLKLDD(A, k);
172 
173  /*
174  * Apply row interchange behind the panel (work on the panel)
175  */
177  fakedep = (void*)(intptr_t)k;
178  for (n = 0; n < k; n++)
179  {
181  plasma->quark, &task_flagsU,
182  A.nb, A(k, n), A.lm, 1, tempk, IPIV(k), 1,
183  /* Dependency on previous swapb */
184  A(k-1,n), A.lm*A.nb, INPUT,
185  /* Dependency on all GEMM from previous step */
186  fakedep, 1, INOUT | GATHERV );
187  }
188  }
189 }
190 
191 /***************************************************************************/
195 {
196  int k, m, n;
198  int tempkm, tempkn, tempmm, tempnn;
199  int tempm;
200  int ldak, ldam;
203 
204  float zone = (float)1.0;
205  float mzone = (float)-1.0;
206 
207  void * fakedep;
208  int panel_thread_count; /* How many threads per panel? Probably needs to be adjusted during factorization. */
209 
210  plasma = plasma_context_self();
211  if (sequence->status != PLASMA_SUCCESS)
212  return;
213  QUARK_Task_Flag_Set(&task_flagsP, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
214  QUARK_Task_Flag_Set(&task_flagsU, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
215 
216  /* We start at PLASMA_SIZE-1, to keep the first thread adding tasks to the queue */
217  panel_thread_count = min(PLASMA_SIZE-1, 48); /* kernel doesn't accept more than 48 cores */
218  QUARK_Task_Flag_Set(&task_flagsP, TASK_THREAD_COUNT, panel_thread_count );
219 
221 
222  fakedep = (void*)(intptr_t)1;
223  for (n = 0; n < A.nt; n++)
224  {
225  tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
226 
229 
230  for (k = 0; k < min(A.mt, n); k++)
231  {
232  tempm = A.m - k * A.mb;
233  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
234  ldak = BLKLDD(A, k);
235 
237  plasma->quark, &task_flagsU,
238  tempnn, A(k, n), A.lm, 1, tempkm, IPIV(k), 1);
239 
241  plasma->quark, &task_flagsU,
243  tempkm, tempnn, A.mb,
244  zone, A(k, k), ldak,
245  A(k, n), ldak);
246 
247  if (k < A.mt-1) {
248  m = k+1;
249  tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
250  ldam = BLKLDD(A, m);
251 
253  plasma->quark, &task_flagsU,
255  tempmm, tempnn, A.nb, A.mb,
256  mzone, A(m, k), ldam,
257  A(k, n), ldak,
258  zone, A(m, n), ldam);
259 
260  fakedep = (void*)(intptr_t)k;
261  for (m = k+2; m < A.mt; m++)
262  {
263  tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
264  ldam = BLKLDD(A, m);
265 
267  plasma->quark, &task_flagsU,
269  tempmm, tempnn, A.nb, A.mb,
270  mzone, A(m, k), ldam,
271  A(k, n), ldak,
272  zone, A(m, n), ldam,
273  /* Dependency on next swapa or getrf (gemm needs to be done before) */
274  A(k+1, n), A.mb*A.nb, INOUT | GATHERV,
275  /* Dependency on next swapb (gemm needs to use panel k before it has to be swapped) */
276  fakedep, 1, INPUT );
277  }
278  }
279  }
280 
281  k = n;
282  if ( n < A.mt ) {
283  tempm = A.m - k * A.mb;
284  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
285  ldak = BLKLDD(A, k);
286 
287 #ifdef PARALLEL_KERNEL
288  while ( (panel_thread_count * 4 * A.mb + 1) > tempm ) {
289  panel_thread_count = panel_thread_count >> 1;
290  QUARK_Task_Flag_Set(&task_flagsP, TASK_THREAD_COUNT, panel_thread_count );
291  }
292 
293  if ( panel_thread_count > 1 ) {
295  plasma->quark, &task_flagsP,
296  tempm, tempkn, A.mb,
297  A(k, k), ldak, IPIV(k),
298  sequence, request, 1, A.mb*k,
299  panel_thread_count );
300  } else {
302  plasma->quark, &task_flagsU,
303  tempm, tempkn, A.mb,
304  A(k, k), ldak, IPIV(k),
305  sequence, request, 1, A.mb*k );
306  }
307 #else
309  plasma->quark, &task_flagsU,
310  tempm, tempkn, A.mb,
311  A(k, k), ldak, IPIV(k),
312  sequence, request, 1, A.mb*k );
313 #endif
314  }
315  }
316 
317 
319  for (k = 0; k < min(A.mt, A.nt); k++)
320  {
321  tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
322  ldak = BLKLDD(A, k);
323 
324  fakedep = (void*)(intptr_t)k;
325  for (n = 0; n < k; n++)
326  {
327  /*
328  * Apply row interchange behind the panel (work on the panel)
329  */
331  plasma->quark, &task_flagsU,
332  A.nb, A(k, n), ldak, 1, tempkm, IPIV(k), 1,
333  /* Dependency on previous swapb */
334  A(k-1, n), A.lm*A.nb, INPUT,
335  /* Dependency on all GEMM from previous step */
336  fakedep, 1, INOUT | GATHERV );
337  }
338  }
339 }