PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
pdgetrf_rectil.c
Go to the documentation of this file.
1 
19 #include "common.h"
20 
21 void CORE_dgetrf_rectil_init(void);
22 
23 #define PARALLEL_KERNEL
24 #define A(m,n) BLKADDR(A, double, m, n)
25 #define IPIV(k) &(IPIV[(int64_t)A.mb*(int64_t)(k)])
26 
27 #define plasma_pdgetrf_rectil_rl_quark plasma_pdgetrf_rectil_quark
28 
29 /***************************************************************************/
33 {
34  int k, m, n;
36  int tempk, tempm, tempkm, tempkn, tempmm, tempnn;
37  int ldak, ldam;
40 
41  double zone = (double)1.0;
42  double mzone = (double)-1.0;
43 
44  void * fakedep;
45  /* How many threads per panel? Probably needs to be adjusted during factorization. */
46  int panel_thread_count;
47 
48  plasma = plasma_context_self();
49  if (sequence->status != PLASMA_SUCCESS)
50  return;
51  QUARK_Task_Flag_Set(&task_flagsP, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
52  QUARK_Task_Flag_Set(&task_flagsU, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
53 
54  /* We start at PLASMA_SIZE-1, to keep the first thread adding tasks to the queue */
55  panel_thread_count = min( max(PLASMA_SIZE, 2)-1, 48); /* kernel doesn't accept more than 48 cores */
56 
57  QUARK_Task_Flag_Set(&task_flagsP, TASK_THREAD_COUNT, panel_thread_count );
58 
60 
61  for (k = 0; k < min(A.mt, A.nt); k++)
62  {
63  tempk = k * A.mb;
64  tempm = A.m - tempk;
65  tempkm = k == A.mt-1 ? tempm : A.mb;
66  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
67  ldak = BLKLDD(A, k);
68 
70 
71  while ( ((panel_thread_count * 4 * A.mb) > tempm)
72  && (panel_thread_count > 1) ) {
73  panel_thread_count--;
74  QUARK_Task_Flag_Set(&task_flagsP, TASK_THREAD_COUNT, panel_thread_count );
75  }
76 
78 
80  plasma->quark, &task_flagsP,
81  plasma_desc_submatrix(A, tempk, k*A.nb, tempm, tempkn),
82  A(k, k), A.mb*A.nb, IPIV(k),
83  sequence, request, 1, tempk,
84  panel_thread_count );
85 
86  fakedep = (void *)(intptr_t)(k+1);
87  for (n = k+1; n < A.nt; n++)
88  {
90  /*
91  * Apply row interchange after the panel (work on the panel)
92  */
93  tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
95  plasma->quark, &task_flagsU,
96  plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn),
97  A(k, n), 1, tempkm, IPIV(k), 1,
98  A(k, k), ldak);
99 
100  m = k+1;
101  if ( m < A.mt ) {
102  tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
103  ldam = BLKLDD(A, m);
104 
106  plasma->quark, &task_flagsU,
108  tempmm, tempnn, A.nb, A.mb,
109  mzone, A(m, k), ldam,
110  A(k, n), ldak,
111  zone, A(m, n), ldam);
112 
113  for (m = k+2; m < A.mt; m++)
114  {
115  tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
116  ldam = BLKLDD(A, m);
117 
119  plasma->quark, &task_flagsU,
121  tempmm, tempnn, A.nb, A.mb,
122  mzone, A(m, k), ldam,
123  A(k, n), ldak,
124  zone, A(m, n), ldam,
125  /* Dependency on next swapa (gemm needs to be done before) */
126  A(k+1, n), A.mb*A.nb, INOUT | GATHERV,
127  /* Dependency on next swapb (gemm needs to use panel k before it is swapped) */
128  fakedep, 1, INPUT );
129  }
130  }
131  }
132  }
133 
134  for (k = 0; k < min(A.mt, A.nt); k++)
135  {
136  int mintmp;
137  tempk = k * A.mb;
138  tempm = A.m - tempk;
139  tempkm = k == A.mt-1 ? tempm : A.mb;
140  tempkn = k == A.nt-1 ? A.n - k * A.nb : A.nb;
141  mintmp = min(tempkm, tempkn);
142  ldak = BLKLDD(A, k);
143 
144  /*
145  * Apply row interchange behind the panel (work on the panel)
146  */
148  fakedep = (void*)(intptr_t)k;
149  for (n = 0; n < k; n++)
150  {
151  tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
153  plasma->quark, &task_flagsU,
154  plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn),
155  A(k, n), 1, mintmp, IPIV(k), 1,
156  /* Dependency on previous swapb */
157  A(k-1,n), A.lm*A.nb, INPUT,
158  /* Dependency on all GEMM from previous step */
159  fakedep, 1, INOUT | GATHERV );
160  }
161  }
162 }
163 
164 /***************************************************************************/
168 {
169  int k, m, n;
171  int tempkm, tempkn, tempmm, tempnn;
172  int tempk, tempm;
173  int ldak, ldam;
176 
177  double zone = (double)1.0;
178  double mzone = (double)-1.0;
179 
180  void * fakedep;
181  /* How many threads per panel? Probably needs to be adjusted during factorization. */
182  int panel_thread_count;
183 
184  plasma = plasma_context_self();
185  if (sequence->status != PLASMA_SUCCESS)
186  return;
187  QUARK_Task_Flag_Set(&task_flagsP, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
188  QUARK_Task_Flag_Set(&task_flagsU, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
189 
190  /* We start at PLASMA_SIZE-1, to keep the first thread adding tasks to the queue */
191  panel_thread_count = min( max(PLASMA_SIZE, 2)-1, 48); /* kernel doesn't accept more than 48 cores */
192 
193  QUARK_Task_Flag_Set(&task_flagsP, TASK_THREAD_COUNT, panel_thread_count );
194 
196 
197  fakedep = (void*)(intptr_t)1;
198  for (n = 0; n < A.nt; n++)
199  {
200  tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
201 
204 
205  for (k = 0; k < min(A.mt, n); k++)
206  {
207  tempk = k * A.mb;
208  tempm = A.m - tempk;
209  tempkm = k == A.mt-1 ? tempm : A.mb;
210  ldak = BLKLDD(A, k);
211 
213  plasma->quark, &task_flagsU,
214  plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn),
215  A(k, n), 1, tempkm, IPIV(k), 1,
216  A(k, k), ldak);
217 
218  if (k < A.mt-1) {
219  m = k+1;
220  tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
221  ldam = BLKLDD(A, m);
222 
224  plasma->quark, &task_flagsU,
226  tempmm, tempnn, A.nb, A.mb,
227  mzone, A(m, k), ldam,
228  A(k, n), ldak,
229  zone, A(m, n), ldam);
230 
231  fakedep = (void*)(intptr_t)k;
232  for (m = k+2; m < A.mt; m++)
233  {
234  tempmm = m == A.mt-1 ? A.m-m*A.mb : A.mb;
235  ldam = BLKLDD(A, m);
236 
238  plasma->quark, &task_flagsU,
240  tempmm, tempnn, A.nb, A.mb,
241  mzone, A(m, k), ldam,
242  A(k, n), ldak,
243  zone, A(m, n), ldam,
244  /* Dependency on next swapa or getrf (gemm needs to be done before) */
245  A(k+1, n), A.mb*A.nb, INOUT | GATHERV,
246  /* Dependency on next swapb (gemm needs to use panel k before it is swapped) */
247  fakedep, 1, INPUT );
248  }
249  }
250  }
251 
252  k = n;
253  if ( n < A.mt ) {
254  tempm = A.m - k * A.mb;
255  tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
256  ldak = BLKLDD(A, k);
257 
258  while ( ((panel_thread_count * 4 * A.mb) > tempm)
259  && (panel_thread_count > 1) ) {
260  panel_thread_count--;
261  QUARK_Task_Flag_Set(&task_flagsP, TASK_THREAD_COUNT, panel_thread_count );
262  }
263 
265  plasma->quark, &task_flagsP,
266  plasma_desc_submatrix(A, k*A.mb, k*A.nb, tempm, tempkn), A(k, k), A.mb*A.nb,
267  IPIV(k), sequence, request,
268  1, A.mb*k, panel_thread_count );
269  }
270  }
271 
273  for (k = 0; k < min(A.mt, A.nt); k++)
274  {
275  tempk = k * A.mb;
276  tempm = A.m - tempk;
277  tempkm = k == A.mt-1 ? tempm : A.mb;
278  ldak = BLKLDD(A, k);
279 
280  fakedep = (void*)(intptr_t)k;
281  for (n = 0; n < k; n++)
282  {
283  /*
284  * Apply row interchange behind the panel (work on the panel)
285  */
286  tempnn = n == A.nt-1 ? A.n-n*A.nb : A.nb;
288  plasma->quark, &task_flagsU,
289  plasma_desc_submatrix(A, tempk, n*A.nb, tempm, tempnn),
290  A(k, n), 1, tempkm, IPIV(k), 1,
291  /* Dependency on previous swapb */
292  A(k-1,n), A.lm*A.nb, INPUT,
293  /* Dependency on all GEMM from previous step */
294  fakedep, 1, INOUT | GATHERV );
295  }
296  }
297 }
298