/* Defined unconditionally here; the `#ifdef PARALLEL_KERNEL` sections further
 * down in this file are therefore compiled in. */
23 #define PARALLEL_KERNEL
/* Leading dimension used for a block: expands to (A).lm. The `k` argument is
 * accepted but not used by this expansion. */
27 #define BLKLDD(A, k) (A).lm
/* Address of a float inside A.mat at element offset
 *     A.lm * A.nb * n  +  A.mb * m
 * with every factor widened to int64_t before multiplying.
 * NOTE(review): presumably the start of block (m, n) in a layout whose leading
 * dimension is A.lm -- confirm against the descriptor definition. */
28 #define A(m,n) (&((float*)(A.mat))[(int64_t)(A.lm)*(int64_t)(A.nb)*(int64_t)(n)+(int64_t)(A.mb)*(int64_t)(m)])
/* Second definition of A(m,n), delegating to BLKADDR (defined elsewhere).
 * NOTE(review): two definitions of the same macro are visible back to back;
 * presumably a preprocessor conditional that is not shown selects one of
 * them -- confirm. */
30 #define A(m,n) BLKADDR(A, float, m, n)
/* Address of entry A.mb * k of the IPIV array (index computed in int64_t).
 * Reads `A` and `IPIV` from the scope where the macro is used. */
33 #define IPIV(k) &(IPIV[(int64_t)A.mb*(int64_t)(k)])
/* Name alias: uses of plasma_psgetrf_reclap_rl_quark expand to
 * plasma_psgetrf_reclap_quark. */
35 #define plasma_psgetrf_reclap_rl_quark plasma_psgetrf_reclap_quark
44 int tempkm, tempkn, tempmm, tempnn;
50 float zone = (float)1.0;
51 float mzone = (float)-1.0;
55 int panel_thread_count;
72 for (k = 0; k < minmnt; k++)
75 tempm = A.
m - k * A.
mb;
76 tempkm = k == A.
mt-1 ? A.
m-k*A.
mb : A.
mb;
77 tempkn = k == A.
nt-1 ? A.
n-k*A.
nb : A.
nb;
81 #ifdef PARALLEL_KERNEL
82 while ( (panel_thread_count * 4 * A.
mb) > tempm ) {
87 if ( panel_thread_count > 1 ) {
91 plasma->
quark, &task_flagsP,
93 A(k, k), ldak,
IPIV(k),
94 sequence, request, 1, tempk,
99 plasma->
quark, &task_flagsU,
101 A(k, k), ldak,
IPIV(k),
102 sequence, request, 1, tempk );
106 plasma->
quark, &task_flagsU,
108 A(k, k), ldak,
IPIV(k),
109 sequence, request, 1, tempk );
112 fakedep = (
void *)(intptr_t)(k+1);
113 for (n = k+1; n < A.
nt; n++)
120 tempnn = n == A.
nt-1 ? A.
n-n*A.
nb : A.
nb;
122 plasma->
quark, &task_flagsU,
123 tempnn, A(k, n), A.
lm, 1, tempkm, IPIV(k), 1);
126 plasma->
quark, &task_flagsU,
128 tempkm, tempnn, A.
mb,
134 tempmm = m == A.
mt-1 ? A.
m-m*A.
mb : A.
mb;
138 plasma->
quark, &task_flagsU,
140 tempmm, tempnn, A.
nb, A.
mb,
141 mzone, A(m, k), ldam,
143 zone,
A(m, n), ldam);
145 for (m = k+2; m < A.
mt; m++)
147 tempmm = m == A.
mt-1 ? A.
m-m*A.
mb : A.
mb;
151 plasma->
quark, &task_flagsU,
153 tempmm, tempnn, A.
nb, A.
mb,
154 mzone, A(m, k), ldam,
166 for (k = 0; k < minmnt; k++)
168 tempkm = k == A.
mt-1 ? A.
m-k*A.
mb : A.
mb;
169 tempkn = k == A.
nt-1 ? A.
n-k*A.
nb : A.
nb;
170 tempk =
min(tempkn, tempkm);
177 fakedep = (
void*)(intptr_t)k;
178 for (n = 0; n < k; n++)
181 plasma->
quark, &task_flagsU,
182 A.
nb, A(k, n), A.
lm, 1, tempk, IPIV(k), 1,
198 int tempkm, tempkn, tempmm, tempnn;
204 float zone = (float)1.0;
205 float mzone = (float)-1.0;
208 int panel_thread_count;
222 fakedep = (
void*)(intptr_t)1;
223 for (n = 0; n < A.
nt; n++)
225 tempnn = n == A.
nt-1 ? A.
n-n*A.
nb : A.
nb;
230 for (k = 0; k <
min(A.
mt, n); k++)
232 tempm = A.
m - k * A.
mb;
233 tempkm = k == A.
mt-1 ? A.
m-k*A.
mb : A.
mb;
237 plasma->
quark, &task_flagsU,
238 tempnn, A(k, n), A.
lm, 1, tempkm, IPIV(k), 1);
241 plasma->
quark, &task_flagsU,
243 tempkm, tempnn, A.
mb,
249 tempmm = m == A.
mt-1 ? A.
m-m*A.
mb : A.
mb;
253 plasma->
quark, &task_flagsU,
255 tempmm, tempnn, A.
nb, A.
mb,
256 mzone, A(m, k), ldam,
258 zone,
A(m, n), ldam);
260 fakedep = (
void*)(intptr_t)k;
261 for (m = k+2; m < A.
mt; m++)
263 tempmm = m == A.
mt-1 ? A.
m-m*A.
mb : A.
mb;
267 plasma->
quark, &task_flagsU,
269 tempmm, tempnn, A.
nb, A.
mb,
270 mzone, A(m, k), ldam,
283 tempm = A.
m - k * A.
mb;
284 tempkn = k == A.
nt-1 ? A.
n-k*A.
nb : A.
nb;
287 #ifdef PARALLEL_KERNEL
288 while ( (panel_thread_count * 4 * A.
mb + 1) > tempm ) {
289 panel_thread_count = panel_thread_count >> 1;
293 if ( panel_thread_count > 1 ) {
295 plasma->
quark, &task_flagsP,
297 A(k, k), ldak,
IPIV(k),
298 sequence, request, 1, A.
mb*k,
299 panel_thread_count );
302 plasma->
quark, &task_flagsU,
304 A(k, k), ldak,
IPIV(k),
305 sequence, request, 1, A.
mb*k );
309 plasma->
quark, &task_flagsU,
311 A(k, k), ldak,
IPIV(k),
312 sequence, request, 1, A.
mb*k );
319 for (k = 0; k <
min(A.
mt, A.
nt); k++)
321 tempkm = k == A.
mt-1 ? A.
m-k*A.
mb : A.
mb;
324 fakedep = (
void*)(intptr_t)k;
325 for (n = 0; n < k; n++)
331 plasma->
quark, &task_flagsU,
332 A.
nb, A(k, n), ldak, 1, tempkm,
IPIV(k), 1,