24 static void CORE_zbarrier_thread(
const int thidx,
const int thcnt);
26 const int thidx,
const int thcnt,
28 const int pividx,
int *ipiv);
32 const int idxStart,
const int idxMax,
const int *piv);
86 psplit(
int n,
int pidx,
int pcnt,
int *poff_p,
int *psiz_p) {
87 int q = n / pcnt, r = n % pcnt;
95 *poff_p = r * (q + 1) + (pidx - r) * q;
99 #define AMAX1BUF_SIZE (48 << 1)
108 for (i = 0; i <
AMAX1BUF_SIZE; ++i) CORE_zamax1buf[i] = -1.0;
109 sfmin = LAPACKE_dlamch_work(
'S');
118 double curamx =
cabs(localamx);
121 for (i = 1; i < thcnt; ++i) {
122 while (CORE_zamax1buf[i << 1] == -1.0) {
128 for (i = 1; i < thcnt; ++i) {
129 tmp = CORE_zamax1buf[ (i << 1) + 1];
130 if (
cabs(tmp) > curamx) {
141 for (i = 1; i < thcnt; ++i)
142 CORE_zamax1buf[ (i << 1) + 1] = curval;
144 CORE_zamax1buf[0] = -j - 2.0;
149 for (i = 1; i < thcnt; ++i)
150 CORE_zamax1buf[i << 1] = -3.0;
153 for (i = 1; i < thcnt; ++i) {
154 while (CORE_zamax1buf[i << 1] != -1.0) {
158 CORE_zamax1buf[0] = -1.0;
160 CORE_zamax1buf[(thidx << 1) + 1] = localamx;
161 CORE_zamax1buf[thidx << 1] = -2.0;
162 while (CORE_zamax1buf[0] == -1.0) {
164 while (CORE_zamax1buf[thidx << 1] != -3.0) {
166 *globalamx = CORE_zamax1buf[(thidx << 1) + 1];
167 *thwinner = -CORE_zamax1buf[0] - 2.0;
168 CORE_zamax1buf[thidx << 1] = -1.0;
170 if (thidx == *thwinner)
173 while (CORE_zamax1buf[0] != -1.0) {
179 static inline void CORE_zgetrf_reclap_update(
const int M,
const int column,
const int n1,
const int n2,
181 const int thidx,
const int thcnt)
187 int coff, ccnt, lm, loff;
189 CORE_zbarrier_thread( thidx, thcnt );
191 psplit( n2, thidx, thcnt, &coff, &ccnt );
194 CORE_zlaswap1( ccnt, Atop2 + coff*LDA, LDA, column, n1 + column, IPIV );
197 n1, ccnt,
CBLAS_SADDR(posone), Atop + column, LDA, Atop2 + coff*LDA + column, LDA );
203 CORE_zbarrier_thread( thidx, thcnt );
205 psplit( M, thidx, thcnt, &loff, &lm );
216 CORE_zgetrf_reclap_rec(
const int M,
const int N,
218 int *IPIV,
int *info,
219 const int thidx,
const int thcnt,
const int column)
221 int jp, n1, n2, lm, loff;
232 CORE_zgetrf_reclap_rec( M, n1, A, LDA, IPIV, info,
233 thidx, thcnt, column );
237 CORE_zgetrf_reclap_update(M, column, n1, n2,
241 CORE_zgetrf_reclap_rec( M, n2, A, LDA, IPIV, info,
242 thidx, thcnt, column + n1 );
246 psplit( n1, thidx, thcnt, &coff, &ccnt );
249 CORE_zlaswap1( ccnt, Atop+coff*LDA, LDA, n1 + column, N + column, IPIV );
255 CORE_zbarrier_thread( thidx, thcnt );
257 psplit( M, thidx, thcnt, &loff, &lm );
267 tmp1 = Atop[loff + jp];
269 CORE_zamax1_thread( tmp1, thidx, thcnt, &thrd,
270 &tmp3, loff + jp + 1, IPIV + column );
275 if (
cabs(tmp3) >= sfmin ) {
277 n1 = (thidx == 0) ? 1 : 0;
282 n1 = (thidx == 0) ? 1 : 0;
283 Atop2 = Atop + loff + n1;
285 for( i=0; i < lm-n1; i++, Atop2++)
286 *Atop2 = *Atop2 / tmp3;
290 if (loff + jp != column)
291 Atop[loff + jp] = tmp2 / tmp3;
299 CORE_zbarrier_thread( thidx, thcnt );
303 #if defined(PLASMA_HAVE_WEAK)
304 #pragma weak CORE_zgetrf_reclap = PCORE_zgetrf_reclap
305 #define CORE_zgetrf_reclap PCORE_zgetrf_reclap
309 int *IPIV,
int *info)
312 int thcnt =
min( info[2], M / N );
313 int minMN =
min(M, N);
323 if( LDA <
max(1, M) ) {
331 if ( (M == 0) || (N == 0) || (thidx >= thcnt) ){
336 CORE_zgetrf_reclap_rec( M, minMN, A, LDA, IPIV, info,
340 CORE_zgetrf_reclap_update(M, 0, minMN, N-minMN,
352 int m,
int n,
int nb,
361 sizeof(
int), &m,
VALUE,
362 sizeof(
int), &n,
VALUE,
364 sizeof(
int), &lda,
VALUE,
365 sizeof(
int)*nb, IPIV,
OUTPUT,
369 sizeof(
int), &iinfo,
VALUE,
370 sizeof(
int), &nbthread,
VALUE,
377 #if defined(PLASMA_HAVE_WEAK)
378 #pragma weak CORE_zgetrf_reclap_quark = PCORE_zgetrf_reclap_quark
379 #define CORE_zgetrf_reclap_quark PCORE_zgetrf_reclap_quark
397 check_info, iinfo, maxthreads );
400 info[2] = maxthreads;
413 CORE_zbarrier_thread(
int thidx,
int thcnt)
418 CORE_zamax1_thread( 1.0, thidx, thcnt, &idum1, &ddum2, 0, &idum2 );
423 const int idxStart,
const int idxMax,
const int *piv)
428 for (j = 0; j < ncol; ++j) {
429 for (i = idxStart; i < idxMax; ++i) {
430 tmp = a[j*lda + piv[i] - 1];
431 a[j*lda + piv[i] - 1] = a[i + j*lda];