24 static void CORE_dbarrier_thread(
const int thidx,
const int thcnt);
25 static void CORE_damax1_thread(
const double localamx,
26 const int thidx,
const int thcnt,
27 int *thwinner,
double *globalamx,
28 const int pividx,
int *ipiv);
31 static void CORE_dlaswap1(
const int ncol,
double *a,
const int lda,
32 const int idxStart,
const int idxMax,
const int *piv);
86 psplit(
int n,
int pidx,
int pcnt,
int *poff_p,
int *psiz_p) {
87 int q = n / pcnt, r = n % pcnt;
95 *poff_p = r * (q + 1) + (pidx - r) * q;
99 #define AMAX1BUF_SIZE (48 << 1)
108 for (i = 0; i <
AMAX1BUF_SIZE; ++i) CORE_damax1buf[i] = -1.0;
109 sfmin = LAPACKE_dlamch_work(
'S');
113 CORE_damax1_thread(
double localamx,
int thidx,
int thcnt,
int *thwinner,
114 double *globalamx,
int pividx,
int *ipiv) {
117 double curval = localamx, tmp;
118 double curamx = fabs(localamx);
121 for (i = 1; i < thcnt; ++i) {
122 while (CORE_damax1buf[i << 1] == -1.0) {
128 for (i = 1; i < thcnt; ++i) {
129 tmp = CORE_damax1buf[ (i << 1) + 1];
130 if (fabs(tmp) > curamx) {
141 for (i = 1; i < thcnt; ++i)
142 CORE_damax1buf[ (i << 1) + 1] = curval;
144 CORE_damax1buf[0] = -j - 2.0;
149 for (i = 1; i < thcnt; ++i)
150 CORE_damax1buf[i << 1] = -3.0;
153 for (i = 1; i < thcnt; ++i) {
154 while (CORE_damax1buf[i << 1] != -1.0) {
158 CORE_damax1buf[0] = -1.0;
160 CORE_damax1buf[(thidx << 1) + 1] = localamx;
161 CORE_damax1buf[thidx << 1] = -2.0;
162 while (CORE_damax1buf[0] == -1.0) {
164 while (CORE_damax1buf[thidx << 1] != -3.0) {
166 *globalamx = CORE_damax1buf[(thidx << 1) + 1];
167 *thwinner = -CORE_damax1buf[0] - 2.0;
168 CORE_damax1buf[thidx << 1] = -1.0;
170 if (thidx == *thwinner)
173 while (CORE_damax1buf[0] != -1.0) {
179 static inline void CORE_dgetrf_reclap_update(
const int M,
const int column,
const int n1,
const int n2,
180 double *
A,
const int LDA,
int *
IPIV,
181 const int thidx,
const int thcnt)
183 static double posone = 1.0;
184 static double negone = -1.0;
185 double *Atop = A + column*LDA;
186 double *Atop2 = Atop + n1 *LDA;
187 int coff, ccnt, lm, loff;
189 CORE_dbarrier_thread( thidx, thcnt );
191 psplit( n2, thidx, thcnt, &coff, &ccnt );
194 CORE_dlaswap1( ccnt, Atop2 + coff*LDA, LDA, column, n1 + column, IPIV );
197 n1, ccnt, (posone), Atop + column, LDA, Atop2 + coff*LDA + column, LDA );
203 CORE_dbarrier_thread( thidx, thcnt );
205 psplit( M, thidx, thcnt, &loff, &lm );
212 (negone), Atop+loff, LDA, Atop2 + column, LDA, (posone), Atop2+loff, LDA );
216 CORE_dgetrf_reclap_rec(
const int M,
const int N,
217 double *A,
const int LDA,
218 int *IPIV,
int *info,
219 const int thidx,
const int thcnt,
const int column)
221 int jp, n1, n2, lm, loff;
222 double tmp1, tmp2, tmp3;
223 double *Atop = A + column*LDA;
232 CORE_dgetrf_reclap_rec( M, n1, A, LDA, IPIV, info,
233 thidx, thcnt, column );
237 CORE_dgetrf_reclap_update(M, column, n1, n2,
241 CORE_dgetrf_reclap_rec( M, n2, A, LDA, IPIV, info,
242 thidx, thcnt, column + n1 );
246 psplit( n1, thidx, thcnt, &coff, &ccnt );
249 CORE_dlaswap1( ccnt, Atop+coff*LDA, LDA, n1 + column, N + column, IPIV );
255 CORE_dbarrier_thread( thidx, thcnt );
257 psplit( M, thidx, thcnt, &loff, &lm );
267 tmp1 = Atop[loff + jp];
269 CORE_damax1_thread( tmp1, thidx, thcnt, &thrd,
270 &tmp3, loff + jp + 1, IPIV + column );
275 if ( fabs(tmp3) >= sfmin ) {
276 double tmp = (double)1.0 / tmp3;
277 n1 = (thidx == 0) ? 1 : 0;
278 cblas_dscal( lm - n1, (tmp), Atop + loff + n1, 1 );
282 n1 = (thidx == 0) ? 1 : 0;
283 Atop2 = Atop + loff + n1;
285 for( i=0; i < lm-n1; i++, Atop2++)
286 *Atop2 = *Atop2 / tmp3;
290 if (loff + jp != column)
291 Atop[loff + jp] = tmp2 / tmp3;
299 CORE_dbarrier_thread( thidx, thcnt );
303 #if defined(PLASMA_HAVE_WEAK)
304 #pragma weak CORE_dgetrf_reclap = PCORE_dgetrf_reclap
305 #define CORE_dgetrf_reclap PCORE_dgetrf_reclap
309 int *IPIV,
int *info)
312 int thcnt =
min( info[2], M / N );
313 int minMN =
min(M, N);
323 if( LDA <
max(1, M) ) {
331 if ( (M == 0) || (N == 0) || (thidx >= thcnt) ){
336 CORE_dgetrf_reclap_rec( M, minMN, A, LDA, IPIV, info,
340 CORE_dgetrf_reclap_update(M, 0, minMN, N-minMN,
352 int m,
int n,
int nb,
361 sizeof(
int), &m,
VALUE,
362 sizeof(
int), &n,
VALUE,
363 sizeof(
double)*nb*nb, A,
INOUT,
364 sizeof(
int), &lda,
VALUE,
365 sizeof(
int)*nb, IPIV,
OUTPUT,
369 sizeof(
int), &iinfo,
VALUE,
370 sizeof(
int), &nbthread,
VALUE,
377 #if defined(PLASMA_HAVE_WEAK)
378 #pragma weak CORE_dgetrf_reclap_quark = PCORE_dgetrf_reclap_quark
379 #define CORE_dgetrf_reclap_quark PCORE_dgetrf_reclap_quark
397 check_info, iinfo, maxthreads );
400 info[2] = maxthreads;
413 CORE_dbarrier_thread(
int thidx,
int thcnt)
418 CORE_damax1_thread( 1.0, thidx, thcnt, &idum1, &ddum2, 0, &idum2 );
422 CORE_dlaswap1(
const int ncol,
double *a,
const int lda,
423 const int idxStart,
const int idxMax,
const int *piv)
428 for (j = 0; j < ncol; ++j) {
429 for (i = idxStart; i < idxMax; ++i) {
430 tmp = a[j*lda + piv[i] - 1];
431 a[j*lda + piv[i] - 1] = a[i + j*lda];