PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
core_dgetrf_reclap.c
Go to the documentation of this file.
1 
19 #include <math.h>
20 #include <cblas.h>
21 #include <lapacke.h>
22 #include "common.h"
23 
/* Spin barrier across the thcnt cooperating threads (implemented at the
   bottom of this file on top of the amax reduction). */
static void CORE_dbarrier_thread(const int thidx, const int thcnt);
/* Cooperative reduction: combine each thread's local amax candidate into a
   single global pivot value; *thwinner receives the rank of the winning
   thread and the winner records pividx into ipiv[0]. */
static void CORE_damax1_thread(const double localamx,
                               const int thidx, const int thcnt,
                               int *thwinner, double *globalamx,
                               const int pividx, int *ipiv);

/* Laswp with inc = 1 */
static void CORE_dlaswap1(const int ncol, double *a, const int lda,
                          const int idxStart, const int idxMax, const int *piv);
33 
34 /***************************************************************************/
/*
 * Split n items as evenly as possible among pcnt workers and return the
 * chunk owned by worker pidx: *psiz_p items starting at offset *poff_p.
 * The first (n % pcnt) workers each receive one extra item, so the chunks
 * are contiguous and cover [0, n) exactly.
 */
static void
psplit(int n, int pidx, int pcnt, int *poff_p, int *psiz_p) {
    const int base  = n / pcnt;   /* items every worker gets */
    const int extra = n % pcnt;   /* number of workers that get one more */

    if (pidx < extra) {
        *psiz_p = base + 1;
        *poff_p = pidx * (base + 1);
    } else {
        *psiz_p = base;
        *poff_p = extra * (base + 1) + (pidx - extra) * base;
    }
}
98 
/* Two volatile slots per thread: slot [2*t] is a handshake flag
   (-1.0 = idle, -2.0 = value posted, -3.0 = result published) and slot
   [2*t+1] carries the candidate/result value — see CORE_damax1_thread. */
#define AMAX1BUF_SIZE (48 << 1)
/* 48 threads should be enough for everybody */
static volatile double CORE_damax1buf[AMAX1BUF_SIZE];
/* LAPACK safe minimum, dlamch('S'); cached by the init routine and used to
   decide between reciprocal-scaling and per-element division of the pivot
   column. */
static double sfmin;
103 
void
/* NOTE(review): original line 105 — the function name and opening brace,
   presumably "CORE_dgetrf_reclap_init(void) {" — is missing from this view
   of the file (extraction dropped it); confirm against the upstream source. */
    int i;

    /* Mark every handshake/value slot idle (-1.0) before first use. */
    for (i = 0; i < AMAX1BUF_SIZE; ++i) CORE_damax1buf[i] = -1.0;
    /* Cache LAPACK's safe minimum for the pivot-magnitude test in the solver. */
    sfmin = LAPACKE_dlamch_work('S');
}
111 
/*
 * Cooperative max-magnitude reduction across thcnt spinning threads.
 *
 * Each non-zero thread posts its local candidate (localamx) in its value
 * slot and raises its flag (-2.0). Thread 0 collects all candidates, picks
 * the one with the largest |value| (winner rank j, with thread 0's own
 * candidate winning ties), broadcasts the winning value back through the
 * value slots, encodes the winner in slot 0 as (-j - 2.0), publishes with
 * flag -3.0, and waits for every thread to acknowledge (flag back to -1.0)
 * before resetting slot 0 to idle.
 *
 * On return every thread has *globalamx = winning value and *thwinner = j;
 * only the winning thread writes ipiv[0] = pividx (its 1-based pivot row).
 *
 * NOTE(review): synchronization relies solely on volatile spin-waits with
 * no explicit memory fences (see the "__sync_synchronize" remark elsewhere
 * in this file) — presumably assumes a strongly-ordered target; confirm.
 */
static void
CORE_damax1_thread(double localamx, int thidx, int thcnt, int *thwinner,
                   double *globalamx, int pividx, int *ipiv) {
    if (thidx == 0) {
        int i, j = 0;                      /* j = rank of current winner */
        double curval = localamx, tmp;
        double curamx = fabs(localamx);

        /* make sure everybody filled in their value */
        for (i = 1; i < thcnt; ++i) {
            while (CORE_damax1buf[i << 1] == -1.0) { /* wait for thread i to store its value */
            }
        }

        /* better not fuse the loop above and below to make sure data is sync'd */

        /* Scan all posted candidates for the largest magnitude. */
        for (i = 1; i < thcnt; ++i) {
            tmp = CORE_damax1buf[ (i << 1) + 1];
            if (fabs(tmp) > curamx) {
                curamx = fabs(tmp);
                curval = tmp;
                j = i;
            }
        }

        /* Thread 0 itself holds the winning pivot: record its row index. */
        if (0 == j)
            ipiv[0] = pividx;

        /* make sure everybody knows the amax value */
        for (i = 1; i < thcnt; ++i)
            CORE_damax1buf[ (i << 1) + 1] = curval;

        CORE_damax1buf[0] = -j - 2.0; /* set the index of the winning thread */

        *thwinner = j;
        *globalamx = curval;

        /* Publish: flip every flag to "result available" (-3.0). */
        for (i = 1; i < thcnt; ++i)
            CORE_damax1buf[i << 1] = -3.0;

        /* make sure everybody read the max value */
        for (i = 1; i < thcnt; ++i) {
            while (CORE_damax1buf[i << 1] != -1.0) {
            }
        }

        /* Reset slot 0 to idle, releasing the other threads' final wait. */
        CORE_damax1buf[0] = -1.0;
    } else {
        CORE_damax1buf[(thidx << 1) + 1] = localamx;
        CORE_damax1buf[thidx << 1] = -2.0; /* announce to thread 0 that local amax was stored */
        while (CORE_damax1buf[0] == -1.0) { /* wait for thread 0 to finish calculating the global amax */
        }
        while (CORE_damax1buf[thidx << 1] != -3.0) { /* wait for thread 0 to store amax */
        }
        *globalamx = CORE_damax1buf[(thidx << 1) + 1]; /* read the amax from the location adjacent to the one in the above loop */
        *thwinner = -CORE_damax1buf[0] - 2.0;  /* decode winner rank from slot 0 */
        CORE_damax1buf[thidx << 1] = -1.0; /* signal thread 0 that this thread is done reading */

        /* This thread owns the winning pivot: record its row index. */
        if (thidx == *thwinner)
            ipiv[0] = pividx;

        while (CORE_damax1buf[0] != -1.0) { /* wait for thread 0 to finish */
        }
    }
}
177 
178 
/*
 * Mid-recursion update: after the left n1 columns (starting at `column`)
 * are factorized, apply their pivots to the right n2 columns, solve the
 * triangular block, and update the trailing submatrix. All thcnt threads
 * call this; work is partitioned with psplit and fenced with barriers.
 */
static inline void CORE_dgetrf_reclap_update(const int M, const int column, const int n1, const int n2,
                                             double *A, const int LDA, int *IPIV,
                                             const int thidx, const int thcnt)
{
    /* +/-1.0 scalars passed to the (partially visible) BLAS calls below. */
    static double posone = 1.0;
    static double negone = -1.0;
    double *Atop = A + column*LDA;   /* first column of the left block  */
    double *Atop2 = Atop + n1 *LDA;  /* first column of the right block */
    int coff, ccnt, lm, loff;

    /* Wait until every thread finished factorizing the left half. */
    CORE_dbarrier_thread( thidx, thcnt );

    /* Each thread takes a contiguous slice of the n2 right-hand columns. */
    psplit( n2, thidx, thcnt, &coff, &ccnt );

    if (ccnt > 0) {
        CORE_dlaswap1( ccnt, Atop2 + coff*LDA, LDA, column, n1 + column, IPIV ); /* swap to the right */

        /* NOTE(review): the leading line of this BLAS call (original line
           196 — presumably a cblas_dtrsm triangular solve) is missing from
           this view of the file; only its trailing argument list survives.
           Confirm against the upstream source. */
        n1, ccnt, (posone), Atop + column, LDA, Atop2 + coff*LDA + column, LDA );
    }

    /* __sync_synchronize(); */ /* hopefully we will not need memory fences */

    /* need to wait for pivoting and triangular solve to finish */
    CORE_dbarrier_thread( thidx, thcnt );

    /* Row-wise split of the trailing rows for the update; thread 0 takes
       the rows immediately below the factored block instead of its psplit
       share. */
    psplit( M, thidx, thcnt, &loff, &lm );
    if (thidx == 0) {
        loff = column + n1;
        lm -= column + n1;
    };

    /* NOTE(review): the leading line of this BLAS call (original line 211 —
       presumably a cblas_dgemm Schur-complement update) is missing from
       this view of the file. */
    (negone), Atop+loff, LDA, Atop2 + column, LDA, (posone), Atop2+loff, LDA );
}
214 
/*
 * Recursive LU kernel: factorizes N columns of the M-row panel starting at
 * column `column`, with partial pivoting. All thcnt threads execute this
 * function in lockstep; 1-based pivot indices are written to IPIV[column..].
 * On a zero pivot, *info is set to the 1-based failing column and all
 * threads return early.
 *
 * Recursion: split N into n1 + n2, factor the left half, update the right
 * half (CORE_dgetrf_reclap_update), factor the right half, then apply the
 * right half's pivots back to the left half.
 */
static void
CORE_dgetrf_reclap_rec(const int M, const int N,
                       double *A, const int LDA,
                       int *IPIV, int *info,
                       const int thidx, const int thcnt, const int column)
{
    int jp, n1, n2, lm, loff;
    double tmp1, tmp2, tmp3;
    double *Atop = A + column*LDA;

    /* Assumption: N = min( M, N ); */
    if (N > 1) {
        int coff, ccnt;

        n1 = N / 2;
        n2 = N - n1;

        CORE_dgetrf_reclap_rec( M, n1, A, LDA, IPIV, info,
                                thidx, thcnt, column );
        if ( *info != 0 )
            return;

        CORE_dgetrf_reclap_update(M, column, n1, n2,
                                  A, LDA, IPIV,
                                  thidx, thcnt);

        CORE_dgetrf_reclap_rec( M, n2, A, LDA, IPIV, info,
                                thidx, thcnt, column + n1 );
        if ( *info != 0 )
            return;

        /* Each thread applies the right half's row swaps to its slice of
           the n1 left-hand columns. */
        psplit( n1, thidx, thcnt, &coff, &ccnt );

        if (ccnt > 0) {
            CORE_dlaswap1( ccnt, Atop+coff*LDA, LDA, n1 + column, N + column, IPIV ); /* swap to the left */
        }

    } else {
        /* Base case: factor a single column (find pivot, scale below it). */
        int thrd;

        CORE_dbarrier_thread( thidx, thcnt );

        /* Each thread scans its psplit share of the M rows; thread 0 takes
           the rows from `column` down instead of its psplit share. */
        psplit( M, thidx, thcnt, &loff, &lm );

        if (thidx == 0) {
            loff = column;
            lm -= column;
        }

        tmp2 = Atop[column]; /* all threads read the pivot element in case they need it */

        /* Local argmax of |entry| within this thread's rows. */
        jp = cblas_idamax( lm, Atop + loff, 1 );
        tmp1 = Atop[loff + jp];

        /* Global reduction: tmp3 = winning pivot value, thrd = winner rank;
           the winner records the 1-based pivot row into IPIV[column]. */
        CORE_damax1_thread( tmp1, thidx, thcnt, &thrd,
                            &tmp3, loff + jp + 1, IPIV + column );

        Atop[column] = tmp3; /* all threads set the pivot element: no need for synchronization */

        if ( tmp3 != 0.0 ) {
            if ( fabs(tmp3) >= sfmin ) {
                /* Pivot is safely away from underflow: scale by reciprocal. */
                double tmp = (double)1.0 / tmp3;
                n1 = (thidx == 0) ? 1 : 0; /* thread 0 skips the pivot row itself */
                cblas_dscal( lm - n1, (tmp), Atop + loff + n1, 1 );
            } else {
                /* Tiny pivot: 1/tmp3 could overflow, divide element-wise. */
                int i;
                double *Atop2;
                n1 = (thidx == 0) ? 1 : 0;
                Atop2 = Atop + loff + n1;

                for( i=0; i < lm-n1; i++, Atop2++)
                    *Atop2 = *Atop2 / tmp3;
            }

            if (thrd == thidx) { /* the thread that owns the best pivot */
                if (loff + jp != column) /* if there is a need to exchange the pivot */
                    Atop[loff + jp] = tmp2 / tmp3;
            }

        } else {
            /* Exact zero pivot: report 1-based failing column. */
            *info = column + 1;
            return;
        }

        CORE_dbarrier_thread( thidx, thcnt );
    }
}
302 
303 #if defined(PLASMA_HAVE_WEAK)
304 #pragma weak CORE_dgetrf_reclap = PCORE_dgetrf_reclap
305 #define CORE_dgetrf_reclap PCORE_dgetrf_reclap
306 #endif
307 int CORE_dgetrf_reclap(int M, int N,
308  double *A, int LDA,
309  int *IPIV, int *info)
310 {
311  int thidx = info[1];
312  int thcnt = min( info[2], M / N );
313  int minMN = min(M, N);
314 
315  if( M < 0 ) {
316  coreblas_error(1, "illegal value of M");
317  return -1;
318  }
319  if( N < 0 ) {
320  coreblas_error(2, "illegal value of N");
321  return -2;
322  }
323  if( LDA < max(1, M) ) {
324  coreblas_error(5, "illegal value of LDA");
325  return -5;
326  }
327 
328  /*
329  * Quick return
330  */
331  if ( (M == 0) || (N == 0) || (thidx >= thcnt) ){
332  return PLASMA_SUCCESS;
333  }
334 
335  *info = 0;
336  CORE_dgetrf_reclap_rec( M, minMN, A, LDA, IPIV, info,
337  thidx, thcnt, 0 );
338 
339  if ( N > minMN ) {
340  CORE_dgetrf_reclap_update(M, 0, minMN, N-minMN,
341  A, LDA, IPIV,
342  thidx, thcnt);
343  }
344 
345  return info[0];
346 }
347 
348 /***************************************************************************/
/*
 * Task wrapper: enqueue one CORE_dgetrf_reclap panel factorization in the
 * QUARK runtime, declaring A (nb-by-nb tile) INOUT and IPIV (nb entries)
 * OUTPUT so the scheduler can track dependencies.
 *
 * NOTE(review): the head of this definition (original lines ~349-351 — the
 * function name, presumably QUARK_CORE_dgetrf_reclap, and its leading
 * "Quark *quark, Quark_Task_Flags *task_flags" parameters) is missing from
 * this view of the file; only the tail of the parameter list survives.
 * Confirm against the upstream source.
 */
 int m, int n, int nb,
 double *A, int lda,
 int *IPIV,
 PLASMA_sequence *sequence, PLASMA_request *request,
 PLASMA_bool check_info, int iinfo,
 int nbthread)
{
    /* NOTE(review): original line 359 (possibly a DAG-tracing macro) is
       also missing from this view. */
    QUARK_Insert_Task(quark, CORE_dgetrf_reclap_quark, task_flags,
        sizeof(int),              &m,          VALUE,
        sizeof(int),              &n,          VALUE,
        sizeof(double)*nb*nb,     A,           INOUT,   /* tile read & overwritten by LU */
        sizeof(int),              &lda,        VALUE,
        sizeof(int)*nb,           IPIV,        OUTPUT,  /* pivot vector produced by the task */
        sizeof(PLASMA_sequence*), &sequence,   VALUE,
        sizeof(PLASMA_request*),  &request,    VALUE,
        sizeof(PLASMA_bool),      &check_info, VALUE,
        sizeof(int),              &iinfo,      VALUE,
        sizeof(int),              &nbthread,   VALUE,
        0);  /* argument-list terminator */
}
373 
374 /***************************************************************************/
#if defined(PLASMA_HAVE_WEAK)
#pragma weak CORE_dgetrf_reclap_quark = PCORE_dgetrf_reclap_quark
#define CORE_dgetrf_reclap_quark PCORE_dgetrf_reclap_quark
#endif
/* Task body executed by QUARK: unpack the arguments, run the multithreaded
   panel factorization, and (rank 0 only) report numerical failure.
   NOTE(review): the signature line (original line 381, presumably
   "void CORE_dgetrf_reclap_quark(Quark *quark)") is missing from this view
   of the file; confirm against the upstream source. */
{
    int M;
    int N;
    double *A;
    int LDA;
    int *IPIV;
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    PLASMA_bool check_info;
    int iinfo;

    /* info[0]: factorization status (out); info[1]: this thread's rank
       within the multithreaded task; info[2]: cooperating thread count. */
    int info[3];
    int maxthreads;

    quark_unpack_args_10(quark, M, N, A, LDA, IPIV, sequence, request,
                         check_info, iinfo, maxthreads );

    info[1] = QUARK_Get_RankInTask(quark);
    info[2] = maxthreads;

    CORE_dgetrf_reclap( M, N, A, LDA, IPIV, info );
    /* Only rank 0 flushes the sequence on numerical failure.
       NOTE(review): info[0] is read here but CORE_dgetrf_reclap leaves it
       unwritten on its quick-return paths — verify initialization upstream. */
    if (info[1] == 0 && info[0] != PLASMA_SUCCESS && check_info)
        plasma_sequence_flush(quark, sequence, request, iinfo + info[0] );
}
406 
407 
408 /*******************************************************************
409  * Additional routines
410  */
411 
/*
 * Spin barrier for the thcnt cooperating threads: no thread returns before
 * every thread has entered. Reuses the amax reduction with dummy arguments,
 * since that protocol already forces full participation of all threads.
 */
static void
CORE_dbarrier_thread(int thidx, int thcnt)
{
    int winner_sink, piv_sink;
    double amax_sink;

    /* it's probably faster to implement a dedicated barrier */
    CORE_damax1_thread( 1.0, thidx, thcnt, &winner_sink, &amax_sink, 0, &piv_sink );
}
420 
/*
 * Apply a sequence of row interchanges (LASWP with unit column increment)
 * to an ncol-column block stored column-major with leading dimension lda.
 * For k = idxStart .. idxMax-1, row k is exchanged with row piv[k]-1;
 * pivot entries are 1-based, as produced by the factorization kernel.
 */
static void
CORE_dlaswap1(const int ncol, double *a, const int lda,
              const int idxStart, const int idxMax, const int *piv)
{
    int k, c;

    /* Columns are independent, so the swaps may be applied row-step by
       row-step across all columns; only the order over k matters. */
    for (k = idxStart; k < idxMax; ++k) {
        const int prow = piv[k] - 1;   /* 1-based pivot -> 0-based row */
        double *col = a;

        for (c = 0; c < ncol; ++c, col += lda) {
            const double held = col[prow];
            col[prow] = col[k];
            col[k]    = held;
        }
    }
}
436