PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
pdlansy.c
Go to the documentation of this file.
1 
16 #include <stdlib.h>
17 #include <math.h>
18 #include "common.h"
19 
20 #define A(m, n, i, j, ldt) (BLKADDR(A, double, m, n)+((j)*(ldt)+(i)))
21 /***************************************************************************/
25 {
28  PLASMA_desc A;
29  double *work;
30  double *result;
31  PLASMA_sequence *sequence;
32  PLASMA_request *request;
33 
34  int m, n;
35  int next_m;
36  int next_n;
37  int ldam, ldan;
38  int step, lrank;
39  int X, X1, X2, Y, Y1, Y2;
40 
41  double* lwork;
42  double normtmp, normtmp2;
43 
44  plasma_unpack_args_7(norm, uplo, A, work, result, sequence, request);
45  *result = 0.0;
46 
47  if (PLASMA_RANK == 0)
48  memset(work, 0, PLASMA_SIZE*sizeof(double));
49  ss_init(PLASMA_SIZE, 1, 0);
50 
51  switch (norm) {
52  /*
53  * PlasmaMaxNorm
54  */
55  case PlasmaMaxNorm:
56  n = 0;
57  m = PLASMA_RANK;
58  while (m >= A.mt && n < A.nt) {
59  n++;
60  m = m-A.mt+n;
61  }
62 
63  while (n < A.nt) {
64  next_m = m;
65  next_n = n;
66 
67  next_m += PLASMA_SIZE;
68  while (next_m >= A.mt && next_n < A.nt) {
69  next_n++;
70  next_m = next_m-A.mt+next_n;
71  }
72 
73  if (m == n) {
74  X1 = m == 0 ? A.i %A.mb : 0;
75  X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
76  X = X2 - X1;
77 
78  ldam = BLKLDD(A, m);
79  CORE_dlansy(PlasmaMaxNorm, uplo, X, A(m, n, X1, X1, ldam), ldam, NULL, &normtmp);
80  }
81  else {
82  /*
83  * PlasmaLower
84  */
85  if (uplo == PlasmaLower) {
86  X1 = m == 0 ? A.i %A.mb : 0;
87  X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
88  X = X2 - X1;
89 
90  Y1 = n == 0 ? A.j %A.nb : 0;
91  Y2 = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
92  Y = Y2 - Y1;
93 
94  ldam = BLKLDD(A, m);
95  CORE_dlange(PlasmaMaxNorm, X, Y, A(m, n, X1, Y1, ldam), ldam, NULL, &normtmp);
96  }
97  /*
98  * PlasmaUpper
99  */
100  else {
101  X1 = n == 0 ? A.i %A.mb : 0;
102  X2 = n == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
103  X = X2 - X1;
104 
105  Y1 = m == 0 ? A.j %A.nb : 0;
106  Y2 = m == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
107  Y = Y2 - Y1;
108 
109  ldan = BLKLDD(A, n);
110  CORE_dlange(PlasmaMaxNorm, X, Y, A(n, m, X1, Y1, ldan), ldan, NULL, &normtmp);
111  }
112  }
113  if (normtmp > work[PLASMA_RANK])
114  work[PLASMA_RANK] = normtmp;
115 
116  m = next_m;
117  n = next_n;
118  }
119  ss_cond_set(PLASMA_RANK, 0, 1);
120  break;
121  /*
122  * PlasmaOneNorm / PlasmaInfNorm
123  */
124  case PlasmaOneNorm:
125  case PlasmaInfNorm:
126  m = PLASMA_RANK;
127  normtmp2 = 0.0;
128  lwork = (double*)plasma_private_alloc(plasma, A.mb, PlasmaRealDouble);
129 
130  while (m < A.mt) {
131  X1 = m == 0 ? A.i %A.mb : 0;
132  X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
133  X = X2 - X1;
134 
135  ldam = BLKLDD(A, m);
136  memset(lwork, 0, A.mb*sizeof(double));
137  /*
138  * PlasmaLower
139  */
140  if (uplo == PlasmaLower) {
141  for (n = 0; n < m; n++) {
142  Y1 = n == 0 ? A.j%A.nb : 0;
143  Y = A.nb - Y1;
144  CORE_dasum(PlasmaRowwise, PlasmaUpperLower, X, Y, A(m, n, X1, Y1, ldam), ldam, lwork);
145  }
146  CORE_dasum(PlasmaRowwise, uplo, X, X, A(m, m, X1, X1, ldam), ldam, lwork);
147 
148  for (n = m+1; n < A.mt; n++) {
149  Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
150  ldan = BLKLDD(A, n);
151  CORE_dasum(PlasmaColumnwise, PlasmaUpperLower, Y, X, A(n, m, 0, X1, ldan), ldan, lwork);
152  }
153  }
154  /*
155  * PlasmaUpper
156  */
157  else {
158  for (n = 0; n < m; n++) {
159  Y1 = n == 0 ? A.j%A.nb : 0;
160  Y = A.nb - Y1;
161  CORE_dasum(PlasmaColumnwise, PlasmaUpperLower, Y, X, A(n, m, Y1, X1, A.nb), A.nb, lwork);
162  }
163  CORE_dasum(PlasmaRowwise, uplo, X, X, A(m, m, X1, X1, ldam), ldam, lwork);
164 
165  for ( n =m+1; n < A.mt; n++) {
166  Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
167  CORE_dasum(PlasmaRowwise, PlasmaUpperLower, X, Y, A(m, n, X1, 0, ldam), ldam, lwork);
168  }
169  }
170  CORE_dlange(PlasmaMaxNorm, X, 1, lwork, 1, NULL, &normtmp);
171  if (normtmp > normtmp2)
172  normtmp2 = normtmp;
173 
174  m += PLASMA_SIZE;
175  }
176  work[PLASMA_RANK] = normtmp2;
177  ss_cond_set(PLASMA_RANK, 0, 1);
178  plasma_private_free(plasma, lwork);
179  break;
180  /*
181  * PlasmaFrobeniusNorm
182  */
183  case PlasmaFrobeniusNorm:
184  default:;
185  }
186 
187  if (norm != PlasmaFrobeniusNorm) {
188  step = 1;
189  lrank = PLASMA_RANK;
190  while ( (lrank%2 == 0) && (PLASMA_RANK+step < PLASMA_SIZE) ) {
191  ss_cond_wait(PLASMA_RANK+step, 0, step);
192  work[PLASMA_RANK] = max(work[PLASMA_RANK], work[PLASMA_RANK+step]);
193  lrank = lrank >> 1;
194  step = step << 1;
195  ss_cond_set(PLASMA_RANK, 0, step);
196  }
197  if (PLASMA_RANK > 0) {
198  while( lrank != 0 ) {
199  if (lrank%2 == 1) {
200  ss_cond_set(PLASMA_RANK, 0, step);
201  lrank = 0;
202  } else {
203  lrank = lrank >> 1;
204  step = step << 1;
205  ss_cond_set(PLASMA_RANK, 0, step);
206  }
207  }
208  }
209 
210  if (PLASMA_RANK == 0)
211  *result = work[0];
212  }
213  ss_finalize();
214 }
215 
216 /***************************************************************************/
219 void plasma_pdlansy_quark(PLASMA_enum norm, PLASMA_enum uplo, PLASMA_desc A, double *work, double *result,
220  PLASMA_sequence *sequence, PLASMA_request *request)
221 {
224 
225  int X, X1, X2, Y, Y1;
226  int ldam;
227  int m, n;
228  int szeW, pos;
229  double* lwork;
230 
231  plasma = plasma_context_self();
232  if (sequence->status != PLASMA_SUCCESS)
233  return;
234  QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
235 
236  *result = 0.0;
237  switch ( norm ) {
238  /*
239  * PlasmaMaxNorm
240  */
241  case PlasmaMaxNorm:
242  szeW = A.mt*(A.mt+1)/2;
243  pos = 0;
244  lwork = (double*)plasma_shared_alloc(plasma, szeW, PlasmaRealDouble);
245  memset(lwork, 0, szeW*sizeof(double));
246  for(m = 0; m < A.mt; m++) {
247  X1 = m == 0 ? A.i %A.mb : 0;
248  X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
249  X = X2 - X1;
250  ldam = BLKLDD(A, m);
252  plasma->quark, &task_flags,
253  PlasmaMaxNorm, uplo, X,
254  A(m, m, X1, X1, ldam), ldam, ldam*X,
255  0, &(lwork[pos]),
256  lwork, szeW);
257  pos++;
258  /*
259  * PlasmaLower
260  */
261  if (uplo == PlasmaLower) {
262  for(n=0; n<m; n++) {
263  Y1 = n == 0 ? A.j%A.nb : 0;
264  Y = A.nb - Y1;
266  plasma->quark, &task_flags,
267  PlasmaMaxNorm, X, Y,
268  A(m, n, X1, Y1, ldam), ldam, ldam*Y,
269  0, &(lwork[pos]),
270  lwork, szeW);
271  pos++;
272  }
273  }
274  /*
275  * PlasmaUpper
276  */
277  else {
278  for(n = m+1; n < A.mt; n++) {
279  Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
281  plasma->quark, &task_flags,
282  PlasmaMaxNorm, X, Y,
283  A(m, n, X1, 0, ldam), ldam, ldam*Y,
284  0, &(lwork[pos]),
285  lwork, szeW);
286  pos++;
287  }
288  }
289  }
291  plasma->quark, &task_flags,
292  PlasmaMaxNorm, szeW, 1,
293  lwork, 1, szeW,
294  0, result);
295 
296  QUARK_CORE_free(plasma->quark, &task_flags, lwork, szeW*sizeof(double));
297  break;
298  /*
299  * PlasmaOneNorm / PlasmaInfNorm
300  */
301  case PlasmaOneNorm:
302  case PlasmaInfNorm:
303  lwork = (double *)plasma_shared_alloc(plasma, A.m+1, PlasmaRealDouble);
304  memset(lwork, 0, (A.m+1)*sizeof(double));
305  for(m = 0; m < A.mt; m++) {
306  X1 = m == 0 ? A.i %A.mb : 0;
307  X2 = m == A.mt-1 ? (A.i+A.m-1)%A.mb+1 : A.mb;
308  X = X2 - X1;
309  ldam = BLKLDD(A, m);
311  plasma->quark, &task_flags,
312  PlasmaRowwise, uplo, X, X,
313  A(m, m, X1, X1, ldam), ldam, ldam*X,
314  &(lwork[m*A.mb+1]), A.mb,
315  lwork, A.m);
316  /*
317  * PlasmaLower
318  */
319  if (uplo == PlasmaLower) {
320  for(n = 0; n < m; n++) {
321  Y1 = n == 0 ? A.j%A.nb : 0;
322  Y = A.nb - Y1;
324  plasma->quark, &task_flags,
326  A(m, n, X1, Y1, ldam), ldam, ldam*Y,
327  &(lwork[m*A.mb+1]), A.mb,
328  lwork, A.m);
330  plasma->quark, &task_flags,
332  A(m, n, X1, Y1, ldam), ldam, ldam*Y,
333  &(lwork[n*A.mb+1]), A.mb,
334  lwork, A.m);
335  }
336  }
337  /*
338  * PlasmaUpper
339  */
340  else {
341  for(n = m+1; n < A.mt; n++) {
342  Y = n == A.nt-1 ? (A.j+A.n-1)%A.nb+1 : A.nb;
344  plasma->quark, &task_flags,
346  A(m, n, X1, 0, ldam), ldam, ldam*Y,
347  &(lwork[m*A.mb+1]), A.mb,
348  lwork, A.m);
350  plasma->quark, &task_flags,
352  A(m, n, X1, 0, ldam), ldam, ldam*Y,
353  &(lwork[n*A.mb+1]), A.mb,
354  lwork, A.m);
355  }
356  }
357  }
359  plasma->quark, &task_flags,
360  PlasmaMaxNorm, A.m+1, 1,
361  lwork, 1, A.m+1,
362  0, result);
363 
364  QUARK_CORE_free(plasma->quark, &task_flags, lwork, (A.m+1)*sizeof(double));
365  break;
366  /*
367  * PlasmaFrobeniusNorm - not implemented
368  */
369  case PlasmaFrobeniusNorm:
370  default:;
371  }
372 }