PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
pzgbrdb.c
Go to the documentation of this file.
1 
15 #include "common.h"
16 #include <lapacke.h>
17 
18 #undef REAL
19 #define COMPLEX
20 
21 #define DEP(m) &(DEP[m])
22 #define A(_m, _n) (PLASMA_Complex64_t *)plasma_geteltaddr(&A, (_m), (_n), eltsize)
23 /***************************************************************************/
27  PLASMA_desc A, double *D, double *E, PLASMA_desc T,
28  PLASMA_sequence *sequence, PLASMA_request *request)
29 {
32 
33 #ifdef COMPLEX
34  static double dzero = (double) 0.0;
35  double absztmp;
36 #endif
37 
38  static PLASMA_Complex64_t zone = (PLASMA_Complex64_t) 1.0;
39  static PLASMA_Complex64_t zzero = (PLASMA_Complex64_t) 0.0;
40  PLASMA_Complex64_t *C, *S;
41  PLASMA_Complex64_t ztmp, V, TAU;
42  int M, N, NB, MINMN, INgrsiz, INthgrsiz, BAND;
43  int myid, grsiz, shift=3, stt, st, ed, stind, edind;
44  int blklastind, colpt, PCOL, ACOL, MCOL;
45  int stepercol,mylastid,grnb,grid;
46  int *DEP,*MAXID;
47  int i, j, m;
48  int thgrsiz, thgrnb, thgrid, thed;
49  size_t eltsize = plasma_element_size(A.dtyp);
50 
51  plasma = plasma_context_self();
52  if (sequence->status != PLASMA_SUCCESS)
53  return;
54 
55  QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
56 
57  M = A.m;
58  N = A.n;
59  NB = A.mb;
60  MINMN = min(M,N);
61 
62  /* Quick return */
63  if ( MINMN == 0 ){
64  return;
65  }
66 
67  if ( NB == 0 ) {
68  memset(D, 0, MINMN *sizeof(double));
69  memset(E, 0, (MINMN-1)*sizeof(double));
70 #ifdef COMPLEX
71  for (i=0; i<MINMN; i++)
72  D[i] = cabs(*A(i,i));
73 #else
74  for (i=0; i<MINMN; i++)
75  D[i] = *A(i,i);
76 #endif
77  return;
78  }
79 
80  /*
81  * Barrier is used because the bulge have to wait until
82  * the reduction to band has been finish.
83  * otherwise, I can remove this BARRIER when I integrate
84  * the function dependencies link inside the reduction to
85  * band. Keep in mind the case when NB=1, where no bulge-chasing.
86  */
87  /***************************************************************/
88  QUARK_Barrier(plasma->quark);
89  /***************************************************************/
90 
91  /*
92  * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
93  * Make diagonal and superdiagonal elements real, storing them in
94  * D and E. if PlasmaLower, first transform lower bidiagonal form
95  * to upper bidiagonal by applying plane rotations/ Householder
96  * from the left, overwriting superdiagonal elements then make
97  * elements real of the resulting upper Bidiagonal. if PlasmaUpper
98  * then make its elements real. For Q, PT: ZSCAL should be done
99  * in case of WANTQ.
100  */
101  if ( NB == 1 ) {
102  memset(D, 0, MINMN*sizeof(double));
103  memset(E, 0, (MINMN-1)*sizeof(double));
104 
105  if(uplo==PlasmaLower){
106  for (i=0; i<(MINMN-1); i++)
107  {
108  /* generate Householder to annihilate a(i+1,i) and create a(i,i+1) */
109  V = *A((i+1), i);
110  *A((i+1), i) = zzero;
111  LAPACKE_zlarfg_work( 2, A(i, i), &V, 1, &TAU);
112  /* apply Left*/
113  TAU = conj(TAU);
114  ztmp = TAU*V;
115  V = conj(V);
116  *A(i, i+1) = - V * TAU * (*A(i+1, i+1));
117  *A(i+1, i+1) = *(A(i+1, i+1)) * (zone - V * ztmp);
118  }
119  }
120  /* PlasmaLower or PlasmaUpper, both are now upper */
121  /* Make diagonal and superdiagonal elements real,
122  * storing them in D and E
123  */
124 #ifdef COMPLEX
125  ztmp = zone;
126  for (i=0; i<MINMN; i++)
127  {
128  ztmp = *A(i, i) * conj(ztmp);
129  absztmp = cabs(ztmp);
130  D[i] = absztmp; /* diag value */
131  if(absztmp != dzero)
132  ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
133  else
134  ztmp = zone;
135  if(i<(MINMN-1)) {
136  ztmp = *A(i, (i+1)) * conj(ztmp);
137  absztmp = cabs(ztmp);
138  E[i] = absztmp; /* upper off-diag value */
139  if(absztmp != dzero)
140  ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
141  else
142  ztmp = zone;
143  }
144  }
145 #else
146  for (i=0; i < MINMN-1; i++) {
147  D[i] = *A(i, i );
148  E[i] = *A(i, i+1);
149  }
150  D[i] = *A(i, i);
151 #endif
152  return;
153  }
154 
155  /*
156  * Case MINMN<NB ==> matrix is very small and better to call lapack ZGETRD.
157  *
158  * Use fact that one row of block is stored the same way than in LAPACK
159  * Doesn't work if M > NB because of tile storage
160  */
161  if ( MINMN <= 0 )
162  {
163  PLASMA_Complex64_t *work, *taup, *tauq;
164  int info, ldwork = N*N;
165  work = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, ldwork, PlasmaComplexDouble);
168 
169  info = LAPACKE_zgebrd_work(LAPACK_COL_MAJOR, M, N,
170  A(0,0), A.lm, D, E, taup, tauq, work, ldwork);
171  plasma_shared_free(plasma, (void*) work);
172  plasma_shared_free(plasma, (void*) taup);
173  plasma_shared_free(plasma, (void*) tauq);
174 
175  if( info == 0 )
176  sequence->status = PLASMA_SUCCESS;
177  else
178  plasma_sequence_flush(plasma->quark, sequence, request, info);
179  return;
180  }
181 
182  /* General case NB > 1 && N > NB */
183  DEP = (int *) plasma_shared_alloc(plasma, MINMN+1, PlasmaInteger );
184  MAXID = (int *) plasma_shared_alloc(plasma, MINMN+1, PlasmaInteger );
187  memset(MAXID,0,(MINMN+1)*sizeof(int));
188 
189  /***************************************************************************
190  * START BULGE CHASING CODE
191  **************************************************************************/
192  /*
193  * Initialisation of local parameter. those parameter should be
194  * input or tuned parameter.
195  */
196  INgrsiz = 1;
197  if( NB > 160 ) {
198  INgrsiz = 2;
199  }
200  else if( NB > 100 ) {
201  if( MINMN < 5000 )
202  INgrsiz = 2;
203  else
204  INgrsiz = 4;
205  } else {
206  INgrsiz = 6;
207  }
208  INthgrsiz = MINMN;
209  BAND = 0;
210 
211  grsiz = INgrsiz;
212  thgrsiz = INthgrsiz;
213  if( grsiz == 0 ) grsiz = 6;
214  if( thgrsiz == 0 ) thgrsiz = MINMN;
215 
216  i = shift/grsiz;
217  stepercol = i*grsiz == shift ? i:i+1;
218 
219  i = (MINMN-2)/thgrsiz;
220  thgrnb = i*thgrsiz == (MINMN-2) ? i:i+1;
221 
222  for (thgrid = 1; thgrid<=thgrnb; thgrid++){
223  stt = (thgrid-1)*thgrsiz+1;
224  thed = min( (stt + thgrsiz -1), (MINMN-2));
225  for (i = stt; i <= MINMN-2; i++){
226  ed=min(i,thed);
227  if(stt>ed)break;
228  for (m = 1; m <=stepercol; m++){
229  st=stt;
230  for (j = st; j <=ed; j++){
231  /* PCOL: dependency on the ID of the master of the group of the previous column. (Previous Column:PCOL). */
232  /* ACOL: dependency on the ID of the master of the previous group of my column. (Acctual Column:ACOL). (it is 0(NULL) for myid=1) */
233  /* MCOL: OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
234  myid = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
235  mylastid = myid+grsiz-1;
236  PCOL = mylastid+shift-1; /* to know the dependent ID of the previous column. need to know the master of its group*/
237  MAXID[j] = myid;
238  PCOL = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
239  grnb = PCOL/grsiz;
240  grid = grnb*grsiz == PCOL ? grnb:grnb+1;
241  PCOL = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column.*/
242  ACOL = myid-grsiz;
243  if(myid==1)ACOL=0;
244  MCOL = myid;
245 
247  plasma->quark, &task_flags,
248  uplo, MINMN, NB,
249  &A, C, S, i, j, m, grsiz, BAND,
250  DEP(PCOL), DEP(ACOL), DEP(MCOL) );
251 
252  if(mylastid%2 ==0){
253  blklastind = (mylastid/2)*NB+1+j-1;
254  }else{
255  colpt = ((mylastid+1)/2)*NB + 1 +j -1 ;
256  stind = colpt-NB+1;
257  edind = min(colpt,MINMN);
258  if( (stind>=edind-1) && (edind==MINMN) )
259  blklastind=MINMN;
260  else
261  blklastind=0;
262  }
263  if(blklastind >= (MINMN-1)) stt=stt+1;
264  } /* END for j=st:ed */
265  } /* END for m=1:stepercol */
266  } /* END for i=1:MINMN-2 */
267  } /* END for thgrid=1:thgrnb */
268 
269  /*
270  * Barrier used only for now, to be sure that everything
271  * is done before copying the D and E and free workspace.
272  * this will be removed later when D and E are directly filled
273  * during the bulge process.
274  */
275  QUARK_Barrier(plasma->quark);
276 
277  plasma_shared_free(plasma, (void*) DEP);
278  plasma_shared_free(plasma, (void*) MAXID);
279  plasma_shared_free(plasma, (void*) C);
280  plasma_shared_free(plasma, (void*) S);
281 
282  /*
283  * STORE THE RESULTING diagonal/off-diagonal in D AND E
284  */
285  memset(D, 0, MINMN*sizeof(double));
286  memset(E, 0, (MINMN-1)*sizeof(double));
287 
288  /*
289  * If PlasmaLower, first transform lower bidiagonal form
290  * to upper bidiagonal by applying plane rotations/ Householder
291  * from the left, overwriting superdiagonal elements then make
292  * elements real of the resulting upper Bidiagonal. if PlasmaUpper
293  * then make its elements real.
294  * For Q, PT: ZSCAL should be done in case of WANTQ.
295  */
296  if(uplo==PlasmaLower){
297  for (i=0; i<(MINMN-1); i++)
298  {
299  /* generate Householder to annihilate a(i+1,i) and create a(i,i+1)*/
300  V = *A((i+1), i);
301  *A((i+1), i) = zzero;
302  LAPACKE_zlarfg_work( 2, A(i, i), &V, 1, &TAU);
303  /* apply Left */
304  TAU = conj(TAU);
305  ztmp = TAU*V;
306  V = conj(V);
307  *A(i, (i+1)) = - V * TAU * (*A((i+1), (i+1)));
308  *A((i+1), (i+1)) = (*A((i+1), (i+1))) * (zone - V * ztmp);
309  }
310  }
311 
312  /* PlasmaLower or PlasmaUpper, both are upper, now*/
313  /* Make diagonal and superdiagonal elements real,
314  * storing them in D and E
315  */
316  /* In complex case, the element off diagonal element are
317  * not necessary real and we have to make off-diagonal
318  * elements real and copy them to E.
319  * When using HouseHolder elimination,
320  * the ZLARFG give us a real as output so, all the
321  * diagonal/off-diagonal element except the last one are already
322  * real and thus we need only to take the abs of the last
323  * one.
324  * */
325 #ifdef COMPLEX
326  ztmp =zone;
327  for (i=0; i < MINMN-1; i++) {
328  D[i] = creal( *A(i, i) );
329  /*
330  * Alternative for Householder case, all diag/superdiag
331  * are real except the last diag and superdiag, where we
332  * have to take the abs
333  */
334  if(i<(MINMN-2))
335  E[i] = creal(*A(i, i+1));
336  else
337  E[i] = cabs( *A(i, i+1)); /* last upper value is complex */
338  }
339  D[i] = cabs( *A(i, i) );
340 #else
341  for (i=0; i < MINMN-1; i++) {
342  D[i] = *A(i, i );
343  E[i] = *A(i, i+1);
344  }
345  D[i] = *A(i, i);
346 #endif
347 
348 } /* END FUNCTION */