PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
pzshift.c
Go to the documentation of this file.
1 
21 #include <stdlib.h>
22 #include <sys/types.h>
23 #include <assert.h>
24 #include "common.h"
25 #include "primes.h"
26 #include "gkkleader.h"
27 
61  int nprob, int me, int ne, int L,
62  PLASMA_sequence *sequence, PLASMA_request *request)
63 {
64  int *leaders = NULL;
65  int ngrp, thrdbypb, thrdtot, nleaders;
66 
67  /* Check Plasma context */
68  thrdtot = PLASMA_SIZE;
69  thrdbypb = PLASMA_GRPSIZE;
70  ngrp = thrdtot/thrdbypb;
71 
72  /* check input */
73  if( (nprob * me * ne * L) != (m * n) ) {
74  plasma_error(__func__, "problem size does not match matrix size");
75  /*printf("m=%d, n=%d, nprob=%d, me=%d, ne=%d, L=%d\n", m, n, nprob, me, ne, L);*/
76  return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
77  }
78  if( thrdbypb > thrdtot ) {
79  plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads");
80  return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
81  }
82  if( (thrdtot % thrdbypb) != 0 ) {
83  plasma_error(__func__, "number of thread per problem must divide the total number of thread");
84  return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
85  }
86 
87  /* quick return */
88  if( (me < 2) || (ne < 2) || (nprob < 1) ) {
89  return PLASMA_SUCCESS;
90  }
91 
92  GKK_getLeaderNbr(me, ne, &nleaders, &leaders);
93  nleaders *= 3;
94 
96  int *Tp = NULL;
97  int i, ipb;
98  int owner;
99 
100  Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger);
101  for (i=0; i<thrdtot; i++)
102  Tp[i] = 0;
103 
104  ipb = 0;
105 
106  /* First part with coarse parallelism */
107  if (nprob > ngrp) {
108  ipb = (nprob / ngrp)*ngrp;
109 
110  /* loop over leader */
111  if (thrdbypb > 1) {
112  for (i=0; i<nleaders; i+=3) {
113  /* assign this cycle to a thread */
114  owner = minloc(thrdbypb, Tp);
115 
116  /* assign it to owner */
117  Tp[owner] = Tp[owner] + leaders[i+1] * L;
118  leaders[i+2] = owner;
119  }
120 
121  GKK_BalanceLoad(thrdbypb, Tp, leaders, nleaders, L);
122  }
123  else {
124  for (i=0; i<nleaders; i+=3) {
125  Tp[0] = Tp[0] + leaders[i+1] * L;
126  leaders[i+2] = 0;
127  }
128  }
129 
130  /* shift in parallel */
131  for (i=0; i< (nprob/ngrp); i++) {
133  int, me,
134  int, ne,
135  int, L,
136  PLASMA_Complex64_t*, &(A[i*ngrp*me*ne*L]),
137  int *, leaders,
138  int, nleaders,
139  int, thrdbypb,
140  PLASMA_sequence*, sequence,
141  PLASMA_request*, request);
142  }
143  }
144 
145  /* Second part with fine parallelism */
146  if (ipb < nprob) {
147  for (i=0; i<thrdtot; i++)
148  Tp[i] = 0;
149 
150  if (thrdtot > 1) {
151  /* loop over leader */
152  for (i=0; i<nleaders; i+=3) {
153  /* assign this cycle to a thread */
154  owner = minloc(thrdtot, Tp);
155 
156  /* assign it to owner */
157  Tp[owner] = Tp[owner] + leaders[i+1] * L;
158  leaders[i+2] = owner;
159  }
160  GKK_BalanceLoad(thrdtot, Tp, leaders, nleaders, L);
161  }
162  else {
163  for (i=0; i<nleaders; i+=3) {
164  Tp[0] = Tp[0] + leaders[i+1] * L;
165  leaders[i+2] = 0;
166  }
167  }
168 
169  /* shift in parallel */
170  for (i=ipb; i<nprob; i++) {
172  int, me,
173  int, ne,
174  int, L,
175  PLASMA_Complex64_t*, &(A[i*me*ne*L]),
176  int *, leaders,
177  int, nleaders,
178  int, thrdtot,
179  PLASMA_sequence*, sequence,
180  PLASMA_request*, request);
181  }
182  }
183 
184  plasma_shared_free(plasma, Tp);
185  }
186  /* Dynamic scheduling */
187  else {
189  int, me,
190  int, ne,
191  int, L,
192  PLASMA_Complex64_t*, A,
193  int *, leaders,
194  int, nleaders,
195  int, nprob,
196  PLASMA_sequence*, sequence,
197  PLASMA_request*, request);
198  }
199 
200  free(leaders);
201 
202  return PLASMA_SUCCESS;
203 }
204 
232  PLASMA_sequence *sequence;
233  PLASMA_request *request;
234  PLASMA_Complex64_t *A, *Al, *W;
235  int locrnk, myrank;
236  int i, x, snix, cl, iprob;
237  int n, m, L, nleaders, thrdbypb;
238  int *leaders;
239  int64_t s, q;
240 
241  plasma_unpack_args_9(m, n, L, A, leaders, nleaders, thrdbypb, sequence, request);
242  if (sequence->status != PLASMA_SUCCESS)
243  return;
244 
245  myrank = PLASMA_RANK;
246  locrnk = myrank % thrdbypb;
247  iprob = myrank / thrdbypb;
248 
249  q = m * n - 1;
250  Al = &(A[iprob*m*n*L]);
251 
253 
254  /* shift cycles in parallel. */
255  /* each thread shifts the cycles it owns. */
256  for(i=0; i<nleaders; i+=3) {
257  if( leaders[i+2] == locrnk ) {
258  /* cycle #i belongs to this thread, so shift it */
259  memcpy(W, &(Al[leaders[i]*L]), L*sizeof(PLASMA_Complex64_t));
260  CORE_zshiftw(leaders[i], leaders[i+1], m, n, L, Al, W);
261  }
262  else if( leaders[i+2] == -2 ) {
263  /* cycle #i has been split, so shift in parallel */
264  x = leaders[i+1] / thrdbypb;
265  cl = x;
266  if( locrnk == 0 ) {
267  cl = leaders[i+1] - x * (thrdbypb - 1);
268  }
269  s = leaders[i];
270  snix = (s * modpow(n, locrnk*x, m * n - 1)) % q;
271 
272  /* copy the block at s*n^(thid*x) (snix) */
273  memcpy(W, &(Al[snix*L]), L*sizeof(PLASMA_Complex64_t));
274 
275  /* wait for peers to finish copying their blocks. */
276  plasma_barrier(plasma);
277 
278  /* shift the linear array. */
279  if( cl > 0 ) {
280  CORE_zshiftw(snix, cl, m, n, L, Al, W);
281  }
282  }
283  }
284 
285  plasma_private_free(plasma, W);
286 }
287 
288 
/*
 * plasma_pzshift_quark - dynamic-scheduling variant: submits the shift of
 * nprob consecutive sub-problems of A to QUARK as tasks.
 *
 * m, n, L    forwarded unchanged to QUARK_CORE_zshift; each sub-problem
 *            spans m*n*L elements of A.
 * A          base pointer; sub-problem iprob starts at A[iprob * m*n*L].
 * leaders    array walked in steps of 3; only the first entry of each
 *            triple (leaders[i]) is read by this function.
 * nleaders   number of entries in leaders (upper bound of the stride-3 loop).
 * nprob      number of sub-problems to process.
 * sequence   if its status is not PLASMA_SUCCESS nothing is submitted;
 *            its quark_sequence is attached to every task via task_flags.
 * request    not referenced in the visible body.
 *
 * NOTE(review): the declarations of `plasma` and `task_flags` are not
 * visible in this listing (listing lines 293-294 are absent) -- confirm
 * against the original file.
 */
289 void plasma_pzshift_quark(int m, int n, int L, PLASMA_Complex64_t *A,
290  int *leaders, int nleaders, int nprob,
291  PLASMA_sequence *sequence, PLASMA_request *request)
292 {
295  PLASMA_Complex64_t *Al;
296  int i, iprob, size;
297 
298  plasma = plasma_context_self();
 /* Do not submit anything if the sequence has already failed. */
299  if (sequence->status != PLASMA_SUCCESS)
300  return;
 /* Tie every task inserted below to this sequence's QUARK sequence. */
301  QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);
302 
 /* Number of elements in one sub-problem.
  * NOTE(review): computed in int; m*n*L (and iprob*size below) could
  * overflow for very large inputs -- confirm the intended ranges. */
303  size = m*n*L;
304 
305  for(iprob=0; iprob<nprob; iprob++) {
 /* Start of the current sub-problem inside A. */
306  Al = &(A[iprob*size]);
307 
 /* Task declaring the whole sub-problem (size elements at Al) as INOUT,
  * inserted before the shift tasks. Presumably acts as a dependency
  * fence on the region -- confirm against CORE_foo_quark. */
308  QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
309  sizeof(PLASMA_Complex64_t)*size, Al, INOUT,
310 #ifdef TRACE_IPT
311  13, "Foo In shift", VALUE | TASKLABEL,
312  4, "red", VALUE | TASKCOLOR,
313 #endif
314  0);
315 
316  /* shift cycles in parallel. */
 /* One QUARK_CORE_zshift task per leader triple, starting at leaders[i]. */
317  for(i=0; i<nleaders; i+=3) {
318  //assert( leaders[i+2] != -2 );
319  QUARK_CORE_zshift(plasma->quark, &task_flags,
320  leaders[i], m, n, L, Al);
321  }
322 
 /* Matching INOUT task on the same region, inserted after the shift
  * tasks of this sub-problem. */
323  QUARK_Insert_Task(plasma->quark, CORE_foo_quark, &task_flags,
324  sizeof(PLASMA_Complex64_t)*size, Al, INOUT,
325 #ifdef TRACE_IPT
326  14, "Foo Out shift", VALUE | TASKLABEL,
327  4, "red", VALUE | TASKCOLOR,
328 #endif
329  0);
330  }
331 }
332 
333