PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
gkkleader.c
Go to the documentation of this file.
1 
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <math.h>
22 #include <string.h>
23 #include "common.h"
24 #include "primes.h"
25 #include "gkkleader.h"
26 
47 int GKK_doublingtable(int x, int m, int emax, int *dt) {
48  int64_t y8, m8;
49  int i, numbits;
50  int z;
51 
52  /* Convert in int64 */
53  m8 = m;
54 
55  /* find the number of bits in emax */
56  numbits = 0;
57  z = emax;
58  while( z > 0 ) {
59  numbits = numbits + 1;
60  z = z / 2;
61  }
62 
63  if (numbits > PWR_MAXSIZE) {
64  plasma_error(__func__, "PWR_MAXSIZE too small");
66  }
67 
68  /* compute doubling table */
69  y8 = x;
70  dt[0] = x;
71  for (i=1; i < numbits; i++){
72  y8 = (y8*y8) % m8;
73  dt[i] = (int)y8;
74  }
75  return PLASMA_SUCCESS;
76 }
77 
102 int GKK_modpow(int *dt, int e, int m) {
103  int64_t y8, m8;
104  int z, i;
105 
106  m8 = m;
107  y8 = 1;
108  z = e;
109  i = 0;
110 
111  /* dt size is not tested because tested before in GKK_doublingtable */
112  while( z > 0 ) {
113  if( (z%2) == 1 )
114  y8 = (y8*dt[i]) % m8;
115  z = z / 2;
116  i++;
117  }
118  return (int)y8;
119 }
120 
149 #define GCAND_SIZE 46
150 int GKK_primroot(int p, int e, primedec_t *pr_p1, int t_p1) {
151  static int gcand[GCAND_SIZE] = {
152  2, 3, 5, 6, 7, 10, 11, 12, 13, 14,
153  15, 17, 18, 19, 20, 21, 22, 23, 24, 26,
154  28, 29, 30, 31, 33, 34, 35, 37, 38, 39,
155  41, 43, 44, 46, 47, 51, 52, 53, 55, 57,
156  58, 59, 60, 62, 69, 73 };
157 
158  int dt[PWR_MAXSIZE];
159  int i, j, p2, c, g;
160  int ok;
161 
162  p2 = p;
163  if( e >= 2 )
164  p2 = p*p;
165 
166  /* try candidates */
167  for(i=0; i < GCAND_SIZE; i++) {
168  g = gcand[i];
169  GKK_doublingtable(g, p2, p-1, dt);
170 
171  ok = 1;
172 
173  for(j=0; j<t_p1; j++) {
174  c = GKK_modpow(dt, (p-1) / pr_p1[j].p, p2);
175  if( (c % p) == 1 ) {
176  ok = 0;
177  break;
178  }
179  }
180  if( ok == 1 )
181  break;
182  }
183 
184  if( ok != 1 ) {
185  plasma_error(__func__, "failed to find a primitive root");
186  return -1;
187  }
188  /* check that g is a primitive root also for p^k for all k */
189  if( e >= 2 ) {
190  c = GKK_modpow(dt, p-1, p2);
191  if( c == 1 ) {
192  /* g is not a primitive root for p^k, but g+p is */
193  g = g + p;
194  }
195  }
196  return g;
197 }
198 
232 int GKK_multorder(int n, int p, int e, int pe, primedec_t *pr_p1, int t_p1) {
233  int fi[PRIME_MAXSIZE], dt[PWR_MAXSIZE];
234  int t, i, K1, c;
235  int K;
236 
237  /* append p^(e-1) to the prime decomposition of p-1 */
238  t = t_p1;
239  if( e > 1 ) {
240  pr_p1[t].p = p;
241  pr_p1[t].e = e-1;
242  pr_p1[t].pe = pe / t;
243  t = t_p1 + 1;
244  }
245 
246  GKK_doublingtable(n, pe, pe-1, dt);
247 
248  for (i=0; i<PRIME_MAXSIZE; i++)
249  fi[i] = pr_p1[i].e;
250 
251  K = pe - pe/p;
252  for(i=0; i<t; i++) {
253  while( fi[i] > 0 ) {
254  K1 = K / pr_p1[i].p;
255  c = GKK_modpow(dt, K1, pe);
256  if( c == 1 ) {
257  K = K1;
258  fi[i] = fi[i] - 1;
259  }
260  else {
261  break;
262  }
263  }
264  }
265  return K;
266 }
267 
318 void GKK_prepare(int q, int n, primedec_t *pr_q, int *t,
319  primedec_t **pr_pi1, int *t_pi1, int *gi,
320  int *Nif, int *Kif, int *dif) {
321  int i, f, p1f, tmp;
322 
323  /* factor q */
324  factor(q, pr_q, t);
325 
326  /* factor pi-1 for each pi in q */
327  for (i=0; i<*t; i++)
328  factor(pr_q[i].p-1, pr_pi1[i], &(t_pi1[i]));
329 
330  /*
331  * Compute the number of positive integers coprime
332  * to f, Ni(f) for f = 1, 2, ..., ei
333  */
334  for (i=0; i<*t; i++) {
335  tmp = PWR_MAXSIZE*i;
336  Nif[tmp] = pr_q[i].p - 1;
337 
338  for(f=1; f < pr_q[i].e; f++)
339  Nif[tmp+f] = pr_q[i].p * Nif[tmp+f-1];
340  }
341 
342  if( pr_q[0].p == 2 ) {
343  for(f=1; f < pr_q[0].e; f++)
344  Nif[f] = Nif[f] / 2;
345 
346  Nif[PWR_MAXSIZE*(*t)] = 1;
347  for(f=1; f < pr_q[0].e; f++)
348  Nif[PWR_MAXSIZE*(*t)+f] = 2;
349  }
350 
351  /*
352  * Case A: odd pi
353  */
354  for(i=0; i <*t; i++) {
355  if( pr_q[i].p != 2 ) {
356  tmp = PWR_MAXSIZE*i;
357 
358  /* compute Ki(ei) */
359  Kif[tmp+pr_q[i].e-1] = GKK_multorder(n, pr_q[i].p, pr_q[i].e, pr_q[i].pe, pr_pi1[i], t_pi1[i]);
360 
361  /* compute Ki(f) for f = 1, 2, ..., ei-1 */
362  for(f = pr_q[i].e-2; f>-1; f--) {
363  if( Kif[tmp+f+1] >= pr_q[i].p )
364  Kif[tmp+f] = Kif[tmp+f+1] / pr_q[i].p;
365  else
366  Kif[tmp+f] = Kif[tmp+f+1];
367  }
368 
369  /* compute di(f) for f = 1, 2, ..., ei */
370  for(f=0; f<pr_q[i].e; f++)
371  dif[tmp+f] = Nif[tmp+f] / Kif[tmp+f];
372 
373  /* compute primitive root gi for pi^ei */
374  if( dif[tmp+pr_q[i].e-1] == 1 )
375  /* special case where n is already a primitive root*/
376  gi[i] = n % pr_q[i].pe;
377  else
378  /* general case requiring a search for a primitive root */
379  gi[i] = GKK_primroot(pr_q[i].p, pr_q[i].e, pr_pi1[i], t_pi1[i]);
380  }
381  }
382 
383  /*
384  * Case B: even pi (only the first one)
385  */
386  if( pr_q[0].p == 2 ) {
387  /* reset primitive root */
388  gi[0] = 0;
389 
390  /* compute d1(f) for f = 1,2,...,e1 */
391  p1f = 2;
392  for (f=0; f<pr_q[0].e; f++) {
393  if( (n%4) == 1)
394  dif[f] = gcd( ( (n%p1f)-1)/4, Nif[f] );
395  else
396  dif[f] = gcd( (p1f-(n%p1f)-1)/4, Nif[f] );
397  p1f = p1f*2;
398  }
399 
400  /* compute K1(f) for f = 1,2,...,e1 */
401  for (f=0; f<pr_q[0].e; f++)
402  Kif[f] = Nif[f] / dif[f];
403 
404  /* compute K0(f) for f = 1,2,...,e1 */
405  tmp = PWR_MAXSIZE * (*t);
406  Kif[tmp] = 1;
407  for (f=1; f<pr_q[0].e; f++) {
408  Kif[tmp+f] = 2;
409  if( (n%4) == 1 )
410  Kif[tmp+f] = 1;
411  }
412 
413  /* compute d0(f) for f = 1,2,...,e1 */
414  for (f=0; f<pr_q[0].e; f++)
415  dif[tmp+f] = Nif[tmp+f] / Kif[tmp+f];
416  }
417 }
418 
462 void GKK_L(int t, primedec_t *pr_q, int *fi, int *Kif, int *dif,
463  int *Li, int *diLi, int *cl, int *nl) {
464  int i, lcl, lnl;
465 
466  /* compute cycle length (cl) and Li */
467  lcl = 1;
468  for (i=0; i<t; i++) {
469  if( fi[i] == 0 ) {
470  Li[i] = 1;
471  }
472  else {
473  Li[i] = gcd( Kif[i*PWR_MAXSIZE+fi[i]-1], lcl);
474  lcl = (lcl * Kif[i*PWR_MAXSIZE+fi[i]-1]) / Li[i];
475  }
476  }
477 
478  if( pr_q[0].p == 2 ) {
479  if( fi[0] == 0 ) {
480  Li[t] = 1;
481  }
482  else {
483  int tmp = Kif[t*PWR_MAXSIZE+fi[0]-1];
484 
485  Li[t] = gcd( tmp, lcl);
486  lcl = (lcl * tmp) / Li[t];
487  }
488  }
489  *cl = lcl;
490 
491  /* count number of cycles (nl) and compute di*Li */
492  lnl = 1;
493  for (i=0; i<t; i++) {
494  if( fi[i] != 0 ) {
495  diLi[i] = dif[i*PWR_MAXSIZE+fi[i]-1] * Li[i];
496  lnl = lnl * diLi[i];
497  }
498  else
499  diLi[i] = 0;
500  }
501  if( pr_q[0].p == 2 ) {
502  if( fi[0] != 0 ) {
503  diLi[t] = dif[t*PWR_MAXSIZE+fi[0]-1] * Li[t];
504  lnl = lnl * diLi[t];
505  }
506  else
507  diLi[t] = 0;
508  }
509  *nl = lnl;
510 
511 #ifdef DEBUG_IPT
512  printf("nl : %d\n", lnl);
513  printf("cl : %d\n", lcl);
514  printf("Li :");
515  for(i=0; i<t+1; i++)
516  printf(" %d", Li[i]);
517  printf("\n");
518  printf("diLi :");
519  for(i=0; i<t+1; i++)
520  printf(" %d", diLi[i]);
521  printf("\n");
522 #endif
523 }
524 
563 void GKK_precompute_terms(int q, primedec_t *pr_q, int t, int *gi,
564  int *diLi, int *rp, int *Mg, int nMg) {
565  int i, j, nMg_needed, ht;
566 
567  ht = t;
568  if( pr_q[0].p == 2 )
569  ht = t + 1;
570 
571  nMg_needed = diLi[0];
572  for(i=1; i<ht; i++)
573  nMg_needed += diLi[i];
574 
575  if( nMg_needed > nMg ) {
576  plasma_error(__func__, "the size of Mg is not large enough");
577  return;
578  }
579 
580  rp[0] = 0;
581  for(i=0; i<t; i++) {
582  if( pr_q[i].p != 2 ) {
583  Mg[rp[i]] = q / pr_q[i].pe;
584 
585  for(j=1; j<diLi[i]; j++)
586  Mg[rp[i]+j] = (Mg[rp[i]+j-1]*gi[i]) % q;
587  }
588  rp[i+1] = rp[i] + diLi[i];
589  }
590  if( pr_q[0].p == 2 ) {
591  Mg[rp[0]] = q / pr_q[0].pe;
592  for(j=1; j<diLi[0]; j++)
593  Mg[rp[0]+j] = (Mg[rp[0]+j-1]*5) % q;
594  }
595 
596 #ifdef DEBUG_IPT
597  printf("t : %d\n", t);
598  printf("ht : %d\n", ht);
599  printf("rp :");
600  for(i=0; i<ht; i++)
601  printf(" %d", rp[i]);
602  printf("\n");
603  printf("Mg :");
604  for(i=0; i<nMg; i++)
605  printf(" %d", Mg[i]);
606  printf("\n");
607 #endif
608 }
609 
636 void GKK_BalanceLoad(int thrdnbr, int *Tp, int *leaders, int nleaders, int L) {
637  int i, j, n1, nel;
638  double sumTi, maxTi;
639 
640  /* quick return */
641  if( thrdnbr == 1 )
642  return;
643 
644  sumTi = (double)sum(thrdnbr, Tp);
645  maxTi = (double)maxval(thrdnbr, Tp);
646 
647  /* check if the load is too imbalanced */
648  if( (1.0 - (sumTi / ((double)thrdnbr * maxTi))) > IMBALANCE_THRESHOLD ) {
649  /* split across cycle */
650  for (i=0; i<nleaders; i+=3) {
651  j = leaders[i+2];
652  nel = leaders[i+1];
653  if( (nel >= thrdnbr) &&
654  ((double)Tp[j] > (sumTi / ((double)thrdnbr * (1.0 - IMBALANCE_THRESHOLD)))) ) {
655  /* split */
656  n1 = (nel + thrdnbr - 1) / thrdnbr;
657  Tp[j] = Tp[j] - nel * L;
658 
659  for (j=0; j<thrdnbr; j++)
660  Tp[j] = Tp[j] + min(n1, nel - n1*(j-1));
661 
662  maxTi = (double)maxval(thrdnbr, Tp);
663  leaders[i+2] = -2;
664  }
665  }
666  }
667 }
668 
707 void GKK_output_tables(int m, int n, int q, primedec_t *pr_q, int t,
708  int *gi, int *Nif, int *Kif, int *dif) {
709  int i, f;
710 
711  fprintf(stdout, "Information from the GKK algorithm\n");
712  fprintf(stdout, "==================================\n");
713  fprintf(stdout, "m = %4d\n", m);
714  fprintf(stdout, "n = %4d\n", n);
715  fprintf(stdout, "q = %4d\n", q);
716  for (i=0; i<t; i++) {
717  fprintf(stdout, "==================================\n");
718  fprintf(stdout, " i = %4d\n", i+1 );
719  fprintf(stdout, " p = %4d\n", pr_q[i].p);
720  fprintf(stdout, " e = %4d\n", pr_q[i].e);
721  fprintf(stdout, " p^e = %4d\n", pr_q[i].pe);
722  if( pr_q[i].p != 2 )
723  fprintf(stdout, " g = %4d\n", gi[i]);
724  else
725  fprintf(stdout, "mod(n,4) = %4d\n", n%4);
726 
727  fprintf(stdout, "\n");
728  fprintf(stdout, " f | ");
729  for (f=0; f<pr_q[i].e; f++)
730  fprintf(stdout, "%4d ", f+1);
731  fprintf(stdout, "\n");
732  fprintf(stdout, "----------------------------------\n");
733 
734  fprintf(stdout, "Ni(f) | ");
735  for (f=0; f<pr_q[i].e; f++)
736  fprintf(stdout, "%4d ", Nif[i*PWR_MAXSIZE+f]);
737  fprintf(stdout, "\n");
738 
739  fprintf(stdout, "Ki(f) | ");
740  for (f=0; f<pr_q[i].e; f++)
741  fprintf(stdout, "%4d ", Kif[i*PWR_MAXSIZE+f]);
742  fprintf(stdout, "\n");
743 
744  fprintf(stdout, "di(f) | ");
745  for (f=0; f<pr_q[i].e; f++)
746  fprintf(stdout, "%4d ", dif[i*PWR_MAXSIZE+f]);
747  fprintf(stdout, "\n");
748  fprintf(stdout, "\n");
749  }
750 }
751 
752 
753 int GKK_getLeaderNbr(int me, int ne, int *nleaders, int **leaders) {
754 
756  primedec_t **pr_pi1 = NULL;
757  int *ptr;
758  int *t_pi1 = NULL; /* Number of primes in prime decomposition of each pi */
759  int *gi = NULL; /* Prime roots of pi^ei */
760  int *fi = NULL; /* Copy of ei-1 because it's the indice in Kif */
761  int *pifi = NULL; /* Copy of pi^ei */
762  int *rp = NULL; /* */
763  int *Li = NULL; /* gcd(Ki, lcm(Kh .. Kt)) with h=0/1 (q even or odd) */
764  int *si = NULL; /* */
765  int *diLi = NULL; /* di * Li */
766  int *Mg = NULL; /* */
767  int *vMg = NULL; /* */
768  int *Kif = NULL; /* */
769  int *Nif = NULL; /* */
770  int *dif = NULL; /* */
771 
772  int i, j, s;
773  int q, t = 0, ndiv, ht, div;
774  int cl, nl, nlead;
775 
776  ptr = (int *)malloc( ((3*PWR_MAXSIZE + 8)*PRIME_MAXSIZE + 4 + 2*SIZE_MG) * sizeof(int) );
777  t_pi1 = ptr; /* plasma_malloc(t_pi1, PRIME_MAXSIZE, int); */
778  gi = t_pi1 + PRIME_MAXSIZE; /* plasma_malloc(gi , PRIME_MAXSIZE, int); */
779  fi = gi + PRIME_MAXSIZE; /* plasma_malloc(fi , PRIME_MAXSIZE, int); */
780  pifi = fi + PRIME_MAXSIZE; /* plasma_malloc(pifi , PRIME_MAXSIZE, int); */
781  rp = pifi + PRIME_MAXSIZE; /* plasma_malloc(rp , PRIME_MAXSIZE+1, int); */
782  Li = rp + PRIME_MAXSIZE+1; /* plasma_malloc(Li , PRIME_MAXSIZE+1, int); */
783  si = Li + PRIME_MAXSIZE+1; /* plasma_malloc(si , PRIME_MAXSIZE+1, int); */
784  diLi = si + PRIME_MAXSIZE+1; /* plasma_malloc(diLi , PRIME_MAXSIZE+1, int); */
785  Kif = diLi + PRIME_MAXSIZE+1; /* plasma_malloc(Kif , PRIME_MAXSIZE*PWR_MAXSIZE, int); */
786  Nif = Kif + PRIME_MAXSIZE*PWR_MAXSIZE; /* plasma_malloc(Nif , PRIME_MAXSIZE*PWR_MAXSIZE, int); */
787  dif = Nif + PRIME_MAXSIZE*PWR_MAXSIZE; /* plasma_malloc(dif , PRIME_MAXSIZE*PWR_MAXSIZE, int); */
788  Mg = dif + PRIME_MAXSIZE*PWR_MAXSIZE; /* plasma_malloc(Mg , SIZE_MG, int); */
789  vMg = Mg + SIZE_MG; /* plasma_malloc(vMg , SIZE_MG, int); */
790 
791  pr_pi1 = (primedec_t**)malloc(PRIME_MAXSIZE*sizeof(primedec_t*));
792  for(i=0; i<PRIME_MAXSIZE; i++) {
793  pr_pi1[i] = (primedec_t*)malloc(PRIME_MAXSIZE*sizeof(primedec_t));
794  }
795  q = me*ne - 1;
796 
797  /* Fill-in pr_q, t, pr_pi1, t_pi1, gi, Nif, Kif and dif */
798  GKK_prepare(q, ne, pr_q, &t, pr_pi1, t_pi1, gi, Nif, Kif, dif);
799 #ifdef DEBUG_IPT
800  GKK_output_tables(me, ne, q, pr_q, t, gi, Nif, Kif, dif);
801 #endif
802 
803  ht = t;
804  if( (q % 2) == 0 )
805  ht = t + 1;
806 
807  ndiv = 1;
808  for(i=0; i<t; i++) {
809  fi[i] = pr_q[i].e;
810  pifi[i] = pr_q[i].pe;
811  ndiv = ndiv * (pr_q[i].e + 1);
812  }
813 
814  /*
815  * First loop to compute number of leaders
816  */
817  /* loop over divisors v of q */
818  nlead = 0;
819  for (div=0; div<ndiv-1; div++) {
820  GKK_L(t, pr_q, fi, Kif, dif, Li, diLi, &cl, &nl);
821 
822  nlead += nl;
823 
824  /* step to next divisor v of q (q/v = product of pifi) */
825  for(i=0; i<t; i++) {
826  if( fi[i] == 0 ) {
827  fi[i] = pr_q[i].e;
828  pifi[i] = pr_q[i].pe;
829  }
830  else {
831  fi[i] = fi[i] - 1;
832  pifi[i] = pifi[i] / pr_q[i].p;
833  break;
834  }
835  }
836  }
837 
838  *nleaders = nlead;
839  /*fprintf(stderr, "Number of leader : %ld => ", (long)nlead); */
840 
841  /*
842  * Second loop to store leaders
843  */
844  (*leaders) = (int*) malloc (nlead*3*sizeof(int));
845 
846  /* Restore fi and pifi */
847  for(i=0; i<t; i++) {
848  fi[i] = pr_q[i].e;
849  pifi[i] = pr_q[i].pe;
850  }
851 
852  /* loop over divisors v of q */
853  nlead = 0;
854  for (div=0; div<ndiv-1; div++) {
855  GKK_L(t, pr_q, fi, Kif, dif, Li, diLi, &cl, &nl);
856 
857  if( div == 0 ) {
858  GKK_precompute_terms(q, pr_q, t, gi, diLi, rp, Mg, SIZE_MG);
859  memcpy(vMg, Mg, SIZE_MG*sizeof(int));
860  }
861  else {
862  for(i=0; i<t; i++) {
863  if( pifi[i] != pr_q[i].pe ) {
864  for(j = rp[i]; j<rp[i]+diLi[i]; j++) {
865  vMg[j] = ((pr_q[i].pe / pifi[i])*Mg[j]) % q;
866  }
867  }
868  else {
869  memcpy(&(vMg[rp[i]]), &(Mg[rp[i]]), diLi[i]*sizeof(int));
870  }
871  }
872  }
873 
874  for (i=0;i<ht; i++)
875  si[i] = 0;
876  while (1) {
877  /* convert current leader from additive to multiplicative domain */
878  s = 0;
879  for (i=0; i<t; i++) {
880  if( diLi[i] == 0 )
881  continue;
882  s = (s + vMg[rp[i] + si[i]]) % q;
883  }
884 
885  /* assign it to owner */
886  (*leaders)[nlead ] = s;
887  (*leaders)[nlead+1] = cl;
888  //leaders[nleaders+2] = owner;
889  nlead += 3;
890 
891  nl = nl - 1;
892  if( nl == 0 )
893  break;
894 
895  /* step to next leader */
896  for(i=0; i<ht; i++) {
897  if( diLi[i] == 0 ) continue;
898  si[i] = si[i] + 1;
899  if( si[i] < diLi[i] ) {
900  if( (i == (ht-1)) && (ht != t) ) {
901  /* flip vMg for i=1 */
902  for(j=0; j<diLi[0]; j++) {
903  vMg[rp[0]+j] = (q-vMg[rp[0]+j]) % q;
904  }
905  }
906  break;
907  }
908  else
909  si[i] = 0;
910  }
911  }
912 
913  /* step to next divisor v of q (q/v = product of pifi) */
914  for(i=0; i<t; i++) {
915  if( fi[i] == 0 ) {
916  fi[i] = pr_q[i].e;
917  pifi[i] = pr_q[i].pe;
918  }
919  else {
920  fi[i] = fi[i] - 1;
921  pifi[i] = pifi[i] / pr_q[i].p;
922  break;
923  }
924  }
925  }
926 
927 #ifdef DEBUG_IPT
928  printf("nleaders : %d\n", *nleaders);
929  printf("leaders : ");
930  for(i=0; i<*nleaders; i++)
931  {
932  printf(" %d", (*leaders)[i*3]);
933  printf(" %d", (*leaders)[i*3+1]);
934  printf(" %d", (*leaders)[i*3+2]);
935  }
936  printf("\n");
937 #endif
938 
939  for(i=0; i<PRIME_MAXSIZE; i++)
940  free(pr_pi1[i]);
941  free(pr_pi1);
942  free(ptr);
943 
944  return PLASMA_SUCCESS;
945 }