PLASMA  2.4.5
PLASMA - Parallel Linear Algebra for Scalable Multi-core Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
time_sgetri_tile.c
Go to the documentation of this file.
1 
6 #define _TYPE float
7 #define _PREC float
8 #define _LAMCH LAPACKE_slamch_work
9 
10 #define _NAME "PLASMA_sgetri_Tile"
11 /* See Lawn 41 page 120 */
12 #define _FMULS (FMULS_GETRF(M, N) + FMULS_GETRI( N ))
13 #define _FADDS (FADDS_GETRF(M, N) + FADDS_GETRI( N ))
14 
15 //#define GETRI_SYNC
16 
17 #include "./timing.c"
18 
19 /*------------------------------------------------------------------------
20  * Check the factorization of the matrix A2
21  */
22 #if 0
23 static int check_getri_factorization(PLASMA_desc *descA1, PLASMA_desc *descA2, int *IPIV)
24 {
25  int info_factorization;
26  float Rnorm, Anorm, Xnorm, Bnorm, result;
27  float *work = (float *)malloc((descA1->m)*sizeof(float));
28  float eps = LAPACKE_slamch_work('e');
29  PLASMA_desc *descB, *descX;
30  float *b = (float *)malloc((descA1->m)*sizeof(float));
31  float *x = (float *)malloc((descA1->m)*sizeof(float));
32 
33  PLASMA_Desc_Create(&descB, b, PlasmaRealFloat, descA1->mb, descA1->nb, descA1->bsiz,
34  descA1->m, 1, 0, 0, descA1->m, 1);
35  PLASMA_Desc_Create(&descX, x, PlasmaRealFloat, descA1->mb, descA1->nb, descA1->bsiz,
36  descA1->m, 1, 0, 0, descA1->m, 1);
37 
38  PLASMA_splrnt_Tile( descX, 537 );
39  PLASMA_slacpy_Tile( PlasmaUpperLower, descX, descB);
40 
41  PLASMA_sgetrs_Tile( PlasmaNoTrans, descA2, IPIV, descX );
42 
43  Xnorm = PLASMA_slange_Tile(PlasmaInfNorm, descX, work);
44  Anorm = PLASMA_slange_Tile(PlasmaInfNorm, descA1, work);
45  Bnorm = PLASMA_slange_Tile(PlasmaInfNorm, descB, work);
46 
48  (float)1., descA1, descX,
49  (float)-1., descB);
50 
51  Rnorm = PLASMA_slange_Tile(PlasmaInfNorm, descB, work);
52 
53  if (getenv("PLASMA_TESTING_VERBOSE"))
54  printf( "||A||_oo=%f\n||X||_oo=%f\n||B||_oo=%f\n||A X - B||_oo=%e\n", Anorm, Xnorm, Bnorm, Rnorm );
55 
56  result = Rnorm / ( (Anorm*Xnorm+Bnorm)*(descA1->m)*eps ) ;
57  printf("============\n");
58  printf("Checking the Residual of the solution \n");
59  printf("-- ||Ax-B||_oo/((||A||_oo||x||_oo+||B||_oo).N.eps) = %e \n", result);
60 
61  if ( isnan(Xnorm) || isinf(Xnorm) || isnan(result) || isinf(result) || (result > 60.0) ) {
62  printf("-- The factorization is suspicious ! \n");
63  info_factorization = 1;
64  }
65  else{
66  printf("-- The factorization is CORRECT ! \n");
67  info_factorization = 0;
68  }
69  free(x); free(b); free(work);
70  PLASMA_Desc_Destroy(&descB);
71  PLASMA_Desc_Destroy(&descX);
72 
73  return info_factorization;
74 }
75 #endif
76 
77 /*------------------------------------------------------------------------
78  * Check the accuracy of the computed inverse
79  */
80 
81 static int check_getri_inverse(PLASMA_desc *descA1, PLASMA_desc *descA2, int *IPIV, float *dparam )
82 {
83  float Rnorm, Anorm, Ainvnorm, result;
84  float *W = (float *)malloc(descA1->n*sizeof(float));
85  float *work = (float *)malloc(descA1->n*descA1->n*sizeof(float));
86  float eps = LAPACKE_slamch_work('e');
87  PLASMA_desc *descW;
88 
89  PLASMA_Desc_Create(&descW, work, PlasmaRealFloat, descA1->mb, descA1->nb, descA1->bsiz,
90  descA1->m, descA1->n, 0, 0, descA1->m, descA1->n);
91 
92  PLASMA_slaset_Tile( PlasmaUpperLower, (float)0., (float)1., descW);
94  (float)-1., descA2, descA1,
95  (float)1., descW);
96 
97  Anorm = PLASMA_slange_Tile(PlasmaInfNorm, descA1, W);
98  Ainvnorm = PLASMA_slange_Tile(PlasmaInfNorm, descA2, W);
99  Rnorm = PLASMA_slange_Tile(PlasmaInfNorm, descW, W);
100 
101  dparam[IPARAM_ANORM] = Anorm;
102  dparam[IPARAM_BNORM] = Ainvnorm;
103 
104  result = Rnorm / ( (Anorm*Ainvnorm)*descA1->m*eps ) ;
105  dparam[IPARAM_RES] = Rnorm;
106 
107  if ( isnan(Ainvnorm) || isinf(Ainvnorm) || isnan(result) || isinf(result) || (result > 60.0) ) {
108  dparam[IPARAM_XNORM] = -1.;
109  }
110  else{
111  dparam[IPARAM_XNORM] = 0.;
112  }
113 
114  PLASMA_Desc_Destroy(&descW);
115  free(W);
116  free(work);
117 
118  return PLASMA_SUCCESS;
119 }
120 
121 static int
122 RunTest(int *iparam, float *dparam, real_Double_t *t_)
123 {
124  PLASMA_desc descW;
125  int ret = 0;
126  PASTE_CODE_IPARAM_LOCALS( iparam );
127 
128  if ( M != N ) {
129  fprintf(stderr, "This timing works only with M == N\n");
130  return -1;
131  }
132 
133  /* Allocate Data */
134  PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, float, PlasmaRealFloat, LDA, N, N );
135  PASTE_CODE_ALLOCATE_MATRIX_TILE( descA2, check, float, PlasmaRealFloat, LDA, N, N );
136  PASTE_CODE_ALLOCATE_MATRIX( piv, 1, int, N, 1 );
137 
139  PLASMA_splrnt_Tile( descA, 3453 );
140 
141  if ( check ) {
142  PLASMA_slacpy_Tile( PlasmaUpperLower, descA, descA2 );
143  }
144 
145  /* PLASMA SGETRF / STRTRI / STRSMRV */
146  {
147 #if defined(TRACE_BY_SEQUENCE)
148  PLASMA_sequence *sequence[4];
152  PLASMA_REQUEST_INITIALIZER };
153 
154  PLASMA_Sequence_Create(&sequence[0]);
155  PLASMA_Sequence_Create(&sequence[1]);
156  PLASMA_Sequence_Create(&sequence[2]);
157  PLASMA_Sequence_Create(&sequence[3]);
158 
159  if ( ! iparam[IPARAM_ASYNC] ) {
160 
161  START_TIMING();
162  PLASMA_sgetrf_Tile_Async(descA, piv, sequence[0], &request[0]);
163  PLASMA_Sequence_Wait(sequence[0]);
164 
165  PLASMA_strtri_Tile_Async(PlasmaUpper, PlasmaNonUnit, descA, sequence[1], &request[1]);
166  PLASMA_Sequence_Wait(sequence[1]);
167 
169  (float) 1.0, descA, &descW, sequence[2], &request[2]);
170  PLASMA_Sequence_Wait(sequence[2]);
171 
172  PLASMA_slaswpc_Tile_Async(descA, 1, descA->m, piv, -1, sequence[3], &request[3]);
173  PLASMA_Sequence_Wait(sequence[3]);
174  STOP_TIMING();
175 
176  } else {
177 
178  START_TIMING();
179  PLASMA_sgetrf_Tile_Async( descA, piv, sequence[0], &request[0]);
181  descA, sequence[1], &request[1]);
183  (float) 1.0,
184  descA, &descW, sequence[2], &request[2]);
185  PLASMA_slaswpc_Tile_Async(descA, 1, descA->m, piv, -1,
186  sequence[3], &request[3]);
187 
188  /* Wait for everything */
189  PLASMA_Sequence_Wait(sequence[0]);
190  PLASMA_Sequence_Wait(sequence[1]);
191  PLASMA_Sequence_Wait(sequence[2]);
192  PLASMA_Sequence_Wait(sequence[3]);
193  STOP_TIMING();
194 
195  }
196 
197  PLASMA_Sequence_Destroy(sequence[0]);
198  PLASMA_Sequence_Destroy(sequence[1]);
199  PLASMA_Sequence_Destroy(sequence[2]);
200  PLASMA_Sequence_Destroy(sequence[3]);
201 
202 #else
203  if ( ! iparam[IPARAM_ASYNC] ) {
204 
205  START_TIMING();
206  PLASMA_sgetrf_Tile(descA, piv);
209  (float) 1.0, descA, &descW);
210  PLASMA_slaswpc_Tile(descA, 1, descA->m, piv, -1);
211  STOP_TIMING();
212 
213  } else {
214 
215  PLASMA_sequence *sequence;
218 
219  PLASMA_Sequence_Create(&sequence);
220 
221  START_TIMING();
222  PLASMA_sgetrf_Tile_Async(descA, piv, sequence, &request[0]);
223  PLASMA_sgetri_Tile_Async(descA, piv, &descW, sequence, &request[1]);
224  PLASMA_Sequence_Wait(sequence);
225  STOP_TIMING();
226 
227  PLASMA_Sequence_Destroy(sequence);
228  }
229 #endif
230  }
231 
232  /* Check the solution */
233  if ( check )
234  {
235  ret = check_getri_inverse(descA2, descA, piv, dparam);
236 
237  PASTE_CODE_FREE_MATRIX( descA2 );
238  }
239 
240  PASTE_CODE_FREE_MATRIX( descA );
241  free(descW.mat);
242 
243  return ret;
244 }
245