001: /* ///////////////////////////////////////////////////////////////////////////////////////
002:  *  -- PLASMA --
003:  *     University of Tennessee
004:  */
005: #include "common.h"
006: 
007: #include <stdio.h>
008: #include <math.h>
009: #include <cblas.h>
010: #include "../src/lapack.h"
011: 
012: /* ///////////////////////////////////////////////////////////////////////////////////////
013:  *  Find the solution of a real system of linear equations using tile
014:  *  LU factorization ZCGESV computes the solution to a complex system
015:  *  of linear equations A * X = B, where A is an N-by-N matrix and X
016:  *  and B are N-by-NRHS matrices.a
017:  *
018:  *  IMPORTANT NOTICE: in its current state, this routine only intends
019:  *  to be a proof-of-concept. There are still some costly serial
020:  *  parts and one may NOT expect to achieve high performance.
021:  *
022:  *  ZCGESV first attempts to factorize the matrix in COMPLEX and use this
023:  *  factorization within an iterative refinement procedure to produce a
024:  *  solution with COMPLEX*16 normwise backward error quality (see below).
025:  *  If the approach fails the method switches to a COMPLEX*16
026:  *  factorization and solve.
027:  *
028:  *  The iterative refinement is not going to be a winning strategy if
029:  *  the ratio COMPLEX performance over COMPLEX*16 performance is too
030:  *  small. A reasonable strategy should take the number of right-hand
031:  *  sides and the size of the matrix into account. This might be done
032:  *  with a call to ILAENV in the future. Up to now, we always try
033:  *  iterative refinement.
034:  *
035:  *  The iterative refinement process is stopped if ITER > ITERMAX or
036:  *  for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
037:  *  where
038:  *  o ITER is the number of the current iteration in the
039:  *    iterative refinement process
040:  *  o RNRM is the infinity-norm of the residual
041:  *  o XNRM is the infinity-norm of the solution
042:  *  o ANRM is the infinity-operator-norm of the matrix A
043:  *  o EPS is the machine epsilon returned by DLAMCH('Epsilon').
044:  *  The values ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.
045:  */
046: int PLASMA_dcgesv(int N, int NRHS, double *A, int LDA, double *L, int *IPIV, double *B, int LDB, double *X, int LDX, int *ITER)
047: {
048:     int NB, NT, NTRHS;
049:     int status;
050: 
051:     PLASMA_Complex32_t *SAbdl;
052:     PLASMA_Complex32_t *SXbdl;
053:     PLASMA_Complex32_t *SLbdl;
054:     double *work;
055:     PLASMA_Complex32_t *swork;
056:     double *rwork;
057:     double *Abdl;
058:     double *Xbdl;
059:     double *Lbdl;
060:     plasma_context_t *plasma;
061: 
062:     const int itermax = 30;
063:     const double bwdmax = 1.0;
064:     const double negone = -1.0;
065:     const double one = 1.0;
066: 
067:     char norm='I';
068:     char all='A';
069: 
070:     int info;
071:     int i, iiter, ptsa, ptsx;
072:     double Anorm, cte, eps, Rnorm, Xnorm;
073: 
074:     *ITER=0;
075: 
076:     plasma = plasma_context_self();
077: 
078:     if (plasma == NULL) {
079:         plasma_fatal_error("PLASMA_dcgesv", "PLASMA not initialized");
080:         return PLASMA_ERR_NOT_INITIALIZED;
081:     }
082:     /* Check input arguments */
083:     if (N < 0) {
084:         plasma_error("PLASMA_dcgesv", "illegal value of N");
085:         return -1;
086:     }
087:     if (NRHS < 0) {
088:         plasma_error("PLASMA_dcgesv", "illegal value of NRHS");
089:         return -2;
090:     }
091:     if (LDA < max(1, N)) {
092:         plasma_error("PLASMA_dcgesv", "illegal value of LDA");
093:         return -4;
094:     }
095:     if (LDB < max(1, N)) {
096:         plasma_error("PLASMA_dcgesv", "illegal value of LDB");
097:         return -8;
098:     }
099:     if (LDX < max(1, N)) {
100:         plasma_error("PLASMA_dcgesv", "illegal value of LDX");
101:         return -10;
102:     }
103:     /* Quick return */
104:     if (min(N, NRHS) == 0)
105:         return PLASMA_SUCCESS;
106: 
107:     /* Tune NB & IB depending on M, N & NRHS; Set NBNBSIZE */
108:     status = plasma_tune(PLASMA_TUNE_ZCGESV, N, N, NRHS);
109:     if (status != PLASMA_SUCCESS) {
110:         plasma_error("PLASMA_dcgesv", "plasma_tune() failed");
111:         return status;
112:     }
113: 
114:     /* Set NT & NTRHS */
115:     NB = PLASMA_NB;
116:     NT = (N%NB==0) ? (N/NB) : (N/NB+1);
117:     NTRHS = (NRHS%NB==0) ? (NRHS/NB) : (NRHS/NB+1);
118: 
119:     /* Allocate memory for working arrays */
120:     work = (double *)plasma_shared_alloc(plasma, N*NRHS, PlasmaRealDouble);
121:     swork = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, N*(N+NRHS), PlasmaComplexFloat);
122:     rwork = (double *)plasma_shared_alloc(plasma, N, PlasmaRealDouble);
123:     /* Allocate memory for matrices in block layout */
124:     SAbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaComplexFloat);
125:     SLbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexFloat);
126:     SXbdl = (PLASMA_Complex32_t *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaComplexFloat);
127:     if (work == NULL || swork == NULL || rwork == NULL || SAbdl == NULL || SLbdl == NULL || SXbdl == NULL) {
128:         plasma_error("PLASMA_dcgesv", "plasma_shared_alloc() failed");
129:         plasma_shared_free(plasma, work);
130:         plasma_shared_free(plasma, swork);
131:         plasma_shared_free(plasma, rwork);
132:         plasma_shared_free(plasma, SAbdl);
133:         plasma_shared_free(plasma, SLbdl);
134:         plasma_shared_free(plasma, SXbdl);
135:         return PLASMA_ERR_OUT_OF_RESOURCES;
136:     }
137: 
138:     /* Compute some constants */
139:     Anorm = dlange(&norm, &N, &N, A, &LDA, rwork);
140:     eps = dlamch("Epsilon");
141:     cte = Anorm*eps*sqrt((double) N)*bwdmax;
142: 
143:     /* Set the indices ptsa, ptsx for referencing sa and sx in swork. */
144: 
145:     ptsa = 0;
146:     ptsx = ptsa + N*N;
147: 
148:     /* Convert B from double precision to single precision and store
149:        the result in SX. */
150: 
151:     zlag2c(&N, &NRHS, B, &LDB, swork+ptsx, &N, &info);
152: 
153:     /* Convert A from double precision to single precision and store
154:        the result in SA. */
155: 
156:     zlag2c(&N, &N, A, &LDA, swork+ptsa, &N, &info);
157: 
158:     PLASMA_desc descSA = plasma_desc_init(
159:         SAbdl, PlasmaComplexFloat,
160:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
161:         N, N, 0, 0, N, N);
162: 
163:     PLASMA_desc descSX = plasma_desc_init(
164:         SXbdl, PlasmaComplexFloat,
165:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
166:         N, NRHS, 0, 0, N, NRHS);
167: 
168:     PLASMA_desc descSL = plasma_desc_init(
169:         SLbdl, PlasmaComplexFloat,
170:         PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
171:         N, N, 0, 0, N, N);
172: 
173:     plasma_parallel_call_3(plasma_lapack_to_tile,
174:         PLASMA_Complex32_t*, swork+ptsa,
175:         int, N,
176:         PLASMA_desc, descSA);
177: 
178:     plasma_parallel_call_3(plasma_lapack_to_tile,
179:         PLASMA_Complex32_t*, swork+ptsx,
180:         int, N,
181:         PLASMA_desc, descSX);
182: 
183:     /* Clear IPIV and Lbdl */
184:     plasma_memzero(IPIV, NT*NT*PLASMA_NB, PlasmaInteger);
185:     plasma_memzero(SLbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaComplexFloat);
186: 
187:     /* Set INFO to ZERO */
188:     PLASMA_INFO = PLASMA_SUCCESS;
189: 
190:     /* Compute the LU factorization of SA */
191:     plasma_parallel_call_3(plasma_pcgetrf,
192:         PLASMA_desc, descSA,
193:         PLASMA_desc, descSL,
194:         int*, IPIV);
195: 
196:     if (PLASMA_INFO == PLASMA_SUCCESS)
197:     {
198:         /* Solve the system SA*SX = SB */
199: 
200:         /* Forward substitution */
201:         plasma_parallel_call_4(plasma_pctrsmpl,
202:             PLASMA_desc, descSA,
203:             PLASMA_desc, descSX,
204:             PLASMA_desc, descSL,
205:             int*, IPIV);
206: 
207:         /* Backward substitution */
208:         plasma_parallel_call_7(plasma_pctrsm,
209:             PLASMA_enum, PlasmaLeft,
210:             PLASMA_enum, PlasmaUpper,
211:             PLASMA_enum, PlasmaNoTrans,
212:             PLASMA_enum, PlasmaNonUnit,
213:             PLASMA_Complex32_t, 1.0,
214:             PLASMA_desc, descSA,
215:             PLASMA_desc, descSX);
216: 
217:         /* Come back to lapack layout for A */
218:         plasma_parallel_call_3(plasma_tile_to_lapack,
219:             PLASMA_desc, descSA,
220:             PLASMA_Complex32_t*, swork+ptsa,
221:             int, N);
222: 
223:         /* Come back to lapack layout for X */
224:         plasma_parallel_call_3(plasma_tile_to_lapack,
225:             PLASMA_desc, descSX,
226:             PLASMA_Complex32_t*, swork+ptsx,
227:             int, N);
228:     }
229: 
230:     /* Convert SX back to double precision */
231:     clag2z( &N, &NRHS, swork+ptsx, &N, X, &LDX, &info );
232: 
233:     /* Compute R = B - AX (R is WORK). */
234:     dlacpy( &all, &N, &NRHS, B, &LDB, work, &N);
235:     cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, NRHS, N, (negone), A, LDA, X, LDX, (one), work, N);
236: 
237:     /* Check whether the NRHS normwise backward errors satisfy the
238:        stopping criterion. If yes return. Note that ITER=0 (already set). */
239:     Xnorm = dlange(&norm, &N, &NRHS, X, &LDX, rwork);
240:     Rnorm = dlange(&norm, &N, &NRHS, work, &N, rwork);
241:     if (Rnorm < Xnorm * cte){
242:       /* The NRHS normwise backward errors satisfy the
243:          stopping criterion. We are good to exit. */
244:       return PLASMA_INFO;;
245:     }
246: 
247:     for (iiter = 0; iiter < itermax; iiter++){
248: 
249:       /* Convert R (in WORK) from double precision to single precision
250:          and store the result in SX. */
251:       zlag2c( &N, &NRHS, work, &N, swork+ptsx, &N, &info );
252: 
253:       /* Set X to bdl layout again */
254:       plasma_parallel_call_3(plasma_lapack_to_tile,
255:         PLASMA_Complex32_t*, swork+ptsx,
256:         int, N,
257:         PLASMA_desc, descSX);
258: 
259:       /* Solve the system SA*SX = SB */
260: 
261:       /* Forward substitution */
262:       plasma_parallel_call_4(plasma_pctrsmpl,
263:             PLASMA_desc, descSA,
264:             PLASMA_desc, descSX,
265:             PLASMA_desc, descSL,
266:             int*, IPIV);
267: 
268:       /* Backward substitution */
269:       plasma_parallel_call_7(plasma_pctrsm,
270:             PLASMA_enum, PlasmaLeft,
271:             PLASMA_enum, PlasmaUpper,
272:             PLASMA_enum, PlasmaNoTrans,
273:             PLASMA_enum, PlasmaNonUnit,
274:             PLASMA_Complex32_t, 1.0,
275:             PLASMA_desc, descSA,
276:             PLASMA_desc, descSX);
277: 
278:       /* Come back to lapack layout for X */
279:       plasma_parallel_call_3(plasma_tile_to_lapack,
280:             PLASMA_desc, descSX,
281:             PLASMA_Complex32_t*, swork+ptsx,
282:             int, N);
283: 
284:       /* Convert SX back to double precision and update the current
285:          iterate. */
286:       clag2z( &N, &NRHS, swork+ptsx, &N, work, &N, &info );
287: 
288:       for (i = 0; i < NRHS; i++){
289:         cblas_daxpy(N, (one), work+i*N, 1, X+i*LDX, 1);
290:       }
291: 
292:       /* Compute R = B - AX (R is WORK). */
293:       dlacpy( &all, &N, &NRHS, B, &LDB, work, &N);
294:       cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, NRHS, N, (negone), A, LDA, X, LDX, (one), work, N);
295: 
296:       /* Check whether the NRHS normwise backward errors satisfy the
297:          stopping criterion. If yes, set ITER=IITER>0 and return. */
298:       Xnorm = dlange(&norm, &N, &NRHS, X, &LDX, rwork);
299:       Rnorm = dlange(&norm, &N, &NRHS, work, &N, rwork);
300:       if (Rnorm < Xnorm * cte){
301:         /* The NRHS normwise backward errors satisfy the
302:            stopping criterion. We are good to exit. */
303:         *ITER = iiter;
304:         plasma_shared_free(plasma, SAbdl);
305:         plasma_shared_free(plasma, SLbdl);
306:         plasma_shared_free(plasma, SXbdl);
307:         plasma_shared_free(plasma, work);
308:         plasma_shared_free(plasma, swork);
309:         plasma_shared_free(plasma, rwork);
310:         return PLASMA_INFO;
311:       }
312: 
313:     }
314: 
315:     /* We have performed ITER=itermax iterations and never satisified
316:        the stopping criterion, set up the ITER flag accordingly and
317:        follow up on double precision routine. */
318: 
319:     *ITER = -itermax - 1;
320: 
321:     plasma_shared_free(plasma, SAbdl);
322:     plasma_shared_free(plasma, SLbdl);
323:     plasma_shared_free(plasma, SXbdl);
324:     plasma_shared_free(plasma, work);
325:     plasma_shared_free(plasma, swork);
326:     plasma_shared_free(plasma, rwork);
327: 
328:     /* Single-precision iterative refinement failed to converge to a
329:        satisfactory solution, so we resort to double precision. */
330: 
331:     /* Allocate memory for matrices in block layout */
332:     Abdl = (double *)plasma_shared_alloc(plasma, NT*NT*PLASMA_NBNBSIZE, PlasmaRealDouble);
333:     Lbdl = (double *)plasma_shared_alloc(plasma, NT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);
334:     Xbdl = (double *)plasma_shared_alloc(plasma, NT*NTRHS*PLASMA_NBNBSIZE, PlasmaRealDouble);
335:     if (Abdl == NULL || Lbdl == NULL || Xbdl == NULL) {
336:         plasma_error("PLASMA_dcgesv", "plasma_shared_alloc() failed");
337:         plasma_shared_free(plasma, Abdl);
338:         plasma_shared_free(plasma, Lbdl);
339:         plasma_shared_free(plasma, Xbdl);
340:         return PLASMA_ERR_OUT_OF_RESOURCES;
341:     }
342: 
343:     dlacpy( &all, &N, &NRHS, B, &LDB, X, &LDX );
344: 
345:     PLASMA_desc descA = plasma_desc_init(
346:         Abdl, PlasmaRealDouble,
347:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
348:         N, N, 0, 0, N, N);
349: 
350:     PLASMA_desc descX = plasma_desc_init(
351:         Xbdl, PlasmaRealDouble,
352:         PLASMA_NB, PLASMA_NB, PLASMA_NBNBSIZE,
353:         N, NRHS, 0, 0, N, NRHS);
354: 
355:     PLASMA_desc descL = plasma_desc_init(
356:         Lbdl, PlasmaRealDouble,
357:         PLASMA_IB, PLASMA_NB, PLASMA_IBNBSIZE,
358:         N, N, 0, 0, N, N);
359: 
360:     plasma_parallel_call_3(plasma_lapack_to_tile,
361:         double*, A,
362:         int, LDA,
363:         PLASMA_desc, descA);
364: 
365:     plasma_parallel_call_3(plasma_lapack_to_tile,
366:         double*, X,
367:         int, LDX,
368:         PLASMA_desc, descX);
369: 
370:     /* Clear IPIV and Lbdl */
371:     plasma_memzero(IPIV, NT*NT*PLASMA_NB, PlasmaInteger);
372:     plasma_memzero(Lbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);
373: 
374:     /* Set INFO to ZERO */
375:     PLASMA_INFO = PLASMA_SUCCESS;
376: 
377:     plasma_parallel_call_3(plasma_pdgetrf,
378:         PLASMA_desc, descA,
379:         PLASMA_desc, descL,
380:         int*, IPIV);
381: 
382:     /* Return L to the user */
383:     plasma_memcpy(L, Lbdl, NT*NT*PLASMA_IBNBSIZE, PlasmaRealDouble);
384: 
385:     if (PLASMA_INFO == PLASMA_SUCCESS)
386:     {
387:         plasma_parallel_call_4(plasma_pdtrsmpl,
388:             PLASMA_desc, descA,
389:             PLASMA_desc, descX,
390:             PLASMA_desc, descL,
391:             int*, IPIV);
392: 
393:         plasma_parallel_call_7(plasma_pdtrsm,
394:             PLASMA_enum, PlasmaLeft,
395:             PLASMA_enum, PlasmaUpper,
396:             PLASMA_enum, PlasmaNoTrans,
397:             PLASMA_enum, PlasmaNonUnit,
398:             double, 1.0,
399:             PLASMA_desc, descA,
400:             PLASMA_desc, descX);
401: 
402:         plasma_parallel_call_3(plasma_tile_to_lapack,
403:             PLASMA_desc, descA,
404:             double*, A,
405:             int, LDA);
406: 
407:         plasma_parallel_call_3(plasma_tile_to_lapack,
408:             PLASMA_desc, descX,
409:             double*, X,
410:             int, LDX);
411:     }
412: 
413:     plasma_shared_free(plasma, Abdl);
414:     plasma_shared_free(plasma, Lbdl);
415:     plasma_shared_free(plasma, Xbdl);
416:     return PLASMA_INFO;
417: 
418: }