Actual source code: fnexp.c

slepc-3.16.2 2022-02-01
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Exponential function  exp(x)
 12: */

 14: #include <slepc/private/fnimpl.h>
 15: #include <slepcblaslapack.h>

 17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
 18: {
 20:   *y = PetscExpScalar(x);
 21:   return(0);
 22: }

 24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
 25: {
 27:   *y = PetscExpScalar(x);
 28:   return(0);
 29: }

 31: #define MAX_PADE 6
 32: #define SWAP(a,b,t) {t=a;a=b;b=t;}

 34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
 35: {
 36:   PetscErrorCode    ierr;
 37:   PetscBLASInt      n=0,ld,ld2,*ipiv,info,inc=1;
 38:   PetscInt          m,j,k,sexp;
 39:   PetscBool         odd;
 40:   const PetscInt    p=MAX_PADE;
 41:   PetscReal         c[MAX_PADE+1],s,*rwork;
 42:   PetscScalar       scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
 43:   PetscScalar       *Ba,*As,*A2,*Q,*P,*W,*aux;
 44:   const PetscScalar *Aa;

 47:   MatDenseGetArrayRead(A,&Aa);
 48:   MatDenseGetArray(B,&Ba);
 49:   MatGetSize(A,&m,NULL);
 50:   PetscBLASIntCast(m,&n);
 51:   ld  = n;
 52:   ld2 = ld*ld;
 53:   P   = Ba;
 54:   PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
 55:   PetscArraycpy(As,Aa,ld2);

 57:   /* Pade' coefficients */
 58:   c[0] = 1.0;
 59:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

 61:   /* Scaling */
 62:   s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
 63:   PetscLogFlops(1.0*n*n);
 64:   if (s>0.5) {
 65:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
 66:     scale = PetscPowRealInt(2.0,-sexp);
 67:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
 68:     PetscLogFlops(1.0*n*n);
 69:   } else sexp = 0;

 71:   /* Horner evaluation */
 72:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
 73:   PetscLogFlops(2.0*n*n*n);
 74:   PetscArrayzero(Q,ld2);
 75:   PetscArrayzero(P,ld2);
 76:   for (j=0;j<n;j++) {
 77:     Q[j+j*ld] = c[p];
 78:     P[j+j*ld] = c[p-1];
 79:   }

 81:   odd = PETSC_TRUE;
 82:   for (k=p-1;k>0;k--) {
 83:     if (odd) {
 84:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
 85:       SWAP(Q,W,aux);
 86:       for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
 87:       odd = PETSC_FALSE;
 88:     } else {
 89:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
 90:       SWAP(P,W,aux);
 91:       for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
 92:       odd = PETSC_TRUE;
 93:     }
 94:     PetscLogFlops(2.0*n*n*n);
 95:   }
 96:   /*if (odd) {
 97:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
 98:     SWAP(Q,W,aux);
 99:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
100:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
101:     SlepcCheckLapackInfo("gesv",info);
102:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
103:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
104:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
105:   } else {*/
106:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
107:     SWAP(P,W,aux);
108:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
109:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
110:     SlepcCheckLapackInfo("gesv",info);
111:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
112:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
113:   /*}*/
114:   PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

116:   for (k=1;k<=sexp;k++) {
117:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
118:     PetscArraycpy(P,W,ld2);
119:   }
120:   if (P!=Ba) { PetscArraycpy(Ba,P,ld2); }
121:   PetscLogFlops(2.0*n*n*n*sexp);

123:   PetscFree6(Q,W,As,A2,rwork,ipiv);
124:   MatDenseRestoreArrayRead(A,&Aa);
125:   MatDenseRestoreArray(B,&Ba);
126:   return(0);
127: }

129: /*
130:  * Set scaling factor (s) and Pade degree (k,m)
131:  */
132: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
133: {
135:   if (nrm>1) {
136:     if      (nrm<200)  {*s = 4; *k = 5; *m = *k-1;}
137:     else if (nrm<1e4)  {*s = 4; *k = 4; *m = *k+1;}
138:     else if (nrm<1e6)  {*s = 4; *k = 3; *m = *k+1;}
139:     else if (nrm<1e9)  {*s = 3; *k = 3; *m = *k+1;}
140:     else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
141:     else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
142:     else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
143:     else               {*s = 1; *k = 1; *m = *k+1;}
144:   } else { /* nrm<1 */
145:     if       (nrm>0.5)  {*s = 4; *k = 4; *m = *k-1;}
146:     else  if (nrm>0.3)  {*s = 3; *k = 4; *m = *k-1;}
147:     else  if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
148:     else  if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
149:     else  if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
150:     else  if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
151:     else  if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
152:     else  if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
153:     else                {*s = 0; *k = 1; *m = 0;}
154:   }
155:   return(0);
156: }

158: #if defined(PETSC_HAVE_COMPLEX)
159: /*
160:  * Partial fraction form coefficients.
161:  * If query, the function returns the size necessary to store the coefficients.
162:  */
163: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
164: {
165:   PetscInt i;
166:   const PetscComplex /* m == k+1 */
167:     p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
168:                -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
169:                 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
170:                 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
171:                -2.733432894659307e+02                                },
172:     p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
173:                 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
174:                 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
175:                 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
176:                 6.286704751729261e+00                               },
177:     p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
178:                -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
179:                 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
180:                 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
181:     p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
182:                 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
183:                 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
184:                 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
185:     p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
186:                 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
187:                -1.829749817484586e+01                                },
188:     p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
189:                 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
190:                 3.637834252744491e+00                                },
191:     p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
192:                 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
193:     p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
194:                 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
195:   const PetscComplex /* m == k-1 */
196:     m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
197:                -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
198:                 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
199:                 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
200:     m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
201:                 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
202:                 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
203:                 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
204:     m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
205:                 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
206:                -1.734353918633177e+02                                },
207:     m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
208:                 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
209:                 5.648485971016893e+00                                },
210:     m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
211:                 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
212:     m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
213:                 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
214:   const PetscScalar /* m == k-1 */
215:     m1remain5[2] = { 2.000000000000000e-01,  9.800000000000000e+00},
216:     m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
217:     m1remain3[2] = { 3.333333333333333e-01,  5.666666666666667e+00},
218:     m1remain2[2] = {-0.5,                   -3.5},
219:     remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
220:     remain2[3] = {1.0/2.0, 1, 1};

223:   if (query) { /* query about buffer's size */
224:     if (m==k+1) {
225:       *remain = 0;
226:       *r = *q = k+1;
227:       return(0); /* quick return */
228:     }
229:     if (m==k-1) {
230:       *remain = 2;
231:       if (k==5) *r = *q = 4;
232:       else if (k==4) *r = *q = 3;
233:       else if (k==3) *r = *q = 2;
234:       else if (k==2) *r = *q = 1;
235:     }
236:     if (m==0) {
237:       *r = *q = 0;
238:       *remain = k+1;
239:     }
240:   } else {
241:     if (m==k+1) {
242:       if (k==4) {
243:         for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
244:       } else if (k==3) {
245:         for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
246:       } else if (k==2) {
247:         for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
248:       } else if (k==1) {
249:         for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
250:       }
251:       return(0); /* quick return */
252:     }
253:     if (m==k-1) {
254:       if (k==5) {
255:         for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
256:         for (i=0;i<2;i++) remain[i] = m1remain5[i];
257:       } else if (k==4) {
258:         for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
259:         for (i=0;i<2;i++) remain[i] = m1remain4[i];
260:       } else if (k==3) {
261:         for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
262:       } else if (k==2) {
263:         r[0] = -13.5; q[0] = 3;
264:         for (i=0;i<2;i++) remain[i] = m1remain2[i];
265:       }
266:     }
267:     if (m==0) {
268:       r = q = 0;
269:       if (k==3) {
270:         for (i=0;i<4;i++) remain[i] = remain3[i];
271:       } else if (k==2) {
272:         for (i=0;i<3;i++) remain[i] = remain2[i];
273:       }
274:     }
275:   }
276:   return(0);
277: }

279: /*
280:  * Product form coefficients.
281:  * If query, the function returns the size necessary to store the coefficients.
282:  */
283: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
284: {
285:   PetscInt i;
286:   const PetscComplex /* m == k+1 */
287:   p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
288:              -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
289:              -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
290:              -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
291:   p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
292:               3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
293:               6.286704751729261e+00                                ,
294:               5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
295:               5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
296:   p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
297:              -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
298:              -5.648485971016893e+00                                },
299:   p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
300:               3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
301:               4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
302:               4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
303:   p1p2[2] = {-4.00000000000000e+00  + 2.000000000000000e+00*PETSC_i,
304:              -4.00000000000000e+00  - 2.000000000000000e+00*PETSC_i},
305:   p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
306:               2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
307:               3.637834252744491e+00                               },
308:   p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
309:               2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
310:   const PetscComplex /* m == k-1 */
311:   m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
312:              -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
313:              -6.286704751729261e+00                                ,
314:              -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
315:              -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
316:   m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
317:               5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
318:               6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
319:               6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
320:   m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
321:              -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
322:              -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
323:              -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
324:   m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
325:               4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
326:               5.648485971016893e+00                                },
327:   m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
328:              -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
329:              -3.637834252744491e+00                                },
330:   m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
331:               4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
332:   m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
333:              -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};

336:   if (query) {
337:     if (m == k+1) {
338:       *mult = 1;
339:       *p = k;
340:       *q = k+1;
341:       return(0);
342:     }
343:     if (m==k-1) {
344:       *mult = 1;
345:       *p = k;
346:       *q = k-1;
347:     }
348:   } else {
349:     if (m == k+1) {
350:       *mult = PetscPowInt(-1,m);
351:       *mult *= m;
352:       if (k==4) {
353:         for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
354:         q[4] = p1q4[4];
355:       } else if (k==3) {
356:         for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
357:         q[3] = p1q3[3];
358:       } else if (k==2) {
359:         for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
360:         q[2] = p1q2[2];
361:       } else if (k==1) {
362:         p[0] = -3;
363:         for (i=0;i<2;i++) q[i] = p1q1[i];
364:       }
365:       return(0);
366:     }
367:     if (m==k-1) {
368:       *mult = PetscPowInt(-1,m);
369:       *mult /= k;
370:       if (k==5) {
371:         for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
372:         p[4] = m1p5[4];
373:       } else if (k==4) {
374:         for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
375:         p[3] = m1p4[3];
376:       } else if (k==3) {
377:         for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
378:         p[2] = m1p3[2];
379:       } else if (k==2) {
380:         for (i=0;i<2;i++) p[i] = m1p2[i];
381:         q[0] = 3;
382:       }
383:     }
384:   }
385:   return(0);
386: }
387: #endif /* PETSC_HAVE_COMPLEX */

389: #if defined(PETSC_USE_COMPLEX)
390: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
391: {
392:   PetscInt i;

395:   *result=PETSC_TRUE;
396:   for (i=0;i<n&&*result;i++) {
397:     if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
398:   }
399:   return(0);
400: }
401: #endif

403: /*
404:  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
405:  * and Yuji Nakatsukasa
406:  *
407:  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
408:  *     Approximation for the Matrix Exponential",
409:  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
410:  *     https://doi.org/10.1137/15M1027553
411:  */
412: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
413: {
414: #if !defined(PETSC_HAVE_COMPLEX)
416:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This function requires C99 or C++ complex support");
417: #else
418:   PetscInt          i,j,n_,s,k,m,mod;
419:   PetscBLASInt      n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
420:   PetscReal         nrm,shift=0.0;
421: #if defined(PETSC_USE_COMPLEX)
422:   PetscReal         *rwork=NULL;
423: #endif
424:   PetscComplex      *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
425:   PetscScalar       *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
426:   const PetscScalar *Aa;
427:   PetscErrorCode    ierr;
428:   PetscBool         isreal,flg;
429:   PetscBLASInt      query=-1;
430:   PetscScalar       work1,*work;

433:   MatGetSize(A,&n_,NULL);
434:   PetscBLASIntCast(n_,&n);
435:   MatDenseGetArrayRead(A,&Aa);
436:   MatDenseGetArray(B,&Ba);
437:   Ba2 = Ba;
438:   PetscBLASIntCast(n*n,&n2);

440:   PetscMalloc2(n2,&sMaux,n2,&Maux);
441:   Maux2 = Maux;
442:   PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
443:   if (!flg) {
444:     PetscMalloc2(n,&wr,n,&wi);
445:     PetscArraycpy(sMaux,Aa,n2);
446:     /* estimate rightmost eigenvalue and shift A with it */
447: #if !defined(PETSC_USE_COMPLEX)
448:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
449:     SlepcCheckLapackInfo("geev",info);
450:     PetscBLASIntCast((PetscInt)work1,&lwork);
451:     PetscMalloc1(lwork,&work);
452:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
453:     PetscFree(work);
454: #else
455:     PetscArraycpy(Maux,Aa,n2);
456:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
457:     SlepcCheckLapackInfo("geev",info);
458:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
459:     PetscMalloc2(2*n,&rwork,lwork,&work);
460:     PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
461:     PetscFree2(rwork,work);
462: #endif
463:     SlepcCheckLapackInfo("geev",info);
464:     PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);

466:     shift = PetscRealPart(wr[0]);
467:     for (i=1;i<n;i++) {
468:       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
469:     }
470:     PetscFree2(wr,wi);
471:   }
472:   /* shift so that largest real part is (about) 0 */
473:   PetscArraycpy(sMaux,Aa,n2);
474:   if (shift) {
475:     for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
476:     PetscLogFlops(1.0*n);
477:   }
478: #if defined(PETSC_USE_COMPLEX)
479:   PetscArraycpy(Maux,Aa,n2);
480:   if (shift) {
481:     for (i=0;i<n;i++) Maux[i+i*n] -= shift;
482:     PetscLogFlops(1.0*n);
483:   }
484: #endif

486:   /* estimate norm(A) and select the scaling factor */
487:   nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
488:   PetscLogFlops(1.0*n*n);
489:   sexpm_params(nrm,&s,&k,&m);
490:   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
491:     if (shift) expshift = PetscExpReal(shift);
492:     for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
493:     if (shift) {
494:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
495:       PetscLogFlops(1.0*(n+n2));
496:     } else {
497:       PetscLogFlops(1.0*n);
498:     }
499:     PetscArraycpy(Ba,sMaux,n2);
500:     PetscFree2(sMaux,Maux);
501:     MatDenseRestoreArrayRead(A,&Aa);
502:     MatDenseRestoreArray(B,&Ba);
503:     return(0); /* quick return */
504:   }

506:   PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
507:   expmA2 = expmA; RR2 = RR;
508:   /* scale matrix */
509: #if !defined(PETSC_USE_COMPLEX)
510:   for (i=0;i<n2;i++) {
511:     As[i] = sMaux[i];
512:   }
513: #else
514:   PetscArraycpy(As,sMaux,n2);
515: #endif
516:   scale = 1.0/PetscPowRealInt(2.0,s);
517:   PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
518:   SlepcLogFlopsComplex(1.0*n2);

520:   /* evaluate Pade approximant (partial fraction or product form) */
521:   if (fn->method==3 || !m) { /* partial fraction */
522:     getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
523:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
524:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
525:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
526:     PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
527:     getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);

529:     PetscArrayzero(expmA,n2);
530: #if !defined(PETSC_USE_COMPLEX)
531:     isreal = PETSC_TRUE;
532: #else
533:     getisreal(n2,Maux,&isreal);
534: #endif
535:     if (isreal) {
536:       rsizediv2 = irsize/2;
537:       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
538:         PetscArraycpy(Maux,As,n2);
539:         PetscArrayzero(RR,n2);
540:         for (j=0;j<n;j++) {
541:           Maux[j+j*n] -= p[2*i];
542:           RR[j+j*n] = r[2*i];
543:         }
544:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
545:         SlepcCheckLapackInfo("gesv",info);
546:         for (j=0;j<n2;j++) {
547:           expmA[j] += RR[j] + PetscConj(RR[j]);
548:         }
549:         /* loop(n) + gesv + loop(n2) */
550:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
551:       }

553:       mod = ipsize % 2;
554:       if (mod) {
555:         PetscArraycpy(Maux,As,n2);
556:         PetscArrayzero(RR,n2);
557:         for (j=0;j<n;j++) {
558:           Maux[j+j*n] -= p[ipsize-1];
559:           RR[j+j*n] = r[irsize-1];
560:         }
561:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
562:         SlepcCheckLapackInfo("gesv",info);
563:         for (j=0;j<n2;j++) {
564:           expmA[j] += RR[j];
565:         }
566:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
567:       }
568:     } else { /* complex */
569:       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
570:         PetscArraycpy(Maux,As,n2);
571:         PetscArrayzero(RR,n2);
572:         for (j=0;j<n;j++) {
573:           Maux[j+j*n] -= p[i];
574:           RR[j+j*n] = r[i];
575:         }
576:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
577:         SlepcCheckLapackInfo("gesv",info);
578:         for (j=0;j<n2;j++) {
579:           expmA[j] += RR[j];
580:         }
581:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
582:       }
583:     }
584:     for (i=0;i<iremainsize;i++) {
585:       if (!i) {
586:         PetscArrayzero(RR,n2);
587:         for (j=0;j<n;j++) {
588:           RR[j+j*n] = remainterm[iremainsize-1];
589:         }
590:       } else {
591:         PetscArraycpy(RR,As,n2);
592:         for (j=1;j<i;j++) {
593:           PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
594:           SWAP(RR,Maux,aux);
595:           SlepcLogFlopsComplex(2.0*n*n*n);
596:         }
597:         PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
598:         SlepcLogFlopsComplex(1.0*n2);
599:       }
600:       for (j=0;j<n2;j++) {
601:         expmA[j] += RR[j];
602:       }
603:       SlepcLogFlopsComplex(1.0*n2);
604:     }
605:     PetscFree3(r,p,remainterm);
606:   } else { /* product form, default */
607:     getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
608:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
609:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
610:     PetscMalloc2(irsize,&rootp,ipsize,&rootq);
611:     getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);

613:     PetscArrayzero(expmA,n2);
614:     for (i=0;i<n;i++) { /* initialize */
615:       expmA[i+i*n] = 1.0;
616:     }
617:     minlen = PetscMin(irsize,ipsize);
618:     for (i=0;i<minlen;i++) {
619:       PetscArraycpy(RR,As,n2);
620:       for (j=0;j<n;j++) {
621:         RR[j+j*n] -= rootp[i];
622:       }
623:       PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
624:       SWAP(expmA,Maux,aux);
625:       PetscArraycpy(RR,As,n2);
626:       for (j=0;j<n;j++) {
627:         RR[j+j*n] -= rootq[i];
628:       }
629:       PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
630:       SlepcCheckLapackInfo("gesv",info);
631:       /* loop(n) + gemm + loop(n) + gesv */
632:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
633:     }
634:     /* extra numerator */
635:     for (i=minlen;i<irsize;i++) {
636:       PetscArraycpy(RR,As,n2);
637:       for (j=0;j<n;j++) {
638:         RR[j+j*n] -= rootp[i];
639:       }
640:       PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
641:       SWAP(expmA,Maux,aux);
642:       SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
643:     }
644:     /* extra denominator */
645:     for (i=minlen;i<ipsize;i++) {
646:       PetscArraycpy(RR,As,n2);
647:       for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
648:       PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
649:       SlepcCheckLapackInfo("gesv",info);
650:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
651:     }
652:     PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
653:     SlepcLogFlopsComplex(1.0*n2);
654:     PetscFree2(rootp,rootq);
655:   }

657: #if !defined(PETSC_USE_COMPLEX)
658:   for (i=0;i<n2;i++) {
659:     Ba2[i] = PetscRealPartComplex(expmA[i]);
660:   }
661: #else
662:   PetscArraycpy(Ba2,expmA,n2);
663: #endif

665:   /* perform repeated squaring */
666:   for (i=0;i<s;i++) { /* final squaring */
667:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
668:     SWAP(Ba2,sMaux,saux);
669:     PetscLogFlops(2.0*n*n*n);
670:   }
671:   if (Ba2!=Ba) {
672:     PetscArraycpy(Ba,Ba2,n2);
673:     sMaux = Ba2;
674:   }
675:   if (shift) {
676:     expshift = PetscExpReal(shift);
677:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
678:     PetscLogFlops(1.0*n2);
679:   }

681:   /* restore pointers */
682:   Maux = Maux2; expmA = expmA2; RR = RR2;
683:   PetscFree2(sMaux,Maux);
684:   PetscFree4(expmA,As,RR,piv);
685:   MatDenseRestoreArrayRead(A,&Aa);
686:   MatDenseRestoreArray(B,&Ba);
687:   return(0);
688: #endif
689: }

691: #define SMALLN 100

693: /*
694:  * Function needed to compute optimal parameters (required workspace is 3*n*n)
695:  */
696: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
697: {
698:   PetscScalar    *Ascaled=work;
699:   PetscReal      nrm,alpha,beta,rwork[1];
700:   PetscInt       t;
701:   PetscBLASInt   i,j;

705:   beta = PetscPowReal(coeff,1.0/(2*m+1));
706:   for (i=0;i<n;i++)
707:     for (j=0;j<n;j++)
708:       Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
709:   nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
710:   PetscLogFlops(2.0*n*n);
711:   SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha);
712:   alpha /= nrm;
713:   t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
714:   PetscFunctionReturn(t);
715: }

717: /*
718:  * Compute scaling parameter (s) and order of Pade approximant (m)  (required workspace is 4*n*n)
719:  */
720: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
721: {
722:   PetscErrorCode  ierr;
723:   PetscScalar     sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
724:   PetscReal       d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
725:   PetscBLASInt    n_=0,n2,one=1;
726:   PetscRandom     rand;
727:   const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11,  /* backward error function */
728:                                2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
729:   const PetscReal theta[5] = { 1.495585217958292e-002,    /* m = 3  */
730:                                2.539398330063230e-001,    /* m = 5  */
731:                                9.504178996162932e-001,    /* m = 7  */
732:                                2.097847961257068e+000,    /* m = 9  */
733:                                5.371920351148152e+000 };  /* m = 13 */

736:   *s = 0;
737:   *m = 13;
738:   PetscBLASIntCast(n,&n_);
739:   PetscRandomCreate(PETSC_COMM_SELF,&rand);
740:   d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
741:   if (d4==0.0) { /* safeguard for the case A = 0 */
742:     *m = 3;
743:     goto done;
744:   }
745:   d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
746:   PetscLogFlops(2.0*n*n);
747:   eta1 = PetscMax(d4,d6);
748:   if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
749:     *m = 3;
750:     goto done;
751:   }
752:   if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
753:     *m = 5;
754:     goto done;
755:   }
756:   if (n<SMALLN) {
757:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
758:     d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
759:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
760:   } else {
761:     SlepcNormAm(n_,Apowers[2],2,work,rand,&d8);
762:     d8 = PetscPowReal(d8,1.0/8.0);
763:   }
764:   eta3 = PetscMax(d6,d8);
765:   if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
766:     *m = 7;
767:     goto done;
768:   }
769:   if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
770:     *m = 9;
771:     goto done;
772:   }
773:   if (n<SMALLN) {
774:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
775:     d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
776:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
777:   } else {
778:     SlepcNormAm(n_,Apowers[1],5,work,rand,&d10);
779:     d10 = PetscPowReal(d10,1.0/10.0);
780:   }
781:   eta4 = PetscMax(d8,d10);
782:   eta5 = PetscMin(eta3,eta4);
783:   *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
784:   if (*s) {
785:     Ascaled = work+3*n*n;
786:     n2 = n_*n_;
787:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
788:     sfactor = PetscPowRealInt(2.0,-(*s));
789:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
790:     PetscLogFlops(1.0*n*n);
791:   } else Ascaled = A;
792:   *s += ell(n_,Ascaled,coeff[4],13,work,rand);
793: done:
794:   PetscRandomDestroy(&rand);
795:   return(0);
796: }

798: /*
799:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
800:  *
801:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
802:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
803:  */
804: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
805: {
806:   PetscErrorCode    ierr;
807:   PetscBLASInt      n_=0,n2,*ipiv,info,one=1;
808:   PetscInt          n,m,j,s;
809:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
810:   PetscScalar       *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
811:   const PetscScalar *Aa,*c;
812:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
813:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
814:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
815:   const PetscScalar c9[10]  = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
816:                                 2162160, 110880, 3960, 90, 1 };
817:   const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
818:                                 1187353796428800.0,  129060195264000.0,   10559470521600.0,
819:                                 670442572800.0,      33522128640.0,       1323241920.0,
820:                                 40840800,          960960,            16380,  182,  1 };

823:   MatDenseGetArrayRead(A,&Aa);
824:   MatDenseGetArray(B,&Ba);
825:   MatGetSize(A,&n,NULL);
826:   PetscBLASIntCast(n,&n_);
827:   n2 = n_*n_;
828:   PetscMalloc2(8*n*n,&work,n,&ipiv);

830:   /* Matrix powers */
831:   Apowers[0] = work;                  /* Apowers[0] = A   */
832:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
833:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
834:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
835:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */

837:   PetscArraycpy(Apowers[0],Aa,n2);
838:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
839:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
840:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
841:   PetscLogFlops(6.0*n*n*n);

843:   /* Compute scaling parameter and order of Pade approximant */
844:   expm_params(n,Apowers,&s,&m,Apowers[4]);

846:   if (s) { /* rescale */
847:     for (j=0;j<4;j++) {
848:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
849:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
850:     }
851:     PetscLogFlops(4.0*n*n);
852:   }

854:   /* Evaluate the Pade approximant */
855:   switch (m) {
856:     case 3:  c = c3;  break;
857:     case 5:  c = c5;  break;
858:     case 7:  c = c7;  break;
859:     case 9:  c = c9;  break;
860:     case 13: c = c13; break;
861:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
862:   }
863:   P = Ba;
864:   Q = Apowers[4] + n*n;
865:   W = Q + n*n;
866:   switch (m) {
867:     case 3:
868:     case 5:
869:     case 7:
870:     case 9:
871:       if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
872:       PetscArrayzero(P,n2);
873:       PetscArrayzero(Q,n2);
874:       for (j=0;j<n;j++) {
875:         P[j+j*n] = c[1];
876:         Q[j+j*n] = c[0];
877:       }
878:       for (j=m;j>=3;j-=2) {
879:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
880:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
881:         PetscLogFlops(4.0*n*n);
882:       }
883:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
884:       PetscLogFlops(2.0*n*n*n);
885:       SWAP(P,W,aux);
886:       break;
887:     case 13:
888:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
889:               + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
890:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
891:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
892:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
893:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
894:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
895:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
896:       PetscArrayzero(P,n2);
897:       for (j=0;j<n;j++) P[j+j*n] = c[1];
898:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
899:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
900:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
901:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
902:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
903:       PetscLogFlops(7.0*n*n+2.0*n*n*n);
904:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
905:               + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
906:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
907:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
908:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
909:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
910:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
911:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
912:       PetscArrayzero(Q,n2);
913:       for (j=0;j<n;j++) Q[j+j*n] = c[0];
914:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
915:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
916:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
917:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
918:       PetscLogFlops(7.0*n*n);
919:       break;
920:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
921:   }
922:   PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
923:   PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
924:   SlepcCheckLapackInfo("gesv",info);
925:   PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
926:   for (j=0;j<n;j++) P[j+j*n] += 1.0;
927:   PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);

929:   /* Squaring */
930:   for (j=1;j<=s;j++) {
931:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
932:     SWAP(P,W,aux);
933:   }
934:   if (P!=Ba) { PetscArraycpy(Ba,P,n2); }
935:   PetscLogFlops(2.0*n*n*n*s);

937:   PetscFree2(work,ipiv);
938:   MatDenseRestoreArrayRead(A,&Aa);
939:   MatDenseRestoreArray(B,&Ba);
940:   return(0);
941: }

943: #if defined(PETSC_HAVE_CUDA)
944: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
945: #include <slepccublas.h>

947: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
948: {
950:   PetscBLASInt   n=0,ld,ld2,*d_ipiv,*d_info,info,one=1,zero=0;
951:   PetscInt       m,k,sexp;
952:   PetscBool      odd;
953:   const PetscInt p=MAX_PADE;
954:   PetscReal      c[MAX_PADE+1],s;
955:   PetscScalar    scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
956:   PetscScalar    *Aa,*Ba;
957:   PetscScalar    *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
958:   cublasHandle_t cublasv2handle;
959:   cublasStatus_t cberr;
960:   cudaError_t    cerr;

963:   PetscCUDAInitializeCheck(); /* For CUDA event timers */
964:   PetscCUBLASGetHandle(&cublasv2handle);
965:   MatDenseGetArray(A,&Aa);
966:   MatDenseGetArray(B,&Ba);
967:   MatGetSize(A,&m,NULL);
968:   PetscBLASIntCast(m,&n);
969:   ld  = n;
970:   ld2 = ld*ld;

972:   cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
973:   cerr = cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
974:   cerr = cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
975:   cerr = cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
976:   cerr = cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
977:   cerr = cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld);CHKERRCUDA(cerr);
978:   cerr = cudaMalloc((void **)&d_info,sizeof(PetscBLASInt));CHKERRCUDA(cerr);
979:   cerr = cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*));CHKERRCUDA(cerr);
980:   cerr = cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*));CHKERRCUDA(cerr);

982:   PetscMalloc1(1,&ppP);
983:   PetscMalloc1(1,&ppQ);

985:   cerr = cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
986:   d_P = d_Ba;
987:   PetscLogGpuTimeBegin();

989:   /* Pade' coefficients */
990:   c[0] = 1.0;
991:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

993:   /* Scaling */
994:   cberr = cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);CHKERRCUBLAS(cberr);
995:   if (s>0.5) {
996:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
997:     scale = PetscPowRealInt(2.0,-sexp);
998:     cberr = cublasXscal(cublasv2handle,ld2,&scale,d_As,one);CHKERRCUBLAS(cberr);
999:     PetscLogGpuFlops(1.0*n*n);
1000:   } else sexp = 0;

1002:   /* Horner evaluation */
1003:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);CHKERRCUBLAS(cberr);
1004:   PetscLogGpuFlops(2.0*n*n*n);
1005:   cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1006:   cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1007:   set_diagonal(n,d_Q,ld,c[p]);CHKERRQ(cerr);
1008:   set_diagonal(n,d_P,ld,c[p-1]);CHKERRQ(cerr);

1010:   odd = PETSC_TRUE;
1011:   for (k=p-1;k>0;k--) {
1012:     if (odd) {
1013:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1014:       SWAP(d_Q,d_W,aux);
1015:       shift_diagonal(n,d_Q,ld,c[k-1]);CHKERRQ(cerr);
1016:       odd = PETSC_FALSE;
1017:     } else {
1018:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1019:       SWAP(d_P,d_W,aux);
1020:       shift_diagonal(n,d_P,ld,c[k-1]);CHKERRQ(cerr);
1021:       odd = PETSC_TRUE;
1022:     }
1023:     PetscLogGpuFlops(2.0*n*n*n);
1024:   }
1025:   if (odd) {
1026:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1027:     SWAP(d_Q,d_W,aux);
1028:     cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);

1030:     ppQ[0] = d_Q;
1031:     ppP[0] = d_P;
1032:     cerr = cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1033:     cerr = cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

1035:     cberr = cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);CHKERRCUBLAS(cberr);
1036:     cerr = cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1037:     if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
1038:     if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
1039: #if defined (CUDA_VERSION) && CUDA_VERSION >= 5050
1040:     cberr = cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);CHKERRCUBLAS(cberr);
1041:     if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
1042:     if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
1043: #else
1044:     SETERRQ(communicator,PETSC_ERR_LIB,"cublasXgetrsBatched needs CUDA >= 7");
1045: #endif
1046:     cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1047:     shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1048:     cberr = cublasXscal(cublasv2handle,ld2,&smone,d_P,one);CHKERRCUBLAS(cberr);
1049:   } else {
1050:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1051:     SWAP(d_P,d_W,aux);
1052:     cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);

1054:     ppQ[0] = d_Q;
1055:     ppP[0] = d_P;
1056:     cerr = cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1057:     cerr = cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

1059:     cberr = cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);CHKERRCUBLAS(cberr);
1060:     cerr = cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1061:     if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
1062:     if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
1063: #if defined (CUDA_VERSION) && CUDA_VERSION >= 5050
1064:     cberr = cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);CHKERRCUBLAS(cberr);
1065:     if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
1066:     if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
1067: #else
1068:     SETERRQ(communicator,PETSC_ERR_LIB,"cublasXgetrsBatched needs CUDA >= 7");
1069: #endif
1070:     cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1071:     shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1072:   }
1073:   PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

1075:   for (k=1;k<=sexp;k++) {
1076:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1077:     cerr = cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1078:   }
1079:   if (d_P!=d_Ba) {
1080:     cerr = cudaMemcpy(Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1081:   } else {
1082:     cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1083:   }
1084:   PetscLogGpuFlops(2.0*n*n*n*sexp);

1086:   PetscLogGpuTimeEnd();
1087:   cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1088:   cerr = cudaFree(d_Q);CHKERRCUDA(cerr);
1089:   cerr = cudaFree(d_W);CHKERRCUDA(cerr);
1090:   cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1091:   cerr = cudaFree(d_A2);CHKERRCUDA(cerr);
1092:   cerr = cudaFree(d_ipiv);CHKERRCUDA(cerr);
1093:   cerr = cudaFree(d_info);CHKERRCUDA(cerr);
1094:   cerr = cudaFree(d_ppP);CHKERRCUDA(cerr);
1095:   cerr = cudaFree(d_ppQ);CHKERRCUDA(cerr);

1097:   PetscFree(ppP);
1098:   PetscFree(ppQ);

1100:   MatDenseRestoreArray(A,&Aa);
1101:   MatDenseRestoreArray(B,&Ba);
1102:   return(0);
1103: }

1105: #if defined(PETSC_HAVE_MAGMA)
1106: #include <slepcmagma.h>

1108: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
1109: {
1111:   PetscBLASInt   n=0,ld,ld2,*piv,info,one=1,zero=0;
1112:   PetscInt       m,k,sexp;
1113:   PetscBool      odd;
1114:   const PetscInt p=MAX_PADE;
1115:   PetscReal      c[MAX_PADE+1],s;
1116:   PetscScalar    scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1117:   PetscScalar    *Aa,*Ba;
1118:   PetscScalar    *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
1119:   cublasHandle_t cublasv2handle;
1120:   cublasStatus_t cberr;
1121:   cudaError_t    cerr;
1122:   magma_int_t    mierr;

1125:   PetscCUDAInitializeCheck(); /* For CUDA event timers */
1126:   PetscCUBLASGetHandle(&cublasv2handle);
1127:   magma_init();
1128:   MatDenseGetArray(A,&Aa);
1129:   MatDenseGetArray(B,&Ba);
1130:   MatGetSize(A,&m,NULL);
1131:   PetscBLASIntCast(m,&n);
1132:   ld  = n;
1133:   ld2 = ld*ld;

1135:   cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1136:   cerr = cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1137:   cerr = cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1138:   cerr = cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);
1139:   cerr = cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);CHKERRCUDA(cerr);

1141:   PetscMalloc1(n,&piv);

1143:   cerr = cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1144:   d_P = d_Ba;
1145:   PetscLogGpuTimeBegin();

1147:   /* Pade' coefficients */
1148:   c[0] = 1.0;
1149:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

1151:   /* Scaling */
1152:   cberr = cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);CHKERRCUBLAS(cberr);
1153:   PetscLogGpuFlops(1.0*n*n);

1155:   if (s>0.5) {
1156:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1157:     scale = PetscPowRealInt(2.0,-sexp);
1158:     cberr = cublasXscal(cublasv2handle,ld2,&scale,d_As,one);CHKERRCUBLAS(cberr);
1159:     PetscLogGpuFlops(1.0*n*n);
1160:   } else sexp = 0;

1162:   /* Horner evaluation */
1163:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);CHKERRCUBLAS(cberr);
1164:   PetscLogGpuFlops(2.0*n*n*n);
1165:   cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1166:   cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*ld2);CHKERRCUDA(cerr);
1167:   set_diagonal(n,d_Q,ld,c[p]);CHKERRQ(cerr);
1168:   set_diagonal(n,d_P,ld,c[p-1]);CHKERRQ(cerr);

1170:   odd = PETSC_TRUE;
1171:   for (k=p-1;k>0;k--) {
1172:     if (odd) {
1173:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1174:       SWAP(d_Q,d_W,aux);
1175:       shift_diagonal(n,d_Q,ld,c[k-1]);CHKERRQ(cerr);
1176:       odd = PETSC_FALSE;
1177:     } else {
1178:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1179:       SWAP(d_P,d_W,aux);
1180:       shift_diagonal(n,d_P,ld,c[k-1]);CHKERRQ(cerr);
1181:       odd = PETSC_TRUE;
1182:     }
1183:     PetscLogGpuFlops(2.0*n*n*n);
1184:   }
1185:   if (odd) {
1186:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1187:     SWAP(d_Q,d_W,aux);
1188:     cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1189:     mmagma_xgesv_gpu(n,n,d_Q,ld,piv,d_P,ld,&info);CHKERRMAGMA(mierr);
1190:     if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);
1191:     cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1192:     shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1193:     cberr = cublasXscal(cublasv2handle,ld2,&smone,d_P,one);CHKERRCUBLAS(cberr);
1194:   } else {
1195:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1196:     SWAP(d_P,d_W,aux);
1197:     cberr = cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);
1198:     mmagma_xgesv_gpu(n,n,d_Q,ld,piv,d_P,ld,&info);CHKERRMAGMA(mierr);
1199:     if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);
1200:     cberr = cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1201:     shift_diagonal(n,d_P,ld,sone);CHKERRQ(cerr);
1202:   }
1203:   PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

1205:   for (k=1;k<=sexp;k++) {
1206:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);CHKERRCUBLAS(cberr);
1207:     cerr = cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1208:   }
1209:   if (d_P!=d_Ba) {
1210:     cerr = cudaMemcpy(Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1211:   } else {
1212:     cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1213:   }
1214:   PetscLogGpuFlops(2.0*n*n*n*sexp);

1216:   PetscLogGpuTimeEnd();
1217:   cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1218:   cerr = cudaFree(d_Q);CHKERRCUDA(cerr);
1219:   cerr = cudaFree(d_W);CHKERRCUDA(cerr);
1220:   cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1221:   cerr = cudaFree(d_A2);CHKERRCUDA(cerr);
1222:   PetscFree(piv);

1224:   MatDenseRestoreArray(A,&Aa);
1225:   MatDenseRestoreArray(B,&Ba);
1226:   magma_finalize();
1227:   return(0);
1228: }

1230: /*
1231:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1232:  *
1233:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
1234:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1235:  */
1236: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
1237: {
1238:   PetscErrorCode    ierr;
1239:   PetscBLASInt      n_=0,n2,*ipiv,info,one=1;
1240:   PetscInt          n,m,j,s,zero=0;
1241:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1242:   PetscScalar       *Aa,*Ba,*d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
1243:   const PetscScalar *c;
1244:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
1245:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
1246:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1247:   const PetscScalar c9[10]  = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1248:     2162160, 110880, 3960, 90, 1 };
1249:   const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1250:     1187353796428800,  129060195264000,   10559470521600,
1251:     670442572800,      33522128640,       1323241920,
1252:     40840800,          960960,            16380,  182,  1 };
1253:   cublasHandle_t    cublasv2handle;
1254:   cublasStatus_t    cberr;
1255:   cudaError_t       cerr;
1256:   magma_int_t       mierr;

1259:   PetscCUDAInitializeCheck(); /* For CUDA event timers */
1260:   PetscCUBLASGetHandle(&cublasv2handle);
1261:   magma_init();
1262:   MatDenseGetArray(A,&Aa);
1263:   MatDenseGetArray(B,&Ba);
1264:   MatGetSize(A,&n,NULL);
1265:   PetscBLASIntCast(n,&n_);
1266:   n2 = n_*n_;
1267:   PetscMalloc2(8*n*n,&work,n,&ipiv);
1268:   cerr = cudaMalloc((void**)&d_work,8*n*n*sizeof(PetscScalar));CHKERRCUDA(ierr);
1269:   cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*n*n);CHKERRCUDA(cerr);

1271:   PetscLogGpuTimeBegin();

1273:   /* Matrix powers */
1274:   Apowers[0] = work;                  /* Apowers[0] = A   */
1275:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
1276:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
1277:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
1278:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */
1279:   /* Matrix powers on device */
1280:   d_Apowers[0] = d_work;                /* d_Apowers[0] = A   */
1281:   d_Apowers[1] = d_Apowers[0] + n*n;    /* d_Apowers[1] = A^2 */
1282:   d_Apowers[2] = d_Apowers[1] + n*n;    /* d_Apowers[2] = A^4 */
1283:   d_Apowers[3] = d_Apowers[2] + n*n;    /* d_Apowers[3] = A^6 */
1284:   d_Apowers[4] = d_Apowers[3] + n*n;    /* d_Apowers[4] = A^8 */

1286:   cerr = cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(ierr);
1287:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_);CHKERRCUBLAS(cberr);
1288:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_);CHKERRCUBLAS(cberr);
1289:   cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_);CHKERRCUBLAS(cberr);
1290:   PetscLogGpuFlops(6.0*n*n*n);

1292:   cerr = cudaMemcpy(Apowers[0],d_Apowers[0],4*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(ierr);
1293:   /* Compute scaling parameter and order of Pade approximant */
1294:   expm_params(n,Apowers,&s,&m,Apowers[4]);

1296:   if (s) { /* rescale */
1297:     for (j=0;j<4;j++) {
1298:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1299:       cberr = cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one);CHKERRCUBLAS(cberr);
1300:     }
1301:     PetscLogGpuFlops(4.0*n*n);
1302:   }

1304:   /* Evaluate the Pade approximant */
1305:   switch (m) {
1306:     case 3:  c = c3;  break;
1307:     case 5:  c = c5;  break;
1308:     case 7:  c = c7;  break;
1309:     case 9:  c = c9;  break;
1310:     case 13: c = c13; break;
1311:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1312:   }
1313:   d_P = d_Ba;
1314:   d_Q = d_Apowers[4] + n*n;
1315:   d_W = d_Q + n*n;
1316:   switch (m) {
1317:     case 3:
1318:     case 5:
1319:     case 7:
1320:     case 9:
1321:       if (m==9) {cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_);CHKERRCUBLAS(cberr);}
1322:       cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1323:       cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1324:       set_diagonal(n,d_P,n,c[1]);CHKERRQ(cerr);
1325:       set_diagonal(n,d_Q,n,c[0]);CHKERRQ(cerr);
1326:       for (j=m;j>=3;j-=2) {
1327:         cberr = cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one);CHKERRCUBLAS(cberr);
1328:         cberr = cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one);CHKERRCUBLAS(cberr);
1329:         PetscLogGpuFlops(4.0*n*n);
1330:       }
1331:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1332:       PetscLogGpuFlops(2.0*n*n*n);
1333:       SWAP(d_P,d_W,aux);
1334:       break;
1335:     case 13:
1336:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1337:           + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
1338:       cerr = cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1339:       cberr = cublasXscal(cublasv2handle,n2,&c[13],d_P,one);CHKERRCUBLAS(cberr);
1340:       cberr = cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one);CHKERRCUBLAS(cberr);
1341:       cberr = cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one);CHKERRCUBLAS(cberr);
1342:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1343:       PetscLogGpuFlops(5.0*n*n+2.0*n*n*n);

1345:       cerr = cudaMemset(d_P,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1346:       set_diagonal(n,d_P,n,c[1]);CHKERRQ(cerr);
1347:       cberr = cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one);CHKERRCUBLAS(cberr);
1348:       cberr = cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one);CHKERRCUBLAS(cberr);
1349:       cberr = cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one);CHKERRCUBLAS(cberr);
1350:       cberr = cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one);CHKERRCUBLAS(cberr);
1351:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_);CHKERRCUBLAS(cberr);
1352:       PetscLogGpuFlops(7.0*n*n+2.0*n*n*n);
1353:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1354:           + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
1355:       cerr = cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1356:       cberr = cublasXscal(cublasv2handle,n2,&c[12],d_Q,one);CHKERRCUBLAS(cberr);
1357:       cberr = cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one);CHKERRCUBLAS(cberr);
1358:       cberr = cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one);CHKERRCUBLAS(cberr);
1359:       cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1360:       PetscLogGpuFlops(5.0*n*n+2.0*n*n*n);
1361:       cerr = cudaMemset(d_Q,zero,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1362:       set_diagonal(n,d_Q,n,c[0]);CHKERRQ(cerr);
1363:       cberr = cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one);CHKERRCUBLAS(cberr);
1364:       cberr = cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one);CHKERRCUBLAS(cberr);
1365:       cberr = cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one);CHKERRCUBLAS(cberr);
1366:       cberr = cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one);CHKERRCUBLAS(cberr);
1367:       PetscLogGpuFlops(7.0*n*n);
1368:       break;
1369:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1370:   }
1371:   cberr = cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one);CHKERRCUBLAS(cberr);

1373:   mmagma_xgesv_gpu(n_,n_,d_Q,n_,ipiv,d_P,n_,&info);CHKERRMAGMA(mierr);
1374:   if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGESV %d",info);

1376:   cberr = cublasXscal(cublasv2handle,n2,&stwo,d_P,one);CHKERRCUBLAS(cberr);
1377:   shift_diagonal(n,d_P,n,sone);
1378:   PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n);

1380:   /* Squaring */
1381:   for (j=1;j<=s;j++) {
1382:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_);CHKERRCUBLAS(cberr);
1383:     SWAP(d_P,d_W,aux);
1384:   }
1385:   if (d_P!=d_Ba) {
1386:     cerr = cudaMemcpy(Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1387:   } else {
1388:     cerr = cudaMemcpy(Ba,d_Ba,n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1389:   }
1390:   PetscLogGpuFlops(2.0*n*n*n*s);
1391:   PetscLogGpuTimeEnd();

1393:   PetscFree2(work,ipiv);
1394:   MatDenseRestoreArray(A,&Aa);
1395:   MatDenseRestoreArray(B,&Ba);
1396:   cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1397:   cerr = cudaFree(d_work);CHKERRCUDA(cerr);
1398:   magma_finalize();
1399:   return(0);
1400: }

1402: /*
1403:  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
1404:  * and Yuji Nakatsukasa
1405:  *
1406:  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
1407:  *     Approximation for the Matrix Exponential",
1408:  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
1409:  *     https://doi.org/10.1137/15M1027553
1410:  */
1411: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
1412: {
1413:   PetscInt       i,j,n_,s,k,m,zero=0,mod;
1414:   PetscBLASInt   n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,info,*piv,minlen,lwork=0,one=1;
1415:   PetscReal      nrm,shift=0.0,rone=1.0,rzero=0.0;
1416: #if defined(PETSC_USE_COMPLEX)
1417:   PetscReal      *rwork=NULL;
1418: #endif
1419:   PetscComplex   *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
1420:   PetscScalar    *Aa,*Ba,*d_Ba,*d_Ba2,*Maux,*sMaux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
1422:   PetscBool      isreal,*d_isreal,flg;
1423:   cublasHandle_t cublasv2handle;
1424:   cudaError_t    cerr;
1425:   cublasStatus_t cberr;
1426:   magma_int_t    mierr;

1429:   PetscCUDAInitializeCheck(); /* For CUDA event timers */
1430:   PetscCUBLASGetHandle(&cublasv2handle);
1431:   magma_init();
1432:   MatGetSize(A,&n_,NULL);
1433:   PetscBLASIntCast(n_,&n);
1434:   MatDenseGetArray(A,&Aa);
1435:   MatDenseGetArray(B,&Ba);
1436:   PetscBLASIntCast(n*n,&n2);

1438:   cerr = cudaMalloc((void **)&d_Ba,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1439:   PetscLogGpuTimeBegin();
1440:   d_Ba2 = d_Ba;

1442:   PetscMalloc2(n2,&sMaux,n2,&Maux);
1443:   cerr = cudaMalloc((void **)&d_isreal,sizeof(PetscBool));CHKERRCUDA(cerr);
1444:   cerr = cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2);CHKERRCUDA(cerr);
1445:   cerr = cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);

1447:   cerr = cudaMemcpy(d_sMaux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1448:   d_Maux2 = d_Maux;
1449:   PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
1450:   if (!flg) {
1451:     PetscMalloc2(n,&wr,n,&wi);
1452:     PetscArraycpy(sMaux,Aa,n2);
1453:     /* estimate rightmost eigenvalue and shift A with it */
1454: #if !defined(PETSC_USE_COMPLEX)
1455:     mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,sMaux,n,wr,wi,NULL,n,NULL,n,&work1,query,&info);CHKERRMAGMA(mierr);
1456:     SlepcCheckLapackInfo("geev",info);
1457:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1458:     PetscMalloc1(lwork,&work);
1459:     mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,sMaux,n,wr,wi,NULL,n,NULL,n,work,lwork,&info);CHKERRMAGMA(mierr);
1460:     PetscFree(work);
1461: #else
1462:     PetscArraycpy(Maux,Aa,n2);
1463:     mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork,&info);CHKERRMAGMA(mierr);
1464:     SlepcCheckLapackInfo("geev",info);
1465:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1466:     PetscMalloc2(2*n,&rwork,lwork,&work);
1467:     mmagma_xgeev(MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork,&info);CHKERRMAGMA(mierr);
1468:     PetscFree2(rwork,work);
1469: #endif
1470:     SlepcCheckLapackInfo("geev",info);
1471:     PetscLogGpuFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);

1473:     shift = PetscRealPart(wr[0]);
1474:     for (i=1;i<n;i++) {
1475:       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
1476:     }
1477:     PetscFree2(wr,wi);
1478:   }
1479:   /* shift so that largest real part is (about) 0 */
1480:   cerr = cudaMemcpy(d_sMaux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1481:   if (shift) {
1482:     shift_diagonal(n,d_sMaux,n,-shift);
1483:     PetscLogGpuFlops(1.0*n);
1484:   }
1485: #if defined(PETSC_USE_COMPLEX)
1486:   cerr = cudaMemcpy(d_Maux,Aa,sizeof(PetscScalar)*n2,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1487:   if (shift) {
1488:     shift_diagonal(n,d_Maux,n,-shift);
1489:     PetscLogGpuFlops(1.0*n);
1490:   }
1491: #endif

1493:   /* estimate norm(A) and select the scaling factor */
1494:   cberr = cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm);CHKERRCUBLAS(cberr);
1495:   PetscLogGpuFlops(2.0*n*n);
1496:   sexpm_params(nrm,&s,&k,&m);
1497:   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
1498:     if (shift) expshift = PetscExpReal(shift);
1499:     shift_Cdiagonal(n,d_Maux,n,rone,rzero);
1500:     if (shift) {
1501:       cberr = cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one);CHKERRCUBLAS(cberr);
1502:       PetscLogGpuFlops(1.0*(n+n2));
1503:     } else {
1504:       PetscLogGpuFlops(1.0*n);
1505:     }
1506:     cerr = cudaMemcpy(Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1507:     cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1508:     cerr = cudaFree(d_isreal);CHKERRCUDA(cerr);
1509:     cerr = cudaFree(d_sMaux);CHKERRCUDA(cerr);
1510:     cerr = cudaFree(d_Maux);CHKERRCUDA(cerr);
1511:     MatDenseRestoreArray(A,&Aa);
1512:     MatDenseRestoreArray(B,&Ba);
1513:     return(0); /* quick return */
1514:   }

1516:   cerr = cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1517:   cerr = cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1518:   cerr = cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1519:   d_expmA2 = d_expmA; d_RR2 = d_RR;
1520:   PetscMalloc1(n,&piv);
1521:   /* scale matrix */
1522: #if !defined(PETSC_USE_COMPLEX)
1523:   copy_array2D_S2C(n,n,d_As,n,d_sMaux,n);
1524: #else
1525:   cerr = cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1526: #endif
1527:   scale = 1.0/PetscPowRealInt(2.0,s);
1528:   cberr = cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one);CHKERRCUBLAS(cberr);
1529:   SlepcLogGpuFlopsComplex(1.0*n2);

1531:   /* evaluate Pade approximant (partial fraction or product form) */
1532:   if (fn->method==8 || !m) { /* partial fraction */
1533:     getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
1534:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1535:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1536:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
1537:     PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
1538:     getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);

1540:     cerr = cudaMemset(d_expmA,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1541: #if !defined(PETSC_USE_COMPLEX)
1542:     isreal = PETSC_TRUE;
1543: #else
1544:     getisreal_array2D(n,n,d_Maux,n,d_isreal);
1545:     cerr = cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1546: #endif
1547:     if (isreal) {
1548:       rsizediv2 = irsize/2;
1549:       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
1550:         cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1551:         cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1552:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i]));
1553:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i]));
1554:         mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1555:         SlepcCheckLapackInfo("gesv",info);
1556:         add_array2D_Conj(n,n,d_RR,n);
1557:         cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1558:         /* shift(n) + gesv + axpy(n2) */
1559:         SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
1560:       }

1562:       mod = ipsize % 2;
1563:       if (mod) {
1564:         cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1565:         cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1566:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1]));
1567:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1]));
1568:         mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1569:         SlepcCheckLapackInfo("gesv",info);
1570:         cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1571:         SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1572:       }
1573:     } else { /* complex */
1574:       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
1575:         cerr = cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1576:         cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1577:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i]));
1578:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i]));
1579:         mmagma_Cgesv_gpu(n,n,d_Maux,n,piv,d_RR,n,&info);CHKERRMAGMA(mierr);
1580:         SlepcCheckLapackInfo("gesv",info);
1581:         cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1582:         SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1583:       }
1584:     }
1585:     for (i=0;i<iremainsize;i++) {
1586:       if (!i) {
1587:         cerr = cudaMemset(d_RR,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1588:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1]));
1589:       } else {
1590:         cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1591:         for (j=1;j<i;j++) {
1592:           cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1593:           SWAP(d_RR,d_Maux,aux);
1594:           SlepcLogGpuFlopsComplex(2.0*n*n*n);
1595:         }
1596:         cberr = cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one);CHKERRCUBLAS(cberr);
1597:         SlepcLogGpuFlopsComplex(1.0*n2);
1598:       }
1599:       cberr = cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);CHKERRCUBLAS(cberr);
1600:       SlepcLogGpuFlopsComplex(1.0*n2);
1601:     }
1602:     PetscFree3(r,p,remainterm);
1603:   } else { /* product form, default */
1604:     getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
1605:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1606:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1607:     PetscMalloc2(irsize,&rootp,ipsize,&rootq);
1608:     getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);

1610:     cerr = cudaMemset(d_expmA,zero,sizeof(PetscComplex)*n2);CHKERRCUDA(cerr);
1611:     set_Cdiagonal(n,d_expmA,n,rone,rzero); /* initialize */
1612:     minlen = PetscMin(irsize,ipsize);
1613:     for (i=0;i<minlen;i++) {
1614:       cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1615:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1616:       cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1617:       SWAP(d_expmA,d_Maux,aux);
1618:       cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1619:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1620:       mmagma_Cgesv_gpu(n,n,d_RR,n,piv,d_expmA,n,&info);CHKERRMAGMA(mierr);
1621:       SlepcCheckLapackInfo("gesv",info);
1622:       /* shift(n) + gemm + shift(n) + gesv */
1623:       SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1624:     }
1625:     /* extra enumerator */
1626:     for (i=minlen;i<irsize;i++) {
1627:       cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1628:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1629:       cberr = cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);CHKERRCUBLAS(cberr);
1630:       SWAP(d_expmA,d_Maux,aux);
1631:       SlepcLogGpuFlopsComplex(1.0*n+2.0*n*n*n);
1632:     }
1633:     /* extra denominator */
1634:     for (i=minlen;i<ipsize;i++) {
1635:       cerr = cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1636:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1637:       mmagma_Cgesv_gpu(n,n,d_RR,n,piv,d_expmA,n,&info);CHKERRMAGMA(mierr);
1638:       SlepcCheckLapackInfo("gesv",info);
1639:       SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1640:     }
1641:     cberr = cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one);CHKERRCUBLAS(cberr);
1642:     SlepcLogGpuFlopsComplex(1.0*n2);
1643:     PetscFree2(rootp,rootq);
1644:   }

1646: #if !defined(PETSC_USE_COMPLEX)
1647:   copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n);
1648: #else
1649:   cerr = cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1650: #endif

1652:   /* perform repeated squaring */
1653:   for (i=0;i<s;i++) { /* final squaring */
1654:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n);CHKERRCUBLAS(cberr);
1655:     SWAP(d_Ba2,d_sMaux,saux);
1656:     PetscLogGpuFlops(2.0*n*n*n);
1657:   }
1658:   if (d_Ba2!=d_Ba) {
1659:     cerr = cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1660:     d_sMaux = d_Ba2;
1661:   }
1662:   if (shift) {
1663:     expshift = PetscExpReal(shift);
1664:     cberr = cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one);CHKERRCUBLAS(cberr);
1665:     PetscLogGpuFlops(1.0*n2);
1666:   }

1668:   cerr = cudaMemcpy(Ba,d_Ba,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1669:   PetscLogGpuTimeEnd();

1671:   /* restore pointers */
1672:   d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
1673:   cerr = cudaFree(d_Ba);CHKERRCUDA(cerr);
1674:   cerr = cudaFree(d_isreal);CHKERRCUDA(cerr);
1675:   cerr = cudaFree(d_sMaux);CHKERRCUDA(cerr);
1676:   cerr = cudaFree(d_Maux);CHKERRCUDA(cerr);
1677:   cerr = cudaFree(d_expmA);CHKERRCUDA(cerr);
1678:   cerr = cudaFree(d_As);CHKERRCUDA(cerr);
1679:   cerr = cudaFree(d_RR);CHKERRCUDA(cerr);
1680:   PetscFree(piv);
1681:   PetscFree2(sMaux,Maux);
1682:   MatDenseRestoreArray(A,&Aa);
1683:   MatDenseRestoreArray(B,&Ba);
1684:   magma_finalize();
1685:   return(0);
1686: }
1687: #endif /* PETSC_HAVE_MAGMA */
1688: #endif /* PETSC_HAVE_CUDA */

1690: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1691: {
1693:   PetscBool      isascii;
1694:   char           str[50];
1695:   const char     *methodname[] = {
1696:                   "scaling & squaring, [m/m] Pade approximant (Higham)",
1697:                   "scaling & squaring, [6/6] Pade approximant",
1698:                   "scaling & squaring, subdiagonal Pade approximant (product form)",
1699:                   "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1700: #if defined(PETSC_HAVE_CUDA)
1701:                  ,"scaling & squaring, [6/6] Pade approximant CUDA"
1702: #if defined(PETSC_HAVE_MAGMA)
1703:                  ,"scaling & squaring, [m/m] Pade approximant (Higham) CUDA/MAGMA",
1704:                   "scaling & squaring, [6/6] Pade approximant CUDA/MAGMA",
1705:                   "scaling & squaring, subdiagonal Pade approximant (product form) CUDA/MAGMA",
1706:                   "scaling & squaring, subdiagonal Pade approximant (partial fraction) CUDA/MAGMA",
1707: #endif
1708: #endif
1709:   };
1710:   const int      nmeth=sizeof(methodname)/sizeof(methodname[0]);

1713:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1714:   if (isascii) {
1715:     if (fn->beta==(PetscScalar)1.0) {
1716:       if (fn->alpha==(PetscScalar)1.0) {
1717:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(x)\n");
1718:       } else {
1719:         SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1720:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(%s*x)\n",str);
1721:       }
1722:     } else {
1723:       SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE);
1724:       if (fn->alpha==(PetscScalar)1.0) {
1725:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s*exp(x)\n",str);
1726:       } else {
1727:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s",str);
1728:         PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1729:         SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1730:         PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1731:         PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1732:       }
1733:     }
1734:     if (fn->method<nmeth) {
1735:       PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]);
1736:     }
1737:   }
1738:   return(0);
1739: }

1741: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1742: {
1744:   fn->ops->evaluatefunction       = FNEvaluateFunction_Exp;
1745:   fn->ops->evaluatederivative     = FNEvaluateDerivative_Exp;
1746:   fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1747:   fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1748:   fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1749:   fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1750: #if defined(PETSC_HAVE_CUDA)
1751:   fn->ops->evaluatefunctionmat[4] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
1752: #if defined(PETSC_HAVE_MAGMA)
1753:   fn->ops->evaluatefunctionmat[5] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
1754:   fn->ops->evaluatefunctionmat[6] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
1755:   fn->ops->evaluatefunctionmat[7] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
1756:   fn->ops->evaluatefunctionmat[8] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
1757: #endif
1758: #endif
1759:   fn->ops->view                   = FNView_Exp;
1760:   return(0);
1761: }