Fast Auxiliary Space Preconditioning  1.8.4 Feb/15/2016
blas_str.c
Go to the documentation of this file.
1 
6 #include <math.h>
7 
8 #ifdef _OPENMP
9 #include <omp.h>
10 #endif
11 
12 #include "fasp.h"
13 #include "fasp_functs.h"
14 
15 static inline void smat_amxv_nc3(REAL alpha, REAL *a, REAL *b, REAL *c);
16 static inline void smat_amxv_nc5(REAL alpha, REAL *a, REAL *b, REAL *c);
17 static inline void smat_amxv(REAL alpha, REAL *a, REAL *b,INT n, REAL *c);
18 static inline void blkcontr_str(INT start_data, INT start_vecx, INT start_vecy, INT nc,
19  REAL *data, REAL *x, REAL *y);
20 static inline void spaaxpy_str_2D_scalar(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
21 static inline void spaaxpy_str_2D_nc3(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
22 static inline void spaaxpy_str_2D_nc5(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
23 static inline void spaaxpy_str_2D_block(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
24 static inline void spaaxpy_str_3D_scalar(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
25 static inline void spaaxpy_str_3D_nc3(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
26 static inline void spaaxpy_str_3D_nc5(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
27 static inline void spaaxpy_str_3D_block(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
28 static inline void spaaxpy_str_general(REAL alpha, dSTRmat *A, REAL *x, REAL *y);
29 
30 /*---------------------------------*/
31 /*-- Public Functions --*/
32 /*---------------------------------*/
33 
47 void fasp_blas_dstr_aAxpy (const REAL alpha,
48  dSTRmat *A,
49  REAL *x,
50  REAL *y)
51 {
52 
53  switch (A->nband) {
54 
55  case 4:
56 
57  switch (A->nc) {
58  case 1:
59  spaaxpy_str_2D_scalar(alpha, A, x, y);
60  break;
61 
62  case 3:
63  spaaxpy_str_2D_nc3(alpha, A, x, y);
64  break;
65 
66  case 5:
67  spaaxpy_str_2D_nc5(alpha, A, x, y);
68  break;
69 
70  default:
71  spaaxpy_str_2D_block(alpha, A, x, y);
72  break;
73  }
74 
75  break;
76 
77  case 6:
78 
79  switch (A->nc) {
80  case 1:
81  spaaxpy_str_3D_scalar(alpha, A, x, y);
82  break;
83 
84  case 3:
85  spaaxpy_str_3D_nc3(alpha, A, x, y);
86  break;
87 
88  case 5:
89  spaaxpy_str_3D_nc5(alpha, A, x, y);
90  break;
91 
92  default:
93  spaaxpy_str_3D_block(alpha, A, x, y);
94  break;
95  }
96  break;
97 
98  default:
99  spaaxpy_str_general(alpha, A, x, y);
100  break;
101  }
102 
103 }
104 
118  REAL *x,
119  REAL *y)
120 {
121  int n = (A->ngrid)*(A->nc)*(A->nc);
122 
123  memset(y, 0, n*sizeof(REAL));
124 
125  fasp_blas_dstr_aAxpy(1.0, A, x, y);
126 }
127 
143  dSTRmat *B)
144 {
145  INT ngrid=A->ngrid, nc=A->nc,nband=A->nband;
146  INT nc2=nc*nc, size=ngrid*nc2;
147  INT i,j,ic2,nb,nb1;
148 
149 #ifdef _OPENMP
150  //variables for OpenMP
151  INT myid, mybegin, myend;
152  INT nthreads = FASP_GET_NUM_THREADS();
153 #endif
154 
155  REAL *diag=(REAL *)fasp_mem_calloc(size,sizeof(REAL));
156 
157  fasp_array_cp(size,A->diag,diag);
158 
159  fasp_dstr_alloc(A->nx, A->ny, A->nz,A->nxy,ngrid, nband,nc,A->offsets, B);
160 
161  //compute diagnal elements of B
162 #ifdef _OPENMP
163  if (ngrid > OPENMP_HOLDS) {
164 #pragma omp parallel for private(myid, mybegin, myend, i, ic2, j)
165  for (myid=0; myid<nthreads; myid++) {
166  FASP_GET_START_END(myid, nthreads, ngrid, &mybegin, &myend);
167  for (i=mybegin; i<myend; i++) {
168  ic2=i*nc2;
169  for (j=0; j<nc2; j++) {
170  if (j/nc == j%nc) B->diag[ic2+j]=1;
171  else B->diag[ic2+j]=0;
172  }
173  }
174  }
175  }
176  else {
177 #endif
178  for (i=0;i<ngrid;++i) {
179  ic2=i*nc2;
180  for (j=0;j<nc2;++j) {
181  if (j/nc == j%nc) B->diag[ic2+j]=1;
182  else B->diag[ic2+j]=0;
183  }
184  }
185 #ifdef _OPENMP
186  }
187 #endif
188 
189  for (i=0;i<ngrid;++i) fasp_blas_smat_inv(&(diag[i*nc2]),nc);
190 
191  for (i=0;i<nband;++i) {
192  nb=A->offsets[i];
193  nb1=abs(nb);
194  if (nb<0) {
195  for (j=0;j<ngrid-nb1;++j)
196  fasp_blas_smat_mul(&(diag[(j+nb1)*nc2]),&(A->offdiag[i][j*nc2]),&(B->offdiag[i][j*nc2]),nc);
197  }
198  else {
199  for (j=0;j<ngrid-nb1;++j)
200  fasp_blas_smat_mul(&(diag[j*nc2]),&(A->offdiag[i][j*nc2]),&(B->offdiag[i][j*nc2]),nc);
201  }
202 
203  }
204 
205  fasp_mem_free(diag);
206 
207  return (0);
208 }
209 
210 /*---------------------------------*/
211 /*-- Private Functions --*/
212 /*---------------------------------*/
213 
227 static inline void smat_amxv_nc3 (REAL alpha,
228  REAL *a,
229  REAL *b,
230  REAL *c)
231 {
232  c[0] += alpha*(a[0]*b[0] + a[1]*b[1] + a[2]*b[2]);
233  c[1] += alpha*(a[3]*b[0] + a[4]*b[1] + a[5]*b[2]);
234  c[2] += alpha*(a[6]*b[0] + a[7]*b[1] + a[8]*b[2]);
235 }
236 
250 static inline void smat_amxv_nc5 (REAL alpha,
251  REAL *a,
252  REAL *b,
253  REAL *c)
254 {
255  c[0] += alpha*(a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3] * b[3] + a[4] * b[4]);
256  c[1] += alpha*(a[5]*b[0] + a[6]*b[1] + a[7]*b[2] + a[8] * b[3] + a[9] * b[4]);
257  c[2] += alpha*(a[10]*b[0] + a[11]*b[1] + a[12]*b[2] + a[13] * b[3] + a[14] * b[4]);
258  c[3] += alpha*(a[15]*b[0] + a[16]*b[1] + a[17]*b[2] + a[18] * b[3] + a[19] * b[4]);
259  c[4] += alpha*(a[20]*b[0] + a[21]*b[1] + a[22]*b[2] + a[23] * b[3] + a[24] * b[4]);
260 }
261 
279 static inline void smat_amxv (REAL alpha,
280  REAL *a,
281  REAL *b,
282  INT n,
283  REAL *c)
284 {
285  INT i,j;
286  INT in;
287 
288 #ifdef _OPENMP
289  // variables for OpenMP
290  INT myid, mybegin, myend;
291  INT nthreads = FASP_GET_NUM_THREADS();
292 #endif
293 
294 #ifdef _OPENMP
295  if (n > OPENMP_HOLDS) {
296 #pragma omp parallel for private(myid, mybegin, myend, i, in, j)
297  for (myid=0; myid<nthreads; myid++) {
298  FASP_GET_START_END(myid, nthreads, n, &mybegin, &myend);
299  for (i=mybegin; i<myend; i++) {
300  in = i*n;
301  for (j=0; j<n; j++)
302  c[i] += alpha*a[in+j]*b[j];
303  }
304  }
305  }
306  else {
307 #endif
308  for (i=0;i<n;++i) {
309  in = i*n;
310  for (j=0;j<n;++j)
311  c[i] += alpha*a[in+j]*b[j];
312  }
313 #ifdef _OPENMP
314  }
315 #endif
316  return;
317 }
318 
341 static inline void blkcontr_str (INT start_data,
342  INT start_vecx,
343  INT start_vecy,
344  INT nc,
345  REAL *data,
346  REAL *x,
347  REAL *y)
348 {
349  INT i,j,k,m;
350 
351 #ifdef _OPENMP
352  //variables for OpenMP
353  INT myid, mybegin, myend;
354  INT nthreads = FASP_GET_NUM_THREADS();
355 #endif
356 
357 #ifdef _OPENMP
358  if (nc > OPENMP_HOLDS) {
359 #pragma omp parallel for private(myid, mybegin, myend, i, k, m, j)
360  for (myid = 0; myid < nthreads; myid++) {
361  FASP_GET_START_END(myid, nthreads, nc, &mybegin, &myend);
362  for (i = mybegin; i < myend; i ++) {
363  k = start_data + i*nc;
364  m = start_vecy + i;
365  for (j = 0; j < nc; j ++) {
366  y[m] += data[k+j]*x[start_vecx+j];
367  }
368  }
369  }
370  }
371  else {
372 #endif
373  for (i = 0; i < nc; i ++) {
374  k = start_data + i*nc;
375  m = start_vecy + i;
376  for (j = 0; j < nc; j ++) {
377  y[m] += data[k+j]*x[start_vecx+j];
378  }
379  }
380 #ifdef _OPENMP
381  }
382 #endif
383 }
384 
403 static inline void spaaxpy_str_2D_scalar (REAL alpha,
404  dSTRmat *A,
405  REAL *x,
406  REAL *y)
407 {
408  INT i;
409  INT idx1, idx2;
410  INT end1, end2;
411  INT nline;
412 
413 #ifdef _OPENMP
414  //variables for OpenMP
415  INT myid, mybegin, myend, idx;
416  INT nthreads = FASP_GET_NUM_THREADS();
417 #endif
418 
419  // information of A
420  INT nx = A->nx;
421  INT ngrid = A->ngrid; // number of grids
422  INT nband = A->nband;
423 
424  REAL *diag = A->diag;
425  REAL *offdiag0=NULL, *offdiag1=NULL, *offdiag2=NULL, *offdiag3=NULL;
426 
427  if (nx == 1) {
428  nline = A->ny;
429  }
430  else {
431  nline = nx;
432  }
433 
434  for (i=0; i<nband; ++i) {
435  if (A->offsets[i] == -1) {
436  offdiag0 = A->offdiag[i];
437  }
438  else if (A->offsets[i] == 1) {
439  offdiag1 = A->offdiag[i];
440  }
441  else if (A->offsets[i] == -nline) {
442  offdiag2 = A->offdiag[i];
443  }
444  else if (A->offsets[i] == nline) {
445  offdiag3 = A->offdiag[i];
446  }
447  else {
448  printf("### WARNING: offsets for 2D scalar case is illegal! %s\n", __FUNCTION__);
449  spaaxpy_str_general(alpha, A, x, y);
450  return;
451  }
452  }
453 
454  end1 = ngrid-1;
455  end2 = ngrid-nline;
456 
457  y[0] += alpha*(diag[0]*x[0] + offdiag1[0]*x[1] + offdiag3[0]*x[nline]);
458 
459 #ifdef _OPENMP
460  if (nline-1 > OPENMP_HOLDS) {
461 #pragma omp parallel for private(myid, mybegin, myend, i, idx1, idx)
462  for (myid=0; myid<nthreads; myid++) {
463  FASP_GET_START_END(myid, nthreads, nline-1, &mybegin, &myend);
464  for (i=mybegin; i<myend; i++) {
465  idx1 = i;
466  idx = i+1;
467  y[idx] += alpha*(offdiag0[idx1]*x[idx1] + diag[idx]*x[idx] + offdiag1[idx]*x[idx+1] +
468  offdiag3[idx]*x[idx+nline]);
469  }
470  }
471  }
472  else {
473 #endif
474  for (i=1; i<nline; ++i) {
475  idx1 = i-1;
476  y[i] += alpha*(offdiag0[idx1]*x[idx1] + diag[i]*x[i] + offdiag1[i]*x[i+1] +
477  offdiag3[i]*x[i+nline]);
478  }
479 #ifdef _OPENMP
480  }
481 #endif
482 
483 #ifdef _OPENMP
484  if (end2-nline > OPENMP_HOLDS) {
485 #pragma omp parallel for private(myid, i, mybegin, myend, idx1, idx2, idx)
486  for (myid=0; myid<nthreads; myid++) {
487  FASP_GET_START_END(myid, nthreads, end2-nline, &mybegin, &myend);
488  for (i=mybegin; i<myend; ++i) {
489  idx = i+nline;
490  idx1 = idx-1; //idx1 = i-1+nline;
491  idx2 = i;
492  y[idx] += alpha*(offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
493  diag[idx]*x[idx] + offdiag1[idx]*x[idx+1] + offdiag3[idx]*x[idx+nline]);
494  }
495  }
496  }
497  else {
498 #endif
499  for (i=nline; i<end2; ++i) {
500  idx1 = i-1;
501  idx2 = i-nline;
502  y[i] += alpha*(offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
503  diag[i]*x[i] + offdiag1[i]*x[i+1] + offdiag3[i]*x[i+nline]);
504  }
505 #ifdef _OPENMP
506  }
507 #endif
508 
509 #ifdef _OPENMP
510  if (end1-end2 > OPENMP_HOLDS) {
511 #pragma omp parallel for private(myid, i, mybegin, myend, idx1, idx2, idx)
512  for (myid=0; myid<nthreads; myid++) {
513  FASP_GET_START_END(myid, nthreads, end1-end2, &mybegin, &myend);
514  for (i=mybegin; i<myend; ++i) {
515  idx = i+end2;
516  idx1 = idx-1; //idx1 = i-1+end2;
517  idx2 = idx-nline; //idx2 = i-nline+end2;
518  y[idx] += alpha*(offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
519  diag[idx]*x[idx] + offdiag1[idx]*x[idx+1]);
520  }
521  }
522  }
523  else {
524 #endif
525  for (i=end2; i<end1; ++i) {
526  idx1 = i-1;
527  idx2 = i-nline;
528  y[i] += alpha*(offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
529  diag[i]*x[i] + offdiag1[i]*x[i+1]);
530  }
531 #ifdef _OPENMP
532  }
533 #endif
534 
535  idx1 = end1-1;
536  idx2 = end1-nline;
537  y[end1] += alpha*(offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] + diag[end1]*x[end1]);
538 
539  return;
540 
541 }
542 #if 0
543 
559 static inline void spaaxpy_str_2D_nc3 (REAL alpha,
560  dSTRmat *A,
561  REAL *x,
562  REAL *y)
563 {
564  INT i;
565  INT idx,idx1,idx2;
566  INT matidx, matidx1, matidx2;
567  INT end1, end2;
568  INT nline, nlinenc;
569 
570  for (i=nline; i<end2; ++i) {
571  idx1 = i-1;
572  idx2 = i-nline;
573  y[i] += alpha*(offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
574  diag[i]*x[i] + offdiag1[i]*x[i+1] + offdiag3[i]*x[i+nline]);
575  }
576 
577  for (i=end2; i<end1; ++i) {
578  idx1 = i-1;
579  idx2 = i-nline;
580  y[i] += alpha*(offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
581  diag[i]*x[i] + offdiag1[i]*x[i+1]);
582  }
583 
584  idx1 = end1-1;
585  idx2 = end1-nline;
586  y[end1] += alpha*(offdiag2[idx2]*x[idx2] +
587  offdiag0[idx1]*x[idx1] + diag[end1]*x[end1]);
588 
589  return;
590 
591 }
592 #endif
593 
612 static inline void spaaxpy_str_2D_nc3 (REAL alpha,
613  dSTRmat *A,
614  REAL *x,
615  REAL *y)
616 {
617  INT i;
618  INT idx,idx1,idx2;
619  INT matidx, matidx1, matidx2;
620  INT end1, end2;
621  INT nline, nlinenc;
622 
623  // information of A
624  INT nx = A->nx;
625  INT ngrid = A->ngrid; // number of grids
626  INT nc = A->nc;
627  INT nband = A->nband;
628 
629 #ifdef _OPENMP
630  // variables for OpenMP
631  INT myid, mybegin, myend, up;
632  INT nthreads = FASP_GET_NUM_THREADS();
633 #endif
634 
635  REAL *diag = A->diag;
636  REAL *offdiag0=NULL, *offdiag1=NULL, *offdiag2=NULL, *offdiag3=NULL;
637 
638  if (nx == 1)
639  {
640  nline = A->ny;
641  }
642  else
643  {
644  nline = nx;
645  }
646  nlinenc = nline*nc;
647 
648  for (i=0; i<nband; ++i)
649  {
650  if (A->offsets[i] == -1)
651  {
652  offdiag0 = A->offdiag[i];
653  }
654  else if (A->offsets[i] == 1)
655  {
656  offdiag1 = A->offdiag[i];
657  }
658  else if (A->offsets[i] == -nline)
659  {
660  offdiag2 = A->offdiag[i];
661  }
662  else if (A->offsets[i] == nline)
663  {
664  offdiag3 = A->offdiag[i];
665  }
666  else
667  {
668  printf("### WARNING: offsets for 2D scalar case is illegal! %s\n", __FUNCTION__);
669  spaaxpy_str_general(alpha, A, x, y);
670  return;
671  }
672  }
673 
674  end1 = ngrid-1;
675  end2 = ngrid-nline;
676 
677  smat_amxv_nc3(alpha, diag, x, y);
678  smat_amxv_nc3(alpha, offdiag1, x+nc, y);
679  smat_amxv_nc3(alpha, offdiag3, x+nlinenc, y);
680 
681 #ifdef _OPENMP
682  up = nline - 1;
683  if (up > OPENMP_HOLDS) {
684 #pragma omp parallel for private(myid, mybegin, myend, i, idx, matidx, idx1, matidx1)
685  for (myid=0; myid<nthreads; myid++) {
686  FASP_GET_START_END(myid, nthreads, up, &mybegin, &myend);
687  for (i=mybegin; i<myend; i++) {
688  idx = (i+1)*nc;
689  matidx = idx*nc;
690  idx1 = i*nc;
691  matidx1 = idx1*nc;
692  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
693  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
694  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
695  smat_amxv_nc3(alpha, offdiag3+matidx, x+idx+nlinenc, y+idx);
696  }
697  }
698  }
699  else {
700 #endif
701  for (i=1; i<nline; ++i) {
702  idx = i*nc;
703  matidx = idx*nc;
704  idx1 = idx - nc;
705  matidx1 = idx1*nc;
706  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
707  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
708  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
709  smat_amxv_nc3(alpha, offdiag3+matidx, x+idx+nlinenc, y+idx);
710  }
711 #ifdef _OPENMP
712  }
713 #endif
714 
715 #ifdef _OPENMP
716  up = end2 - nx;
717  if (up > OPENMP_HOLDS) {
718 #pragma omp parallel for private(myid, mybegin, myend, idx, idx1, idx2, matidx, matidx1, matidx2)
719  for (myid=0; myid<nthreads; myid++) {
720  FASP_GET_START_END(myid, nthreads, up, &mybegin, &myend);
721  for (i=mybegin; i<myend; i++) {
722  idx = (i+nx)*nc;
723  idx1 = idx-nc;
724  idx2 = idx-nlinenc;
725  matidx = idx*nc;
726  matidx1 = idx1*nc;
727  matidx2 = idx2*nc;
728  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
729  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
730  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
731  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
732  smat_amxv_nc3(alpha, offdiag3+matidx, x+idx+nlinenc, y+idx);
733  }
734  }
735  }
736  else {
737 #endif
738  for (i=nx; i<end2; ++i) {
739  idx = i*nc;
740  idx1 = idx-nc;
741  idx2 = idx-nlinenc;
742  matidx = idx*nc;
743  matidx1 = idx1*nc;
744  matidx2 = idx2*nc;
745  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
746  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
747  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
748  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
749  smat_amxv_nc3(alpha, offdiag3+matidx, x+idx+nlinenc, y+idx);
750  }
751 #ifdef _OPENMP
752  }
753 #endif
754 
755 #ifdef _OPENMP
756  up = end1 - end2;
757  if (up > OPENMP_HOLDS) {
758 #pragma omp parallel for private(myid, mybegin, myend, idx, idx1, idx2, matidx, matidx1, matidx2)
759  for (myid=0; myid<nthreads; myid++) {
760  FASP_GET_START_END(myid, nthreads, up, &mybegin, &myend);
761  for (i=mybegin; i<myend; i++) {
762  idx = (i+end2)*nc;
763  idx1 = idx-nc;
764  idx2 = idx-nlinenc;
765  matidx = idx*nc;
766  matidx1 = idx1*nc;
767  matidx2 = idx2*nc;
768  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
769  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
770  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
771  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
772  }
773  }
774  }
775  else {
776 #endif
777  for (i=end2; i<end1; ++i) {
778  idx = i*nc;
779  idx1 = idx-nc;
780  idx2 = idx-nlinenc;
781  matidx = idx*nc;
782  matidx1 = idx1*nc;
783  matidx2 = idx2*nc;
784  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
785  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
786  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
787  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
788  }
789 #ifdef _OPENMP
790  }
791 #endif
792  i=end1;
793  idx = i*nc;
794  idx1 = idx-nc;
795  idx2 = idx-nlinenc;
796  matidx = idx*nc;
797  matidx1 = idx1*nc;
798  matidx2 = idx2*nc;
799  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
800  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
801  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
802 
803  return;
804 
805 }
806 
826 static inline void spaaxpy_str_2D_nc5(REAL alpha, dSTRmat *A, REAL *x, REAL *y)
827 {
828  INT i;
829  INT idx,idx1,idx2;
830  INT matidx, matidx1, matidx2;
831  INT end1, end2;
832  INT nline, nlinenc;
833 
834  // information of A
835  INT nx = A->nx;
836  INT ngrid = A->ngrid; // number of grids
837  INT nc = A->nc;
838  INT nband = A->nband;
839 
840 #ifdef _OPENMP
841  // variables for OpenMP
842  INT myid, mybegin, myend, up;
843  INT nthreads = FASP_GET_NUM_THREADS();
844 #endif
845 
846  REAL *diag = A->diag;
847  REAL *offdiag0=NULL, *offdiag1=NULL, *offdiag2=NULL, *offdiag3=NULL;
848 
849  if (nx == 1)
850  {
851  nline = A->ny;
852  }
853  else
854  {
855  nline = nx;
856  }
857  nlinenc = nline*nc;
858 
859  for (i=0; i<nband; ++i)
860  {
861  if (A->offsets[i] == -1)
862  {
863  offdiag0 = A->offdiag[i];
864  }
865  else if (A->offsets[i] == 1)
866  {
867  offdiag1 = A->offdiag[i];
868  }
869  else if (A->offsets[i] == -nline)
870  {
871  offdiag2 = A->offdiag[i];
872  }
873  else if (A->offsets[i] == nline)
874  {
875  offdiag3 = A->offdiag[i];
876  }
877  else
878  {
879  printf("### WARNING: offsets for 2D scalar case is illegal! %s\n", __FUNCTION__);
880  spaaxpy_str_general(alpha, A, x, y);
881  return;
882  }
883  }
884 
885  end1 = ngrid-1;
886  end2 = ngrid-nline;
887 
888  smat_amxv_nc5(alpha, diag, x, y);
889  smat_amxv_nc5(alpha, offdiag1, x+nc, y);
890  smat_amxv_nc5(alpha, offdiag3, x+nlinenc, y);
891 
892 #ifdef _OPENMP
893  up = nline - 1;
894  if (up > OPENMP_HOLDS) {
895 #pragma omp parallel for private(myid, mybegin, myend, i, idx, matidx, idx1, matidx1)
896  for (myid=0; myid<nthreads; myid++) {
897  FASP_GET_START_END(myid, nthreads, up, &mybegin, &myend);
898  for (i=mybegin; i<myend; i++) {
899  idx = (i+1)*nc;
900  matidx = idx*nc;
901  idx1 = i*nc; // idx1 = idx - nc;
902  matidx1 = idx1*nc;
903  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
904  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
905  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
906  smat_amxv_nc5(alpha, offdiag3+matidx, x+idx+nlinenc, y+idx);
907  }
908  }
909  }
910  else {
911 #endif
912  for (i=1; i<nline; ++i) {
913  idx = i*nc;
914  matidx = idx*nc;
915  idx1 = idx - nc;
916  matidx1 = idx1*nc;
917  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
918  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
919  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
920  smat_amxv_nc5(alpha, offdiag3+matidx, x+idx+nlinenc, y+idx);
921  }
922 #ifdef _OPENMP
923  }
924 #endif
925 
926 #ifdef _OPENMP
927  up = end2 - nx;
928  if (up > OPENMP_HOLDS) {
929 #pragma omp parallel for private(myid, mybegin, myend, idx, idx1, idx2, matidx, matidx1, matidx2)
930  for (myid=0; myid<nthreads; myid++) {
931  FASP_GET_START_END(myid, nthreads, up, &mybegin, &myend);
932  for (i=mybegin; i<myend; i++) {
933  idx = (i+nx)*nc;
934  idx1 = idx-nc;
935  idx2 = idx-nlinenc;
936  matidx = idx*nc;
937  matidx1 = idx1*nc;
938  matidx2 = idx2*nc;
939  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
940  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
941  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
942  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
943  smat_amxv_nc5(alpha, offdiag3+matidx, x+idx+nlinenc, y+idx);
944  }
945  }
946  }
947  else {
948 #endif
949  for (i=nx; i<end2; ++i) {
950  idx = i*nc;
951  idx1 = idx-nc;
952  idx2 = idx-nlinenc;
953  matidx = idx*nc;
954  matidx1 = idx1*nc;
955  matidx2 = idx2*nc;
956  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
957  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
958  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
959  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
960  smat_amxv_nc5(alpha, offdiag3+matidx, x+idx+nlinenc, y+idx);
961  }
962 #ifdef _OPENMP
963  }
964 #endif
965 
966 #ifdef _OPENMP
967  up = end1 - end2;
968  if (up > OPENMP_HOLDS) {
969 #pragma omp parallel for private(myid, mybegin, myend, idx, idx1, idx2, matidx, matidx1, matidx2)
970  for (myid=0; myid<nthreads; myid++) {
971  FASP_GET_START_END(myid, nthreads, up, &mybegin, &myend);
972  for (i=mybegin; i<myend; i++) {
973  idx = (i+end2)*nc;
974  idx1 = idx-nc;
975  idx2 = idx-nlinenc;
976  matidx = idx*nc;
977  matidx1 = idx1*nc;
978  matidx2 = idx2*nc;
979  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
980  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
981  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
982  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
983  }
984  }
985  }
986  else {
987 #endif
988  for (i=end2; i<end1; ++i) {
989  idx = i*nc;
990  idx1 = idx-nc;
991  idx2 = idx-nlinenc;
992  matidx = idx*nc;
993  matidx1 = idx1*nc;
994  matidx2 = idx2*nc;
995  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
996  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
997  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
998  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
999  }
1000 #ifdef _OPENMP
1001  }
1002 #endif
1003 
1004  i=end1;
1005  idx = i*nc;
1006  idx1 = idx-nc;
1007  idx2 = idx-nlinenc;
1008  matidx = idx*nc;
1009  matidx1 = idx1*nc;
1010  matidx2 = idx2*nc;
1011  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
1012  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
1013  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
1014 
1015  return;
1016 
1017 }
1018 
1035 static inline void spaaxpy_str_2D_block (REAL alpha,
1036  dSTRmat *A,
1037  REAL *x,
1038  REAL *y)
1039 {
1040  INT i;
1041  INT idx,idx1,idx2;
1042  INT matidx, matidx1, matidx2;
1043  INT end1, end2;
1044  INT nline, nlinenc;
1045 
1046  // information of A
1047  INT nx = A->nx;
1048  INT ngrid = A->ngrid; // number of grids
1049  INT nc = A->nc;
1050  INT nband = A->nband;
1051 
1052  REAL *diag = A->diag;
1053  REAL *offdiag0=NULL, *offdiag1=NULL, *offdiag2=NULL, *offdiag3=NULL;
1054 
1055  if (nx == 1)
1056  {
1057  nline = A->ny;
1058  }
1059  else
1060  {
1061  nline = nx;
1062  }
1063  nlinenc = nline*nc;
1064 
1065  for (i=0; i<nband; ++i)
1066  {
1067  if (A->offsets[i] == -1)
1068  {
1069  offdiag0 = A->offdiag[i];
1070  }
1071  else if (A->offsets[i] == 1)
1072  {
1073  offdiag1 = A->offdiag[i];
1074  }
1075  else if (A->offsets[i] == -nline)
1076  {
1077  offdiag2 = A->offdiag[i];
1078  }
1079  else if (A->offsets[i] == nline)
1080  {
1081  offdiag3 = A->offdiag[i];
1082  }
1083  else
1084  {
1085  printf("### WARNING: offsets for 2D scalar case is illegal! %s\n", __FUNCTION__);
1086  spaaxpy_str_general(alpha, A, x, y);
1087  return;
1088  }
1089  }
1090 
1091  end1 = ngrid-1;
1092  end2 = ngrid-nline;
1093 
1094  smat_amxv(alpha, diag, x, nc, y);
1095  smat_amxv(alpha, offdiag1, x+nc, nc, y);
1096  smat_amxv(alpha, offdiag3, x+nlinenc, nc, y);
1097 
1098  for (i=1; i<nline; ++i) {
1099  idx = i*nc;
1100  matidx = idx*nc;
1101  idx1 = idx - nc;
1102  matidx1 = idx1*nc;
1103  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1104  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1105  smat_amxv(alpha, offdiag1+matidx, x+idx+nc, nc, y+idx);
1106  smat_amxv(alpha, offdiag3+matidx, x+idx+nlinenc, nc, y+idx);
1107  }
1108 
1109 
1110  for (i=nx; i<end2; ++i) {
1111  idx = i*nc;
1112  idx1 = idx-nc;
1113  idx2 = idx-nlinenc;
1114  matidx = idx*nc;
1115  matidx1 = idx1*nc;
1116  matidx2 = idx2*nc;
1117  smat_amxv(alpha, offdiag2+matidx2, x+idx2, nc, y+idx);
1118  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1119  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1120  smat_amxv(alpha, offdiag1+matidx, x+idx+nc, nc, y+idx);
1121  smat_amxv(alpha, offdiag3+matidx, x+idx+nlinenc, nc, y+idx);
1122  }
1123 
1124  for (i=end2; i<end1; ++i) {
1125  idx = i*nc;
1126  idx1 = idx-nc;
1127  idx2 = idx-nlinenc;
1128  matidx = idx*nc;
1129  matidx1 = idx1*nc;
1130  matidx2 = idx2*nc;
1131  smat_amxv(alpha, offdiag2+matidx2, x+idx2, nc, y+idx);
1132  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1133  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1134  smat_amxv(alpha, offdiag1+matidx, x+idx+nc, nc, y+idx);
1135  }
1136 
1137  i=end1;
1138  idx = i*nc;
1139  idx1 = idx-nc;
1140  idx2 = idx-nlinenc;
1141  matidx = idx*nc;
1142  matidx1 = idx1*nc;
1143  matidx2 = idx2*nc;
1144  smat_amxv(alpha, offdiag2+matidx2, x+idx2, nc, y+idx);
1145  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1146  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1147 
1148  return;
1149 
1150 }
1151 
1168 static inline void spaaxpy_str_3D_scalar (REAL alpha,
1169  dSTRmat *A,
1170  REAL *x,
1171  REAL *y)
1172 {
1173  INT i;
1174  INT idx1,idx2,idx3;
1175  INT end1, end2, end3;
1176  // information of A
1177  INT nx = A->nx;
1178  INT nxy = A->nxy;
1179  INT ngrid = A->ngrid; // number of grids
1180  INT nband = A->nband;
1181 
1182  REAL *diag = A->diag;
1183  REAL *offdiag0=NULL, *offdiag1=NULL, *offdiag2=NULL,
1184  *offdiag3=NULL, *offdiag4=NULL, *offdiag5=NULL;
1185 
1186  for (i=0; i<nband; ++i)
1187  {
1188  if (A->offsets[i] == -1)
1189  {
1190  offdiag0 = A->offdiag[i];
1191  }
1192  else if (A->offsets[i] == 1)
1193  {
1194  offdiag1 = A->offdiag[i];
1195  }
1196  else if (A->offsets[i] == -nx)
1197  {
1198  offdiag2 = A->offdiag[i];
1199  }
1200  else if (A->offsets[i] == nx)
1201  {
1202  offdiag3 = A->offdiag[i];
1203  }
1204  else if (A->offsets[i] == -nxy)
1205  {
1206  offdiag4 = A->offdiag[i];
1207  }
1208  else if (A->offsets[i] == nxy)
1209  {
1210  offdiag5 = A->offdiag[i];
1211  }
1212  else
1213  {
1214  printf("### WARNING: offsets for 3D scalar case is illegal! %s\n", __FUNCTION__);
1215  spaaxpy_str_general(alpha, A, x, y);
1216  return;
1217  }
1218  }
1219 
1220  end1 = ngrid-1;
1221  end2 = ngrid-nx;
1222  end3 = ngrid-nxy;
1223 
1224  y[0] += alpha*(diag[0]*x[0] + offdiag1[0]*x[1] + offdiag3[0]*x[nx] + offdiag5[0]*x[nxy]);
1225 
1226  for (i=1; i<nx; ++i) {
1227  idx1 = i-1;
1228  y[i] += alpha*(offdiag0[idx1]*x[idx1] + diag[i]*x[i] + offdiag1[i]*x[i+1] +
1229  offdiag3[i]*x[i+nx] + offdiag5[i]*x[i+nxy]);
1230  }
1231 
1232  for (i=nx; i<nxy; ++i) {
1233  idx1 = i-1;
1234  idx2 = i-nx;
1235  y[i] += alpha*(offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] + diag[i]*x[i] +
1236  offdiag1[i]*x[i+1] + offdiag3[i]*x[i+nx] + offdiag5[i]*x[i+nxy]);
1237  }
1238 
1239  for (i=nxy; i<end3; ++i) {
1240  idx1 = i-1;
1241  idx2 = i-nx;
1242  idx3 = i-nxy;
1243  y[i] += alpha*(offdiag4[idx3]*x[idx3] + offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
1244  diag[i]*x[i] + offdiag1[i]*x[i+1] + offdiag3[i]*x[i+nx] + offdiag5[i]*x[i+nxy]);
1245  }
1246 
1247  for (i=end3; i<end2; ++i) {
1248  idx1 = i-1;
1249  idx2 = i-nx;
1250  idx3 = i-nxy;
1251  y[i] += alpha*(offdiag4[idx3]*x[idx3] + offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
1252  diag[i]*x[i] + offdiag1[i]*x[i+1] + offdiag3[i]*x[i+nx]);
1253  }
1254 
1255  for (i=end2; i<end1; ++i) {
1256  idx1 = i-1;
1257  idx2 = i-nx;
1258  idx3 = i-nxy;
1259  y[i] += alpha*(offdiag4[idx3]*x[idx3] + offdiag2[idx2]*x[idx2] + offdiag0[idx1]*x[idx1] +
1260  diag[i]*x[i] + offdiag1[i]*x[i+1]);
1261  }
1262 
1263  idx1 = end1-1;
1264  idx2 = end1-nx;
1265  idx3 = end1-nxy;
1266  y[end1] += alpha*(offdiag4[idx3]*x[idx3] + offdiag2[idx2]*x[idx2] +
1267  offdiag0[idx1]*x[idx1] + diag[end1]*x[end1]);
1268 
1269  return;
1270 
1271 }
1272 
1289 static inline void spaaxpy_str_3D_nc3 (REAL alpha,
1290  dSTRmat *A,
1291  REAL *x,
1292  REAL *y)
1293 {
1294  INT i;
1295  INT idx,idx1,idx2,idx3;
1296  INT matidx, matidx1, matidx2, matidx3;
1297  INT end1, end2, end3;
1298  // information of A
1299  INT nx = A->nx;
1300  INT nxy = A->nxy;
1301  INT ngrid = A->ngrid; // number of grids
1302  INT nc = A->nc;
1303  INT nxnc = nx*nc;
1304  INT nxync = nxy*nc;
1305  INT nband = A->nband;
1306 
1307  REAL *diag = A->diag;
1308  REAL *offdiag0=NULL, *offdiag1=NULL, *offdiag2=NULL,
1309  *offdiag3=NULL, *offdiag4=NULL, *offdiag5=NULL;
1310 
1311  for (i=0; i<nband; ++i)
1312  {
1313  if (A->offsets[i] == -1)
1314  {
1315  offdiag0 = A->offdiag[i];
1316  }
1317  else if (A->offsets[i] == 1)
1318  {
1319  offdiag1 = A->offdiag[i];
1320  }
1321  else if (A->offsets[i] == -nx)
1322  {
1323  offdiag2 = A->offdiag[i];
1324  }
1325  else if (A->offsets[i] == nx)
1326  {
1327  offdiag3 = A->offdiag[i];
1328  }
1329  else if (A->offsets[i] == -nxy)
1330  {
1331  offdiag4 = A->offdiag[i];
1332  }
1333  else if (A->offsets[i] == nxy)
1334  {
1335  offdiag5 = A->offdiag[i];
1336  }
1337  else
1338  {
1339  printf("### WARNING: offsets for 2D scalar case is illegal! %s\n", __FUNCTION__);
1340  spaaxpy_str_general(alpha, A, x, y);
1341  return;
1342  }
1343  }
1344 
1345  end1 = ngrid-1;
1346  end2 = ngrid-nx;
1347  end3 = ngrid-nxy;
1348 
1349  smat_amxv_nc3(alpha, diag, x, y);
1350  smat_amxv_nc3(alpha, offdiag1, x+nc, y);
1351  smat_amxv_nc3(alpha, offdiag3, x+nxnc, y);
1352  smat_amxv_nc3(alpha, offdiag5, x+nxync, y);
1353 
1354  for (i=1; i<nx; ++i) {
1355  idx = i*nc;
1356  matidx = idx*nc;
1357  idx1 = idx - nc;
1358  matidx1 = idx1*nc;
1359  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
1360  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
1361  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1362  smat_amxv_nc3(alpha, offdiag3+matidx, x+idx+nxnc, y+idx);
1363  smat_amxv_nc3(alpha, offdiag5+matidx, x+idx+nxync, y+idx);
1364  }
1365 
1366  for (i=nx; i<nxy; ++i) {
1367  idx = i*nc;
1368  idx1 = idx-nc;
1369  idx2 = idx-nxnc;
1370  matidx = idx*nc;
1371  matidx1 = idx1*nc;
1372  matidx2 = idx2*nc;
1373  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
1374  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
1375  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
1376  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1377  smat_amxv_nc3(alpha, offdiag3+matidx, x+idx+nxnc, y+idx);
1378  smat_amxv_nc3(alpha, offdiag5+matidx, x+idx+nxync, y+idx);
1379 
1380  }
1381 
1382  for (i=nxy; i<end3; ++i) {
1383  idx = i*nc;
1384  idx1 = idx-nc;
1385  idx2 = idx-nxnc;
1386  idx3 = idx-nxync;
1387  matidx = idx*nc;
1388  matidx1 = idx1*nc;
1389  matidx2 = idx2*nc;
1390  matidx3 = idx3*nc;
1391  smat_amxv_nc3(alpha, offdiag4+matidx3, x+idx3, y+idx);
1392  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
1393  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
1394  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
1395  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1396  smat_amxv_nc3(alpha, offdiag3+matidx, x+idx+nxnc, y+idx);
1397  smat_amxv_nc3(alpha, offdiag5+matidx, x+idx+nxync, y+idx);
1398  }
1399 
1400  for (i=end3; i<end2; ++i) {
1401  idx = i*nc;
1402  idx1 = idx-nc;
1403  idx2 = idx-nxnc;
1404  idx3 = idx-nxync;
1405  matidx = idx*nc;
1406  matidx1 = idx1*nc;
1407  matidx2 = idx2*nc;
1408  matidx3 = idx3*nc;
1409  smat_amxv_nc3(alpha, offdiag4+matidx3, x+idx3, y+idx);
1410  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
1411  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
1412  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
1413  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1414  smat_amxv_nc3(alpha, offdiag3+matidx, x+idx+nxnc, y+idx);
1415  }
1416 
1417  for (i=end2; i<end1; ++i) {
1418  idx = i*nc;
1419  idx1 = idx-nc;
1420  idx2 = idx-nxnc;
1421  idx3 = idx-nxync;
1422  matidx = idx*nc;
1423  matidx1 = idx1*nc;
1424  matidx2 = idx2*nc;
1425  matidx3 = idx3*nc;
1426  smat_amxv_nc3(alpha, offdiag4+matidx3, x+idx3, y+idx);
1427  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
1428  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
1429  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
1430  smat_amxv_nc3(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1431  }
1432 
1433  i=end1;
1434  idx = i*nc;
1435  idx1 = idx-nc;
1436  idx2 = idx-nxnc;
1437  idx3 = idx-nxync;
1438  matidx = idx*nc;
1439  matidx1 = idx1*nc;
1440  matidx2 = idx2*nc;
1441  matidx3 = idx3*nc;
1442  smat_amxv_nc3(alpha, offdiag4+matidx3, x+idx3, y+idx);
1443  smat_amxv_nc3(alpha, offdiag2+matidx2, x+idx2, y+idx);
1444  smat_amxv_nc3(alpha, offdiag0+matidx1, x+idx1, y+idx);
1445  smat_amxv_nc3(alpha, diag+matidx, x+idx, y+idx);
1446 
1447  return;
1448 
1449 }
1450 
1467 static inline void spaaxpy_str_3D_nc5 (REAL alpha,
1468  dSTRmat *A,
1469  REAL *x,
1470  REAL *y)
1471 {
1472  INT i;
1473  INT idx,idx1,idx2,idx3;
1474  INT matidx, matidx1, matidx2, matidx3;
1475  INT end1, end2, end3;
1476  // information of A
1477  INT nx = A->nx;
1478  INT nxy = A->nxy;
1479  INT ngrid = A->ngrid; // number of grids
1480  INT nc = A->nc;
1481  INT nxnc = nx*nc;
1482  INT nxync = nxy*nc;
1483  INT nband = A->nband;
1484 
1485  REAL *diag = A->diag;
1486  REAL *offdiag0=NULL, *offdiag1=NULL, *offdiag2=NULL,
1487  *offdiag3=NULL, *offdiag4=NULL, *offdiag5=NULL;
1488 
1489  for (i=0; i<nband; ++i)
1490  {
1491  if (A->offsets[i] == -1)
1492  {
1493  offdiag0 = A->offdiag[i];
1494  }
1495  else if (A->offsets[i] == 1)
1496  {
1497  offdiag1 = A->offdiag[i];
1498  }
1499  else if (A->offsets[i] == -nx)
1500  {
1501  offdiag2 = A->offdiag[i];
1502  }
1503  else if (A->offsets[i] == nx)
1504  {
1505  offdiag3 = A->offdiag[i];
1506  }
1507  else if (A->offsets[i] == -nxy)
1508  {
1509  offdiag4 = A->offdiag[i];
1510  }
1511  else if (A->offsets[i] == nxy)
1512  {
1513  offdiag5 = A->offdiag[i];
1514  }
1515  else
1516  {
1517  printf("### WARNING: offsets for 2D scalar case is illegal! %s\n", __FUNCTION__);
1518  spaaxpy_str_general(alpha, A, x, y);
1519  return;
1520  }
1521  }
1522 
1523  end1 = ngrid-1;
1524  end2 = ngrid-nx;
1525  end3 = ngrid-nxy;
1526 
1527  smat_amxv_nc5(alpha, diag, x, y);
1528  smat_amxv_nc5(alpha, offdiag1, x+nc, y);
1529  smat_amxv_nc5(alpha, offdiag3, x+nxnc, y);
1530  smat_amxv_nc5(alpha, offdiag5, x+nxync, y);
1531 
1532  for (i=1; i<nx; ++i) {
1533  idx = i*nc;
1534  matidx = idx*nc;
1535  idx1 = idx - nc;
1536  matidx1 = idx1*nc;
1537  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
1538  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
1539  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1540  smat_amxv_nc5(alpha, offdiag3+matidx, x+idx+nxnc, y+idx);
1541  smat_amxv_nc5(alpha, offdiag5+matidx, x+idx+nxync, y+idx);
1542  }
1543 
1544  for (i=nx; i<nxy; ++i) {
1545  idx = i*nc;
1546  idx1 = idx-nc;
1547  idx2 = idx-nxnc;
1548  matidx = idx*nc;
1549  matidx1 = idx1*nc;
1550  matidx2 = idx2*nc;
1551  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
1552  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
1553  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
1554  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1555  smat_amxv_nc5(alpha, offdiag3+matidx, x+idx+nxnc, y+idx);
1556  smat_amxv_nc5(alpha, offdiag5+matidx, x+idx+nxync, y+idx);
1557 
1558  }
1559 
1560  for (i=nxy; i<end3; ++i) {
1561  idx = i*nc;
1562  idx1 = idx-nc;
1563  idx2 = idx-nxnc;
1564  idx3 = idx-nxync;
1565  matidx = idx*nc;
1566  matidx1 = idx1*nc;
1567  matidx2 = idx2*nc;
1568  matidx3 = idx3*nc;
1569  smat_amxv_nc5(alpha, offdiag4+matidx3, x+idx3, y+idx);
1570  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
1571  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
1572  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
1573  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1574  smat_amxv_nc5(alpha, offdiag3+matidx, x+idx+nxnc, y+idx);
1575  smat_amxv_nc5(alpha, offdiag5+matidx, x+idx+nxync, y+idx);
1576  }
1577 
1578  for (i=end3; i<end2; ++i) {
1579  idx = i*nc;
1580  idx1 = idx-nc;
1581  idx2 = idx-nxnc;
1582  idx3 = idx-nxync;
1583  matidx = idx*nc;
1584  matidx1 = idx1*nc;
1585  matidx2 = idx2*nc;
1586  matidx3 = idx3*nc;
1587  smat_amxv_nc5(alpha, offdiag4+matidx3, x+idx3, y+idx);
1588  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
1589  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
1590  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
1591  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1592  smat_amxv_nc5(alpha, offdiag3+matidx, x+idx+nxnc, y+idx);
1593  }
1594 
1595  for (i=end2; i<end1; ++i) {
1596  idx = i*nc;
1597  idx1 = idx-nc;
1598  idx2 = idx-nxnc;
1599  idx3 = idx-nxync;
1600  matidx = idx*nc;
1601  matidx1 = idx1*nc;
1602  matidx2 = idx2*nc;
1603  matidx3 = idx3*nc;
1604  smat_amxv_nc5(alpha, offdiag4+matidx3, x+idx3, y+idx);
1605  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
1606  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
1607  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
1608  smat_amxv_nc5(alpha, offdiag1+matidx, x+idx+nc, y+idx);
1609  }
1610 
1611  i=end1;
1612  idx = i*nc;
1613  idx1 = idx-nc;
1614  idx2 = idx-nxnc;
1615  idx3 = idx-nxync;
1616  matidx = idx*nc;
1617  matidx1 = idx1*nc;
1618  matidx2 = idx2*nc;
1619  matidx3 = idx3*nc;
1620  smat_amxv_nc5(alpha, offdiag4+matidx3, x+idx3, y+idx);
1621  smat_amxv_nc5(alpha, offdiag2+matidx2, x+idx2, y+idx);
1622  smat_amxv_nc5(alpha, offdiag0+matidx1, x+idx1, y+idx);
1623  smat_amxv_nc5(alpha, diag+matidx, x+idx, y+idx);
1624 
1625  return;
1626 
1627 }
1628 
1645 static inline void spaaxpy_str_3D_block (REAL alpha,
1646  dSTRmat *A,
1647  REAL *x,
1648  REAL *y)
1649 {
1650  INT i;
1651  INT idx,idx1,idx2,idx3;
1652  INT matidx, matidx1, matidx2, matidx3;
1653  INT end1, end2, end3;
1654  // information of A
1655  INT nx = A->nx;
1656  INT nxy = A->nxy;
1657  INT ngrid = A->ngrid; // number of grids
1658  INT nc = A->nc;
1659  INT nxnc = nx*nc;
1660  INT nxync = nxy*nc;
1661  INT nband = A->nband;
1662 
1663  REAL *diag = A->diag;
1664  REAL *offdiag0=NULL, *offdiag1=NULL, *offdiag2=NULL,
1665  *offdiag3=NULL, *offdiag4=NULL, *offdiag5=NULL;
1666 
1667  for (i=0; i<nband; ++i)
1668  {
1669  if (A->offsets[i] == -1)
1670  {
1671  offdiag0 = A->offdiag[i];
1672  }
1673  else if (A->offsets[i] == 1)
1674  {
1675  offdiag1 = A->offdiag[i];
1676  }
1677  else if (A->offsets[i] == -nx)
1678  {
1679  offdiag2 = A->offdiag[i];
1680  }
1681  else if (A->offsets[i] == nx)
1682  {
1683  offdiag3 = A->offdiag[i];
1684  }
1685  else if (A->offsets[i] == -nxy)
1686  {
1687  offdiag4 = A->offdiag[i];
1688  }
1689  else if (A->offsets[i] == nxy)
1690  {
1691  offdiag5 = A->offdiag[i];
1692  }
1693  else
1694  {
1695  printf("### WARNING: offsets for 2D scalar case is illegal! %s\n", __FUNCTION__);
1696  spaaxpy_str_general(alpha, A, x, y);
1697  return;
1698  }
1699  }
1700 
1701  end1 = ngrid-1;
1702  end2 = ngrid-nx;
1703  end3 = ngrid-nxy;
1704 
1705  smat_amxv(alpha, diag, x, nc, y);
1706  smat_amxv(alpha, offdiag1, x+nc, nc, y);
1707  smat_amxv(alpha, offdiag3, x+nxnc, nc, y);
1708  smat_amxv(alpha, offdiag5, x+nxync, nc, y);
1709 
1710  for (i=1; i<nx; ++i) {
1711  idx = i*nc;
1712  matidx = idx*nc;
1713  idx1 = idx - nc;
1714  matidx1 = idx1*nc;
1715  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1716  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1717  smat_amxv(alpha, offdiag1+matidx, x+idx+nc, nc, y+idx);
1718  smat_amxv(alpha, offdiag3+matidx, x+idx+nxnc, nc, y+idx);
1719  smat_amxv(alpha, offdiag5+matidx, x+idx+nxync, nc, y+idx);
1720  }
1721 
1722  for (i=nx; i<nxy; ++i) {
1723  idx = i*nc;
1724  idx1 = idx-nc;
1725  idx2 = idx-nxnc;
1726  matidx = idx*nc;
1727  matidx1 = idx1*nc;
1728  matidx2 = idx2*nc;
1729  smat_amxv(alpha, offdiag2+matidx2, x+idx2, nc, y+idx);
1730  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1731  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1732  smat_amxv(alpha, offdiag1+matidx, x+idx+nc, nc, y+idx);
1733  smat_amxv(alpha, offdiag3+matidx, x+idx+nxnc, nc, y+idx);
1734  smat_amxv(alpha, offdiag5+matidx, x+idx+nxync, nc, y+idx);
1735 
1736  }
1737 
1738  for (i=nxy; i<end3; ++i) {
1739  idx = i*nc;
1740  idx1 = idx-nc;
1741  idx2 = idx-nxnc;
1742  idx3 = idx-nxync;
1743  matidx = idx*nc;
1744  matidx1 = idx1*nc;
1745  matidx2 = idx2*nc;
1746  matidx3 = idx3*nc;
1747  smat_amxv(alpha, offdiag4+matidx3, x+idx3, nc, y+idx);
1748  smat_amxv(alpha, offdiag2+matidx2, x+idx2, nc, y+idx);
1749  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1750  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1751  smat_amxv(alpha, offdiag1+matidx, x+idx+nc, nc, y+idx);
1752  smat_amxv(alpha, offdiag3+matidx, x+idx+nxnc, nc, y+idx);
1753  smat_amxv(alpha, offdiag5+matidx, x+idx+nxync, nc, y+idx);
1754  }
1755 
1756  for (i=end3; i<end2; ++i) {
1757  idx = i*nc;
1758  idx1 = idx-nc;
1759  idx2 = idx-nxnc;
1760  idx3 = idx-nxync;
1761  matidx = idx*nc;
1762  matidx1 = idx1*nc;
1763  matidx2 = idx2*nc;
1764  matidx3 = idx3*nc;
1765  smat_amxv(alpha, offdiag4+matidx3, x+idx3, nc, y+idx);
1766  smat_amxv(alpha, offdiag2+matidx2, x+idx2, nc, y+idx);
1767  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1768  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1769  smat_amxv(alpha, offdiag1+matidx, x+idx+nc, nc, y+idx);
1770  smat_amxv(alpha, offdiag3+matidx, x+idx+nxnc, nc, y+idx);
1771  }
1772 
1773  for (i=end2; i<end1; ++i) {
1774  idx = i*nc;
1775  idx1 = idx-nc;
1776  idx2 = idx-nxnc;
1777  idx3 = idx-nxync;
1778  matidx = idx*nc;
1779  matidx1 = idx1*nc;
1780  matidx2 = idx2*nc;
1781  matidx3 = idx3*nc;
1782  smat_amxv(alpha, offdiag4+matidx3, x+idx3, nc, y+idx);
1783  smat_amxv(alpha, offdiag2+matidx2, x+idx2, nc, y+idx);
1784  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1785  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1786  smat_amxv(alpha, offdiag1+matidx, x+idx+nc, nc, y+idx);
1787  }
1788 
1789  i=end1;
1790  idx = i*nc;
1791  idx1 = idx-nc;
1792  idx2 = idx-nxnc;
1793  idx3 = idx-nxync;
1794  matidx = idx*nc;
1795  matidx1 = idx1*nc;
1796  matidx2 = idx2*nc;
1797  matidx3 = idx3*nc;
1798  smat_amxv(alpha, offdiag4+matidx3, x+idx3, nc, y+idx);
1799  smat_amxv(alpha, offdiag2+matidx2, x+idx2, nc, y+idx);
1800  smat_amxv(alpha, offdiag0+matidx1, x+idx1, nc, y+idx);
1801  smat_amxv(alpha, diag+matidx, x+idx, nc, y+idx);
1802 
1803  return;
1804 
1805 }
1806 
1821 static inline void spaaxpy_str_general (REAL alpha,
1822  dSTRmat *A,
1823  REAL *x,
1824  REAL *y)
1825 {
1826  // information of A
1827  INT ngrid = A->ngrid; // number of grids
1828  INT nc = A->nc; // size of each block (number of components)
1829  INT nband = A->nband ; // number of off-diag band
1830  INT *offsets = A->offsets; // offsets of the off-diagals
1831  REAL *diag = A->diag; // Diagonal entries
1832  REAL **offdiag = A->offdiag; // Off-diagonal entries
1833 
1834  // local variables
1835  INT k;
1836  INT block = 0;
1837  INT point = 0;
1838  INT band = 0;
1839  INT width = 0;
1840  INT size = nc*ngrid;
1841  INT nc2 = nc*nc;
1842  INT ncw = 0;
1843  INT start_data = 0;
1844  INT start_vecx = 0;
1845  INT start_vecy = 0;
1846  INT start_vect = 0;
1847  REAL beta = 0.0;
1848 
1849  if (alpha == 0) {
1850  return; // nothing should be done
1851  }
1852 
1853  beta = 1.0/alpha;
1854 
1855  // y: = beta*y
1856  for (k = 0; k < size; ++k) {
1857  y[k] *= beta;
1858  }
1859 
1860  // y: = y + A*x
1861  if (nc > 1) {
1862  // Deal with the diagonal band
1863  for (block = 0; block < ngrid; ++block) {
1864  start_data = nc2*block;
1865  start_vect = nc*block;
1866  blkcontr_str(start_data,start_vect,start_vect,nc,diag,x,y);
1867  }
1868 
1869  // Deal with the off-diagonal bands
1870  for (band = 0; band < nband; band ++) {
1871  width = offsets[band];
1872  ncw = nc*width;
1873  if (width < 0) {
1874  for (block = 0; block < ngrid+width; ++block) {
1875  start_data = nc2*block;
1876  start_vecx = nc*block;
1877  start_vecy = start_vecx - ncw;
1878  blkcontr_str(start_data,start_vecx,start_vecy,nc,offdiag[band],x,y);
1879  }
1880  }
1881  else {
1882  for (block = 0; block < ngrid-width; ++block) {
1883  start_data = nc2*block;
1884  start_vecy = nc*block;
1885  start_vecx = start_vecy + ncw;
1886  blkcontr_str(start_data,start_vecx,start_vecy,nc,offdiag[band],x,y);
1887  }
1888  }
1889  }
1890  }
1891  else if (nc == 1) {
1892  // Deal with the diagonal band
1893  for (point = 0; point < ngrid; point ++) {
1894  y[point] += diag[point]*x[point];
1895  }
1896 
1897  // Deal with the off-diagonal bands
1898  for (band = 0; band < nband; band ++) {
1899  width = offsets[band];
1900  if (width < 0) {
1901  for (point = 0; point < ngrid+width; point ++) {
1902  y[point-width] += offdiag[band][point]*x[point];
1903  }
1904  }
1905  else {
1906  for (point = 0; point < ngrid-width; point ++) {
1907  y[point] += offdiag[band][point]*x[point+width];
1908  }
1909  }
1910  }
1911  }
1912  else {
1913  printf("### WARNING: nc is illegal! %s\n", __FUNCTION__);
1914  return;
1915  }
1916 
1917  // y: = alpha*y
1918  for (k = 0; k < size; ++k) {
1919  y[k] *= alpha;
1920  }
1921 }
1922 
1923 /*---------------------------------*/
1924 /*-- End of File --*/
1925 /*---------------------------------*/
INT nx
number of grids in x direction
Definition: fasp.h:307
#define REAL
Definition: fasp.h:67
void fasp_blas_dstr_aAxpy(const REAL alpha, dSTRmat *A, REAL *x, REAL *y)
Matrix-vector multiplication y = alpha*A*x + y.
Definition: blas_str.c:47
INT ngrid
number of grids
Definition: fasp.h:322
INT fasp_blas_smat_inv(REAL *a, const INT n)
Compute the inverse matrix of a small full matrix a.
Definition: smat.c:554
INT nc
size of each block (number of components)
Definition: fasp.h:319
REAL ** offdiag
off-diagonal entries (dimension is nband * [(ngrid-|offsets|) * nc^2])
Definition: fasp.h:334
INT * offsets
offsets of the off-diagonals (length is nband)
Definition: fasp.h:331
Structure matrix of REAL type.
Definition: fasp.h:304
INT nz
number of grids in z direction
Definition: fasp.h:313
void * fasp_mem_calloc(LONGLONG size, INT type)
1M = 1024*1024
Definition: memory.c:60
void fasp_blas_smat_mul(REAL *a, REAL *b, REAL *c, const INT n)
Compute the matrix product of two small full matrices a and b, stored in c.
Definition: blas_smat.c:448
#define INT
Definition: fasp.h:64
INT nband
number of off-diag bands
Definition: fasp.h:328
void FASP_GET_START_END(INT procid, INT nprocs, INT n, INT *start, INT *end)
Assign Load to each thread.
Definition: threads.c:83
void fasp_mem_free(void *mem)
Free up previous allocated memory body.
Definition: memory.c:150
#define OPENMP_HOLDS
Definition: fasp_const.h:248
void fasp_blas_dstr_mxv(dSTRmat *A, REAL *x, REAL *y)
Matrix-vector multiplication y = A*x.
Definition: blas_str.c:117
Main header file for FASP.
REAL * diag
diagonal entries (length is ngrid*(nc^2))
Definition: fasp.h:325
void fasp_dstr_alloc(const INT nx, const INT ny, const INT nz, const INT nxy, const INT ngrid, const INT nband, const INT nc, INT *offsets, dSTRmat *A)
Allocate STR sparse matrix memory space.
Definition: sparse_str.c:109
INT nxy
number of grids on x-y plane
Definition: fasp.h:316
void fasp_array_cp(const INT n, REAL *x, REAL *y)
Copy an array to the other y=x.
Definition: array.c:165
INT ny
number of grids in y direction
Definition: fasp.h:310
INT fasp_dstr_diagscale(dSTRmat *A, dSTRmat *B)
B=D^{-1}A.
Definition: blas_str.c:142