Fast Auxiliary Space Preconditioning  1.8.4 Feb/15/2016
blas_csr.c
Go to the documentation of this file.
1 
15 #include <math.h>
16 #include <time.h>
17 
18 #ifdef _OPENMP
19 #include <omp.h>
20 #endif
21 
22 #include "fasp.h"
23 #include "fasp_functs.h"
24 
25 /*---------------------------------*/
26 /*-- Public Functions --*/
27 /*---------------------------------*/
28 
49  const REAL alpha,
50  dCSRmat *B,
51  const REAL beta,
52  dCSRmat *C)
53 {
54  INT i,j,k,l;
55  INT count=0, added, countrow;
56  INT status = FASP_SUCCESS;
57 
58  INT use_openmp = FALSE;
59 
60 #ifdef _OPENMP
61  INT mybegin, myend, myid, nthreads;
62  if ( A->nnz > OPENMP_HOLDS ) {
63  use_openmp = TRUE;
64  nthreads = FASP_GET_NUM_THREADS();
65  }
66 #endif
67 
68  if (A->row != B->row || A->col != B->col) {
69  printf("### ERROR: Dimensions of A & B do not match! %s\n", __FUNCTION__);
70  status = ERROR_DATA_STRUCTURE;
71  goto FINISHED;
72  }
73 
74  if (A == NULL && B == NULL) {
75  C->row=0; C->col=0; C->nnz=0;
76  status=FASP_SUCCESS; goto FINISHED;
77  }
78 
79  if (A->nnz == 0 && B->nnz == 0) {
80  C->row=A->row; C->col=A->col; C->nnz=A->nnz;
81  status=FASP_SUCCESS; goto FINISHED;
82  }
83 
84  // empty matrix A
85  if (A->nnz == 0 || A == NULL) {
86  fasp_dcsr_alloc(B->row,B->col,B->nnz,C);
87  memcpy(C->IA,B->IA,(B->row+1)*sizeof(INT));
88  memcpy(C->JA,B->JA,(B->nnz)*sizeof(INT));
89 
90  if (use_openmp) {
91 #ifdef _OPENMP
92 #pragma omp parallel private(myid, mybegin, myend, i)
93  {
94  myid = omp_get_thread_num();
95  FASP_GET_START_END(myid, nthreads, A->nnz, &mybegin, &myend);
96  for (i=mybegin;i<myend;++i) C->val[i]=B->val[i]*beta;
97  }
98 #endif
99  }
100  else {
101  for (i=0;i<A->nnz;++i) C->val[i]=B->val[i]*beta;
102  }
103 
104  status=FASP_SUCCESS;
105  goto FINISHED;
106  }
107 
108  // empty matrix B
109  if (B->nnz == 0 || B == NULL) {
110  fasp_dcsr_alloc(A->row,A->col,A->nnz,C);
111  memcpy(C->IA,A->IA,(A->row+1)*sizeof(INT));
112  memcpy(C->JA,A->JA,(A->nnz)*sizeof(INT));
113 
114  if (use_openmp) {
115 #ifdef _OPENMP
116  INT mybegin, myend, myid;
117 #pragma omp parallel private(myid, mybegin, myend, i)
118  {
119  myid = omp_get_thread_num();
120  FASP_GET_START_END(myid, nthreads, A->nnz, &mybegin, &myend);
121  for (i=mybegin;i<myend;++i) C->val[i]=A->val[i]*alpha;
122  }
123 #endif
124  }
125  else {
126  for (i=0;i<A->nnz;++i) C->val[i]=A->val[i]*alpha;
127  }
128 
129  status=FASP_SUCCESS; goto FINISHED;
130  }
131 
132  C->row=A->row; C->col=A->col;
133 
134  C->IA=(INT*)fasp_mem_calloc(C->row+1,sizeof(INT));
135 
136  // allocate work space for C->JA and C->val
137  C->JA=(INT *)fasp_mem_calloc(A->nnz+B->nnz,sizeof(INT));
138 
139  C->val=(REAL *)fasp_mem_calloc(A->nnz+B->nnz,sizeof(REAL));
140 
141  // initial C->IA
142  memset(C->IA, 0, sizeof(INT)*(C->row+1));
143  memset(C->JA, -1, sizeof(INT)*(A->nnz+B->nnz));
144 
145 
146  for (i=0; i<A->row; ++i) {
147  countrow = 0;
148  for (j=A->IA[i]; j<A->IA[i+1]; ++j) {
149  C->val[count] = alpha * A->val[j];
150  C->JA[count] = A->JA[j];
151  C->IA[i+1]++;
152  count++;
153  countrow++;
154  } // end for js
155 
156  for (k=B->IA[i]; k<B->IA[i+1]; ++k) {
157  added = 0;
158 
159  for (l=C->IA[i]; l<C->IA[i]+countrow+1; l++) {
160  if (B->JA[k] == C->JA[l]) {
161  C->val[l] = C->val[l] + beta * B->val[k];
162  added = 1;
163  break;
164  }
165  } // end for l
166 
167  if (added == 0) {
168  C->val[count] = beta * B->val[k];
169  C->JA[count] = B->JA[k];
170  C->IA[i+1]++;
171  count++;
172  }
173 
174  } // end for k
175 
176  C->IA[i+1] += C->IA[i];
177 
178  }
179 
180  C->nnz = count;
181  C->JA = (INT *)fasp_mem_realloc(C->JA, (count)*sizeof(INT));
182  C->val = (REAL *)fasp_mem_realloc(C->val, (count)*sizeof(REAL));
183 
184 FINISHED:
185  return status;
186 }
187 
202  const REAL alpha)
203 {
204  const INT nnz=A->nnz;
205 
206  // A direct calculation can be written as:
207  fasp_blas_array_ax(nnz, alpha, A->val);
208 }
209 
226  REAL *x,
227  REAL *y)
228 {
229  const INT m=A->row;
230  const INT *ia=A->IA, *ja=A->JA;
231  const REAL *aj=A->val;
232  INT i, k, begin_row, end_row, nnz_num_row;
233  register REAL temp;
234 
235  INT nthreads = 1, use_openmp = FALSE;
236 
237 #ifdef _OPENMP
238  if ( m > OPENMP_HOLDS ) {
239  use_openmp = TRUE;
240  nthreads = FASP_GET_NUM_THREADS();
241  }
242 #endif
243 
244  if (use_openmp) {
245  INT myid, mybegin, myend;
246 
247 #ifdef _OPENMP
248 #pragma omp parallel for private(myid, mybegin, myend, i, temp, begin_row, end_row, nnz_num_row, k)
249 #endif
250  for (myid = 0; myid < nthreads; myid++ ) {
251  FASP_GET_START_END(myid, nthreads, m, &mybegin, &myend);
252  for (i=mybegin; i<myend; ++i) {
253  temp=0.0;
254  begin_row = ia[i];
255  end_row = ia[i+1];
256  nnz_num_row = end_row - begin_row;
257  switch(nnz_num_row) {
258  case 3:
259  k = begin_row;
260  temp += aj[k]*x[ja[k]];
261  k ++;
262  temp += aj[k]*x[ja[k]];
263  k ++;
264  temp += aj[k]*x[ja[k]];
265  break;
266  case 4:
267  k = begin_row;
268  temp += aj[k]*x[ja[k]];
269  k ++;
270  temp += aj[k]*x[ja[k]];
271  k ++;
272  temp += aj[k]*x[ja[k]];
273  k ++;
274  temp += aj[k]*x[ja[k]];
275  break;
276  case 5:
277  k = begin_row;
278  temp += aj[k]*x[ja[k]];
279  k ++;
280  temp += aj[k]*x[ja[k]];
281  k ++;
282  temp += aj[k]*x[ja[k]];
283  k ++;
284  temp += aj[k]*x[ja[k]];
285  k ++;
286  temp += aj[k]*x[ja[k]];
287  break;
288  case 6:
289  k = begin_row;
290  temp += aj[k]*x[ja[k]];
291  k ++;
292  temp += aj[k]*x[ja[k]];
293  k ++;
294  temp += aj[k]*x[ja[k]];
295  k ++;
296  temp += aj[k]*x[ja[k]];
297  k ++;
298  temp += aj[k]*x[ja[k]];
299  k ++;
300  temp += aj[k]*x[ja[k]];
301  break;
302  case 7:
303  k = begin_row;
304  temp += aj[k]*x[ja[k]];
305  k ++;
306  temp += aj[k]*x[ja[k]];
307  k ++;
308  temp += aj[k]*x[ja[k]];
309  k ++;
310  temp += aj[k]*x[ja[k]];
311  k ++;
312  temp += aj[k]*x[ja[k]];
313  k ++;
314  temp += aj[k]*x[ja[k]];
315  k ++;
316  temp += aj[k]*x[ja[k]];
317  break;
318  default:
319  for (k=begin_row; k<end_row; ++k) {
320  temp += aj[k]*x[ja[k]];
321  }
322  break;
323  }
324  y[i]=temp;
325  }
326  }
327  }
328 
329  else {
330  for (i=0;i<m;++i) {
331  temp=0.0;
332  begin_row=ia[i];
333  end_row=ia[i+1];
334  nnz_num_row = end_row - begin_row;
335  switch(nnz_num_row) {
336  case 3:
337  k=begin_row;
338  temp+=aj[k]*x[ja[k]];
339  k ++;
340  temp+=aj[k]*x[ja[k]];
341  k ++;
342  temp+=aj[k]*x[ja[k]];
343  break;
344  case 4:
345  k=begin_row;
346  temp+=aj[k]*x[ja[k]];
347  k ++;
348  temp+=aj[k]*x[ja[k]];
349  k ++;
350  temp+=aj[k]*x[ja[k]];
351  k ++;
352  temp+=aj[k]*x[ja[k]];
353  break;
354  case 5:
355  k=begin_row;
356  temp+=aj[k]*x[ja[k]];
357  k ++;
358  temp+=aj[k]*x[ja[k]];
359  k ++;
360  temp+=aj[k]*x[ja[k]];
361  k ++;
362  temp+=aj[k]*x[ja[k]];
363  k ++;
364  temp+=aj[k]*x[ja[k]];
365  break;
366  case 6:
367  k=begin_row;
368  temp+=aj[k]*x[ja[k]];
369  k ++;
370  temp+=aj[k]*x[ja[k]];
371  k ++;
372  temp+=aj[k]*x[ja[k]];
373  k ++;
374  temp+=aj[k]*x[ja[k]];
375  k ++;
376  temp+=aj[k]*x[ja[k]];
377  k ++;
378  temp+=aj[k]*x[ja[k]];
379  break;
380  case 7:
381  k=begin_row;
382  temp+=aj[k]*x[ja[k]];
383  k ++;
384  temp+=aj[k]*x[ja[k]];
385  k ++;
386  temp+=aj[k]*x[ja[k]];
387  k ++;
388  temp+=aj[k]*x[ja[k]];
389  k ++;
390  temp+=aj[k]*x[ja[k]];
391  k ++;
392  temp+=aj[k]*x[ja[k]];
393  k ++;
394  temp+=aj[k]*x[ja[k]];
395  break;
396  default:
397  for (k=begin_row; k<end_row; ++k) {
398  temp+=aj[k]*x[ja[k]];
399  }
400  break;
401  }
402  y[i]=temp;
403  }
404  }
405 }
406 
407 
424  REAL *x,
425  REAL *y)
426 {
427  const INT m = A->row;
428  const INT *ia = A->IA, *ja = A->JA;
429  INT i, k, begin_row, end_row;
430  register REAL temp;
431 
432 #ifdef _OPENMP
433  // variables for OpenMP
434  INT myid, mybegin, myend;
435  INT nthreads = FASP_GET_NUM_THREADS();
436 #endif
437 
438 #ifdef _OPENMP
439  if (m > OPENMP_HOLDS) {
440 #pragma omp parallel for private(myid, i, mybegin, myend, temp, begin_row, end_row, k)
441  for (myid=0; myid<nthreads; myid++) {
442  FASP_GET_START_END(myid, nthreads, m, &mybegin, &myend);
443  for (i=mybegin; i<myend; i++) {
444  temp=0.0;
445  begin_row=ia[i]; end_row=ia[i+1];
446  for (k=begin_row; k<end_row; ++k) temp+=x[ja[k]];
447  y[i]=temp;
448  }
449  }
450  }
451  else {
452 #endif
453  for (i=0;i<m;++i) {
454  temp=0.0;
455  begin_row=ia[i]; end_row=ia[i+1];
456  for (k=begin_row; k<end_row; ++k) temp+=x[ja[k]];
457  y[i]=temp;
458  }
459 #ifdef _OPENMP
460  }
461 #endif
462 }
463 
479 void fasp_blas_dcsr_aAxpy (const REAL alpha,
480  dCSRmat *A,
481  REAL *x,
482  REAL *y)
483 {
484  const INT m = A->row;
485  const INT *ia = A->IA, *ja = A->JA;
486  const REAL *aj = A->val;
487  INT i, k, begin_row, end_row;
488  register REAL temp;
489 
490  INT nthreads = 1, use_openmp = FALSE;
491 
492 #ifdef _OPENMP
493  if ( m > OPENMP_HOLDS ) {
494  use_openmp = TRUE;
495  nthreads = FASP_GET_NUM_THREADS();
496  }
497 #endif
498 
499  if ( alpha == 1.0 ) {
500  if (use_openmp) {
501  INT myid, mybegin, myend;
502 #ifdef _OPENMP
503 #pragma omp parallel for private(myid, mybegin, myend, i, temp, begin_row, end_row, k)
504 #endif
505  for (myid = 0; myid < nthreads; myid++ ) {
506  FASP_GET_START_END(myid, nthreads, m, &mybegin, &myend);
507  for (i=mybegin; i<myend; ++i) {
508  temp=0.0;
509  begin_row=ia[i]; end_row=ia[i+1];
510  for (k=begin_row; k<end_row; ++k) temp+=aj[k]*x[ja[k]];
511  y[i]+=temp;
512  }
513  }
514  }
515  else {
516  for (i=0;i<m;++i) {
517  temp=0.0;
518  begin_row=ia[i]; end_row=ia[i+1];
519  for (k=begin_row; k<end_row; ++k) temp+=aj[k]*x[ja[k]];
520  y[i]+=temp;
521  }
522  }
523  }
524 
525  else if ( alpha == -1.0 ) {
526  if (use_openmp) {
527  INT myid, mybegin, myend;
528 #ifdef _OPENMP
529 #pragma omp parallel for private(myid, mybegin, myend, temp, i, begin_row, end_row, k)
530 #endif
531  for (myid = 0; myid < nthreads; myid++ ) {
532  FASP_GET_START_END(myid, nthreads, m, &mybegin, &myend);
533  for (i=mybegin; i<myend; ++i) {
534  temp=0.0;
535  begin_row=ia[i]; end_row=ia[i+1];
536  for (k=begin_row; k<end_row; ++k) temp+=aj[k]*x[ja[k]];
537  y[i]-=temp;
538  }
539  }
540  }
541  else {
542  for (i=0;i<m;++i) {
543  temp=0.0;
544  begin_row=ia[i]; end_row=ia[i+1];
545  for (k=begin_row; k<end_row; ++k) temp+=aj[k]*x[ja[k]];
546  y[i]-=temp;
547  }
548  }
549  }
550 
551  else {
552  if (use_openmp) {
553  INT myid, mybegin, myend;
554 #ifdef _OPENMP
555 #pragma omp parallel for private(myid, mybegin, myend, i, temp, begin_row, end_row, k)
556 #endif
557  for (myid = 0; myid < nthreads; myid++ ) {
558  FASP_GET_START_END(myid, nthreads, m, &mybegin, &myend);
559  for (i=mybegin; i<myend; ++i) {
560  temp=0.0;
561  begin_row=ia[i]; end_row=ia[i+1];
562  for (k=begin_row; k<end_row; ++k) temp+=aj[k]*x[ja[k]];
563  y[i]+=temp*alpha;
564  }
565  }
566  }
567  else {
568  for (i=0;i<m;++i) {
569  temp=0.0;
570  begin_row=ia[i]; end_row=ia[i+1];
571  for (k=begin_row; k<end_row; ++k) temp+=aj[k]*x[ja[k]];
572  y[i]+=temp*alpha;
573  }
574  }
575  }
576 }
577 
593 void fasp_blas_dcsr_aAxpy_agg (const REAL alpha,
594  dCSRmat *A,
595  REAL *x,
596  REAL *y)
597 {
598  const INT m = A->row;
599  const INT *ia = A->IA, *ja = A->JA;
600 
601  INT i, k, begin_row, end_row;
602  register REAL temp;
603 
604  if ( alpha == 1.0 ) {
605 #ifdef _OPENMP
606  if (m > OPENMP_HOLDS) {
607  INT myid, mybegin, myend;
608  INT nthreads = FASP_GET_NUM_THREADS();
609 #pragma omp parallel for private(myid, i, mybegin, myend, begin_row, end_row, temp, k)
610  for (myid = 0; myid < nthreads; myid++) {
611  FASP_GET_START_END(myid, nthreads, m, &mybegin, &myend);
612  for (i = mybegin; i < myend; ++i) {
613  temp=0.0;
614  begin_row=ia[i]; end_row=ia[i+1];
615  for (k=begin_row; k<end_row; ++k) temp+=x[ja[k]];
616  y[i]+=temp;
617  }
618  }
619  }
620  else {
621 #endif
622  for (i=0;i<m;++i) {
623  temp=0.0;
624  begin_row=ia[i]; end_row=ia[i+1];
625  for (k=begin_row; k<end_row; ++k) temp+=x[ja[k]];
626  y[i]+=temp;
627  }
628 #ifdef _OPENMP
629  }
630 #endif
631  }
632  else if ( alpha == -1.0 ) {
633 #ifdef _OPENMP
634  if (m > OPENMP_HOLDS) {
635  INT myid, mybegin, myend;
636  INT nthreads = FASP_GET_NUM_THREADS();
637 #pragma omp parallel for private(myid, i, mybegin, myend, begin_row, end_row, temp, k)
638  for (myid = 0; myid < nthreads; myid++) {
639  FASP_GET_START_END(myid, nthreads, m, &mybegin, &myend);
640  for (i = mybegin; i < myend; ++i) {
641  temp=0.0;
642  begin_row=ia[i]; end_row=ia[i+1];
643  for (k=begin_row; k<end_row; ++k) temp+=x[ja[k]];
644  y[i]-=temp;
645  }
646  }
647  }
648  else {
649 #endif
650  for (i=0;i<m;++i) {
651  temp=0.0;
652  begin_row=ia[i]; end_row=ia[i+1];
653  for (k=begin_row; k<end_row; ++k) temp+=x[ja[k]];
654  y[i]-=temp;
655  }
656 #ifdef _OPENMP
657  }
658 #endif
659  }
660 
661  else {
662 #ifdef _OPENMP
663  if (m > OPENMP_HOLDS) {
664  INT myid, mybegin, myend;
665  INT nthreads = FASP_GET_NUM_THREADS();
666 #pragma omp parallel for private(myid, i, mybegin, myend, begin_row, end_row, temp, k)
667  for (myid = 0; myid < nthreads; myid++) {
668  FASP_GET_START_END(myid, nthreads, m, &mybegin, &myend);
669  for (i = mybegin; i < myend; ++i) {
670  temp=0.0;
671  begin_row=ia[i]; end_row=ia[i+1];
672  for (k=begin_row; k<end_row; ++k) temp+=x[ja[k]];
673  y[i]+=temp*alpha;
674  }
675  }
676  }
677  else {
678 #endif
679  for (i=0;i<m;++i) {
680  temp=0.0;
681  begin_row=ia[i]; end_row=ia[i+1];
682  for (k=begin_row; k<end_row; ++k) temp+=x[ja[k]];
683  y[i]+=temp*alpha;
684  }
685 #ifdef _OPENMP
686  }
687 #endif
688  }
689 
690 }
691 
705  REAL *x,
706  REAL *y)
707 {
708  register REAL value=0.0;
709  const INT m=A->row;
710  const INT *ia=A->IA, *ja=A->JA;
711  const REAL *aj=A->val;
712  INT i, k, begin_row, end_row;
713  register REAL temp;
714 
715  INT use_openmp = FALSE;
716 
717 #ifdef _OPENMP
718  if ( m > OPENMP_HOLDS ) {
719  use_openmp = TRUE;
720  }
721 #endif
722 
723  if (use_openmp) {
724 #ifdef _OPENMP
725 #pragma omp parallel for reduction(+:value) private(i,temp,begin_row,end_row,k)
726 #endif
727  for (i=0;i<m;++i) {
728  temp=0.0;
729  begin_row=ia[i]; end_row=ia[i+1];
730  for (k=begin_row; k<end_row; ++k) temp+=aj[k]*x[ja[k]];
731  value+=y[i]*temp;
732  }
733  }
734  else {
735  for (i=0;i<m;++i) {
736  temp=0.0;
737  begin_row=ia[i]; end_row=ia[i+1];
738  for (k=begin_row; k<end_row; ++k) temp+=aj[k]*x[ja[k]];
739  value+=y[i]*temp;
740  }
741  }
742  return value;
743 }
744 
760  dCSRmat *B,
761  dCSRmat *C)
762 {
763  INT i,j,k,l,count;
764 
765  INT *JD = (INT *)fasp_mem_calloc(B->col,sizeof(INT));
766 
767  C->row=A->row;
768  C->col=B->col;
769  C->val = NULL;
770  C->JA = NULL;
771  C->IA = (INT*)fasp_mem_calloc(C->row+1,sizeof(INT));
772 
773  for (i=0;i<B->col;++i) JD[i]=-1;
774 
775  // step 1: Find first the structure IA of C
776  for(i=0;i<C->row;++i) {
777  count=0;
778 
779  for (k=A->IA[i];k<A->IA[i+1];++k) {
780  for (j=B->IA[A->JA[k]];j<B->IA[A->JA[k]+1];++j) {
781  for (l=0;l<count;l++) {
782  if (JD[l]==B->JA[j]) break;
783  }
784 
785  if (l==count) {
786  JD[count]=B->JA[j];
787  count++;
788  }
789  }
790  }
791  C->IA[i+1]=count;
792  for (j=0;j<count;++j) {
793  JD[j]=-1;
794  }
795  }
796 
797  for (i=0;i<C->row;++i) C->IA[i+1]+=C->IA[i];
798 
799  // step 2: Find the structure JA of C
800  INT countJD;
801 
802  C->JA=(INT*)fasp_mem_calloc(C->IA[C->row],sizeof(INT));
803 
804  for (i=0;i<C->row;++i) {
805  countJD=0;
806  count=C->IA[i];
807  for (k=A->IA[i];k<A->IA[i+1];++k) {
808  for (j=B->IA[A->JA[k]];j<B->IA[A->JA[k]+1];++j) {
809  for (l=0;l<countJD;l++) {
810  if (JD[l]==B->JA[j]) break;
811  }
812 
813  if (l==countJD) {
814  C->JA[count]=B->JA[j];
815  JD[countJD]=B->JA[j];
816  count++;
817  countJD++;
818  }
819  }
820  }
821 
822  //for (j=0;j<countJD;++j) JD[j]=-1;
823  fasp_iarray_set(countJD, JD, -1);
824  }
825 
826  fasp_mem_free(JD);
827 
828  // step 3: Find the structure A of C
829  C->val=(REAL*)fasp_mem_calloc(C->IA[C->row],sizeof(REAL));
830 
831  for (i=0;i<C->row;++i) {
832  for (j=C->IA[i];j<C->IA[i+1];++j) {
833  C->val[j]=0;
834  for (k=A->IA[i];k<A->IA[i+1];++k) {
835  for (l=B->IA[A->JA[k]];l<B->IA[A->JA[k]+1];l++) {
836  if (B->JA[l]==C->JA[j]) {
837  C->val[j]+=A->val[k]*B->val[l];
838  } // end if
839  } // end for l
840  } // end for k
841  } // end for j
842  } // end for i
843 
844  C->nnz = C->IA[C->row]-C->IA[0];
845 
846 }
847 
867  dCSRmat *A,
868  dCSRmat *P,
869  dCSRmat *RAP)
870 {
871  INT n_coarse = R->row;
872  INT *R_i = R->IA;
873  INT *R_j = R->JA;
874  REAL *R_data = R->val;
875 
876  INT n_fine = A->row;
877  INT *A_i = A->IA;
878  INT *A_j = A->JA;
879  REAL *A_data = A->val;
880 
881  INT *P_i = P->IA;
882  INT *P_j = P->JA;
883  REAL *P_data = P->val;
884 
885  INT RAP_size;
886  INT *RAP_i = NULL;
887  INT *RAP_j = NULL;
888  REAL *RAP_data = NULL;
889 
890 #ifdef _OPENMP
891  INT *P_marker = NULL;
892  INT *A_marker = NULL;
893 #endif
894 
895  INT *Ps_marker = NULL;
896  INT *As_marker = NULL;
897 
898  INT ic, i1, i2, i3, jj1, jj2, jj3;
899  INT jj_counter, jj_row_begining;
900  REAL r_entry, r_a_product, r_a_p_product;
901 
902  INT nthreads = 1;
903 
904 #ifdef _OPENMP
905  INT myid, mybegin, myend, Ctemp;
906  nthreads = FASP_GET_NUM_THREADS();
907 #endif
908 
909  INT coarse_mul_nthreads = n_coarse * nthreads;
910  INT fine_mul_nthreads = n_fine * nthreads;
911  INT coarse_add_nthreads = n_coarse + nthreads;
912  INT minus_one_length = coarse_mul_nthreads + fine_mul_nthreads;
913  INT total_calloc = minus_one_length + coarse_add_nthreads + nthreads;
914 
915  Ps_marker = (INT *)fasp_mem_calloc(total_calloc, sizeof(INT));
916  As_marker = Ps_marker + coarse_mul_nthreads;
917 
918  /*------------------------------------------------------*
919  * First Pass: Determine size of RAP and set up RAP_i *
920  *------------------------------------------------------*/
921  RAP_i = (INT *)fasp_mem_calloc(n_coarse+1, sizeof(INT));
922 
923  fasp_iarray_set(minus_one_length, Ps_marker, -1);
924 
925 #ifdef _OPENMP
926  INT * RAP_temp = As_marker + fine_mul_nthreads;
927  INT * part_end = RAP_temp + coarse_add_nthreads;
928 
929  if (n_coarse > OPENMP_HOLDS) {
930 #pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
931  jj_counter, ic, jj_row_begining, jj1, i1, jj2, i2, jj3, i3)
932  for (myid = 0; myid < nthreads; myid++) {
933  FASP_GET_START_END(myid, nthreads, n_coarse, &mybegin, &myend);
934  P_marker = Ps_marker + myid * n_coarse;
935  A_marker = As_marker + myid * n_fine;
936  jj_counter = 0;
937  for (ic = mybegin; ic < myend; ic ++) {
938  P_marker[ic] = jj_counter;
939  jj_row_begining = jj_counter;
940  jj_counter ++;
941 
942  for (jj1 = R_i[ic]; jj1 < R_i[ic+1]; jj1 ++) {
943  i1 = R_j[jj1];
944  for (jj2 = A_i[i1]; jj2 < A_i[i1+1]; jj2 ++) {
945  i2 = A_j[jj2];
946  if (A_marker[i2] != ic) {
947  A_marker[i2] = ic;
948  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
949  i3 = P_j[jj3];
950  if (P_marker[i3] < jj_row_begining) {
951  P_marker[i3] = jj_counter;
952  jj_counter ++;
953  }
954  }
955  }
956  }
957  }
958 
959  RAP_temp[ic+myid] = jj_row_begining;
960  }
961  RAP_temp[myend+myid] = jj_counter;
962 
963  part_end[myid] = myend + myid + 1;
964  }
965  fasp_iarray_cp(part_end[0], RAP_temp, RAP_i);
966  jj_counter = part_end[0];
967  Ctemp = 0;
968  for (i1 = 1; i1 < nthreads; i1 ++) {
969  Ctemp += RAP_temp[part_end[i1-1]-1];
970  for (jj1 = part_end[i1-1]+1; jj1 < part_end[i1]; jj1 ++) {
971  RAP_i[jj_counter] = RAP_temp[jj1] + Ctemp;
972  jj_counter ++;
973  }
974  }
975  RAP_size = RAP_i[n_coarse];
976  }
977 
978  else {
979 #endif
980  jj_counter = 0;
981  for (ic = 0; ic < n_coarse; ic ++) {
982  Ps_marker[ic] = jj_counter;
983  jj_row_begining = jj_counter;
984  jj_counter ++;
985 
986  for (jj1 = R_i[ic]; jj1 < R_i[ic+1]; jj1 ++) {
987  i1 = R_j[jj1];
988 
989  for (jj2 = A_i[i1]; jj2 < A_i[i1+1]; jj2 ++) {
990  i2 = A_j[jj2];
991  if (As_marker[i2] != ic) {
992  As_marker[i2] = ic;
993  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
994  i3 = P_j[jj3];
995  if (Ps_marker[i3] < jj_row_begining) {
996  Ps_marker[i3] = jj_counter;
997  jj_counter ++;
998  }
999  }
1000  }
1001  }
1002  }
1003 
1004  RAP_i[ic] = jj_row_begining;
1005  }
1006 
1007  RAP_i[n_coarse] = jj_counter;
1008  RAP_size = jj_counter;
1009 #ifdef _OPENMP
1010  }
1011 #endif
1012 
1013  RAP_j = (INT *)fasp_mem_calloc(RAP_size, sizeof(INT));
1014  RAP_data = (REAL *)fasp_mem_calloc(RAP_size, sizeof(REAL));
1015 
1016  fasp_iarray_set(minus_one_length, Ps_marker, -1);
1017 
1018 #ifdef _OPENMP
1019  if (n_coarse > OPENMP_HOLDS) {
1020 #pragma omp parallel for private(myid, mybegin, myend, P_marker, A_marker, jj_counter, \
1021  ic, jj_row_begining, jj1, r_entry, i1, jj2, r_a_product,\
1022  i2, jj3, r_a_p_product, i3)
1023  for (myid = 0; myid < nthreads; myid ++) {
1024  FASP_GET_START_END(myid, nthreads, n_coarse, &mybegin, &myend);
1025  P_marker = Ps_marker + myid * n_coarse;
1026  A_marker = As_marker + myid * n_fine;
1027  jj_counter = RAP_i[mybegin];
1028  for (ic = mybegin; ic < myend; ic ++) {
1029  P_marker[ic] = jj_counter;
1030  jj_row_begining = jj_counter;
1031  RAP_j[jj_counter] = ic;
1032  RAP_data[jj_counter] = 0.0;
1033  jj_counter ++;
1034  for (jj1 = R_i[ic]; jj1 < R_i[ic+1]; jj1 ++) {
1035  r_entry = R_data[jj1];
1036 
1037  i1 = R_j[jj1];
1038  for (jj2 = A_i[i1]; jj2 < A_i[i1+1]; jj2 ++) {
1039  r_a_product = r_entry * A_data[jj2];
1040 
1041  i2 = A_j[jj2];
1042  if (A_marker[i2] != ic) {
1043  A_marker[i2] = ic;
1044  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1045  r_a_p_product = r_a_product * P_data[jj3];
1046 
1047  i3 = P_j[jj3];
1048  if (P_marker[i3] < jj_row_begining) {
1049  P_marker[i3] = jj_counter;
1050  RAP_data[jj_counter] = r_a_p_product;
1051  RAP_j[jj_counter] = i3;
1052  jj_counter ++;
1053  }
1054  else {
1055  RAP_data[P_marker[i3]] += r_a_p_product;
1056  }
1057  }
1058  }
1059  else {
1060  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1061  i3 = P_j[jj3];
1062  r_a_p_product = r_a_product * P_data[jj3];
1063  RAP_data[P_marker[i3]] += r_a_p_product;
1064  }
1065  }
1066  }
1067  }
1068  }
1069  }
1070  }
1071  else {
1072 #endif
1073  jj_counter = 0;
1074  for (ic = 0; ic < n_coarse; ic ++) {
1075  Ps_marker[ic] = jj_counter;
1076  jj_row_begining = jj_counter;
1077  RAP_j[jj_counter] = ic;
1078  RAP_data[jj_counter] = 0.0;
1079  jj_counter ++;
1080 
1081  for (jj1 = R_i[ic]; jj1 < R_i[ic+1]; jj1 ++) {
1082  r_entry = R_data[jj1];
1083 
1084  i1 = R_j[jj1];
1085  for (jj2 = A_i[i1]; jj2 < A_i[i1+1]; jj2 ++) {
1086  r_a_product = r_entry * A_data[jj2];
1087 
1088  i2 = A_j[jj2];
1089  if (As_marker[i2] != ic) {
1090  As_marker[i2] = ic;
1091  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1092  r_a_p_product = r_a_product * P_data[jj3];
1093 
1094  i3 = P_j[jj3];
1095  if (Ps_marker[i3] < jj_row_begining) {
1096  Ps_marker[i3] = jj_counter;
1097  RAP_data[jj_counter] = r_a_p_product;
1098  RAP_j[jj_counter] = i3;
1099  jj_counter ++;
1100  }
1101  else {
1102  RAP_data[Ps_marker[i3]] += r_a_p_product;
1103  }
1104  }
1105  }
1106  else {
1107  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1108  i3 = P_j[jj3];
1109  r_a_p_product = r_a_product * P_data[jj3];
1110  RAP_data[Ps_marker[i3]] += r_a_p_product;
1111  }
1112  }
1113  }
1114  }
1115  }
1116 #ifdef _OPENMP
1117  }
1118 #endif
1119 
1120  RAP->row = n_coarse;
1121  RAP->col = n_coarse;
1122  RAP->nnz = RAP_size;
1123  RAP->IA = RAP_i;
1124  RAP->JA = RAP_j;
1125  RAP->val = RAP_data;
1126 
1127  fasp_mem_free(Ps_marker);
1128 }
1129 
1149  dCSRmat *A,
1150  dCSRmat *P,
1151  dCSRmat *RAP)
1152 {
1153  INT n_coarse = R->row;
1154  INT *R_i = R->IA;
1155  INT *R_j = R->JA;
1156 
1157  INT n_fine = A->row;
1158  INT *A_i = A->IA;
1159  INT *A_j = A->JA;
1160  REAL *A_data = A->val;
1161 
1162  INT *P_i = P->IA;
1163  INT *P_j = P->JA;
1164 
1165  INT RAP_size;
1166  INT *RAP_i = NULL;
1167  INT *RAP_j = NULL;
1168  REAL *RAP_data = NULL;
1169 
1170 #ifdef _OPENMP
1171  INT *P_marker = NULL;
1172  INT *A_marker = NULL;
1173 #endif
1174 
1175  INT *Ps_marker = NULL;
1176  INT *As_marker = NULL;
1177 
1178  INT ic, i1, i2, i3, jj1, jj2, jj3;
1179  INT jj_counter, jj_row_begining;
1180 
1181  INT nthreads = 1;
1182 
1183 #ifdef _OPENMP
1184  INT myid, mybegin, myend, Ctemp;
1185  nthreads = FASP_GET_NUM_THREADS();
1186 #endif
1187 
1188  INT coarse_mul_nthreads = n_coarse * nthreads;
1189  INT fine_mul_nthreads = n_fine * nthreads;
1190  INT coarse_add_nthreads = n_coarse + nthreads;
1191  INT minus_one_length = coarse_mul_nthreads + fine_mul_nthreads;
1192  INT total_calloc = minus_one_length + coarse_add_nthreads + nthreads;
1193 
1194  Ps_marker = (INT *)fasp_mem_calloc(total_calloc, sizeof(INT));
1195  As_marker = Ps_marker + coarse_mul_nthreads;
1196 
1197  /*------------------------------------------------------*
1198  * First Pass: Determine size of RAP and set up RAP_i *
1199  *------------------------------------------------------*/
1200  RAP_i = (INT *)fasp_mem_calloc(n_coarse+1, sizeof(INT));
1201 
1202  fasp_iarray_set(minus_one_length, Ps_marker, -1);
1203 
1204 #ifdef _OPENMP
1205  INT * RAP_temp = As_marker + fine_mul_nthreads;
1206  INT * part_end = RAP_temp + coarse_add_nthreads;
1207 
1208  if (n_coarse > OPENMP_HOLDS) {
1209 #pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
1210  jj_counter, ic, jj_row_begining, jj1, i1, jj2, i2, jj3, i3)
1211  for (myid = 0; myid < nthreads; myid++) {
1212  FASP_GET_START_END(myid, nthreads, n_coarse, &mybegin, &myend);
1213  P_marker = Ps_marker + myid * n_coarse;
1214  A_marker = As_marker + myid * n_fine;
1215  jj_counter = 0;
1216  for (ic = mybegin; ic < myend; ic ++) {
1217  P_marker[ic] = jj_counter;
1218  jj_row_begining = jj_counter;
1219  jj_counter ++;
1220 
1221  for (jj1 = R_i[ic]; jj1 < R_i[ic+1]; jj1 ++) {
1222  i1 = R_j[jj1];
1223  for (jj2 = A_i[i1]; jj2 < A_i[i1+1]; jj2 ++) {
1224  i2 = A_j[jj2];
1225  if (A_marker[i2] != ic) {
1226  A_marker[i2] = ic;
1227  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1228  i3 = P_j[jj3];
1229  if (P_marker[i3] < jj_row_begining) {
1230  P_marker[i3] = jj_counter;
1231  jj_counter ++;
1232  }
1233  }
1234  }
1235  }
1236  }
1237 
1238  RAP_temp[ic+myid] = jj_row_begining;
1239  }
1240  RAP_temp[myend+myid] = jj_counter;
1241 
1242  part_end[myid] = myend + myid + 1;
1243  }
1244  fasp_iarray_cp(part_end[0], RAP_temp, RAP_i);
1245  jj_counter = part_end[0];
1246  Ctemp = 0;
1247  for (i1 = 1; i1 < nthreads; i1 ++) {
1248  Ctemp += RAP_temp[part_end[i1-1]-1];
1249  for (jj1 = part_end[i1-1]+1; jj1 < part_end[i1]; jj1 ++) {
1250  RAP_i[jj_counter] = RAP_temp[jj1] + Ctemp;
1251  jj_counter ++;
1252  }
1253  }
1254  RAP_size = RAP_i[n_coarse];
1255  }
1256 
1257  else {
1258 #endif
1259  jj_counter = 0;
1260  for (ic = 0; ic < n_coarse; ic ++) {
1261  Ps_marker[ic] = jj_counter;
1262  jj_row_begining = jj_counter;
1263  jj_counter ++;
1264 
1265  for (jj1 = R_i[ic]; jj1 < R_i[ic+1]; jj1 ++) {
1266  i1 = R_j[jj1];
1267 
1268  for (jj2 = A_i[i1]; jj2 < A_i[i1+1]; jj2 ++) {
1269  i2 = A_j[jj2];
1270  if (As_marker[i2] != ic) {
1271  As_marker[i2] = ic;
1272  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1273  i3 = P_j[jj3];
1274  if (Ps_marker[i3] < jj_row_begining) {
1275  Ps_marker[i3] = jj_counter;
1276  jj_counter ++;
1277  }
1278  }
1279  }
1280  }
1281  }
1282 
1283  RAP_i[ic] = jj_row_begining;
1284  }
1285 
1286  RAP_i[n_coarse] = jj_counter;
1287  RAP_size = jj_counter;
1288 #ifdef _OPENMP
1289  }
1290 #endif
1291 
1292  RAP_j = (INT *)fasp_mem_calloc(RAP_size, sizeof(INT));
1293  RAP_data = (REAL *)fasp_mem_calloc(RAP_size, sizeof(REAL));
1294 
1295  fasp_iarray_set(minus_one_length, Ps_marker, -1);
1296 
1297 #ifdef _OPENMP
1298  if (n_coarse > OPENMP_HOLDS) {
1299 #pragma omp parallel for private(myid, mybegin, myend, P_marker, A_marker, jj_counter, \
1300  ic, jj_row_begining, jj1, i1, jj2, i2, jj3, i3)
1301  for (myid = 0; myid < nthreads; myid ++) {
1302  FASP_GET_START_END(myid, nthreads, n_coarse, &mybegin, &myend);
1303  P_marker = Ps_marker + myid * n_coarse;
1304  A_marker = As_marker + myid * n_fine;
1305  jj_counter = RAP_i[mybegin];
1306  for (ic = mybegin; ic < myend; ic ++) {
1307  P_marker[ic] = jj_counter;
1308  jj_row_begining = jj_counter;
1309  RAP_j[jj_counter] = ic;
1310  RAP_data[jj_counter] = 0.0;
1311  jj_counter ++;
1312  for (jj1 = R_i[ic]; jj1 < R_i[ic+1]; jj1 ++) {
1313 
1314  i1 = R_j[jj1];
1315  for (jj2 = A_i[i1]; jj2 < A_i[i1+1]; jj2 ++) {
1316 
1317  i2 = A_j[jj2];
1318  if (A_marker[i2] != ic) {
1319  A_marker[i2] = ic;
1320  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1321 
1322  i3 = P_j[jj3];
1323  if (P_marker[i3] < jj_row_begining) {
1324  P_marker[i3] = jj_counter;
1325  RAP_data[jj_counter] = A_data[jj2];
1326  RAP_j[jj_counter] = i3;
1327  jj_counter ++;
1328  }
1329  else {
1330  RAP_data[P_marker[i3]] += A_data[jj2];
1331  }
1332  }
1333  }
1334  else {
1335  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1336  i3 = P_j[jj3];
1337  RAP_data[P_marker[i3]] += A_data[jj2];
1338  }
1339  }
1340  }
1341  }
1342  }
1343  }
1344  }
1345  else {
1346 #endif
1347  jj_counter = 0;
1348  for (ic = 0; ic < n_coarse; ic ++) {
1349  Ps_marker[ic] = jj_counter;
1350  jj_row_begining = jj_counter;
1351  RAP_j[jj_counter] = ic;
1352  RAP_data[jj_counter] = 0.0;
1353  jj_counter ++;
1354 
1355  for (jj1 = R_i[ic]; jj1 < R_i[ic+1]; jj1 ++) {
1356  i1 = R_j[jj1];
1357  for (jj2 = A_i[i1]; jj2 < A_i[i1+1]; jj2 ++) {
1358  i2 = A_j[jj2];
1359  if (As_marker[i2] != ic) {
1360  As_marker[i2] = ic;
1361  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1362  i3 = P_j[jj3];
1363  if (Ps_marker[i3] < jj_row_begining) {
1364  Ps_marker[i3] = jj_counter;
1365  RAP_data[jj_counter] = A_data[jj2];
1366  RAP_j[jj_counter] = i3;
1367  jj_counter ++;
1368  }
1369  else {
1370  RAP_data[Ps_marker[i3]] += A_data[jj2];
1371  }
1372  }
1373  }
1374  else {
1375  for (jj3 = P_i[i2]; jj3 < P_i[i2+1]; jj3 ++) {
1376  i3 = P_j[jj3];
1377  RAP_data[Ps_marker[i3]] += A_data[jj2];
1378  }
1379  }
1380  }
1381  }
1382  }
1383 #ifdef _OPENMP
1384  }
1385 #endif
1386 
1387  RAP->row = n_coarse;
1388  RAP->col = n_coarse;
1389  RAP->nnz = RAP_size;
1390  RAP->IA = RAP_i;
1391  RAP->JA = RAP_j;
1392  RAP->val = RAP_data;
1393 
1394  fasp_mem_free(Ps_marker);
1395 }
1396 
1414  dCSRmat *A,
1415  dCSRmat *P,
1416  dCSRmat *B)
1417 {
1418  const INT row=R->row, col=P->col;
1419  INT nB=A->nnz;
1420  INT i,i1,j,jj,k,length;
1421  INT begin_row,end_row,begin_rowA,end_rowA,begin_rowR,end_rowR;
1422  INT istart,iistart,count;
1423 
1424  REAL *aj=A->val, *acj;
1425  INT *ir=R->IA, *ia=A->IA, *ip=P->IA, *iac;
1426  INT *jr=R->JA, *ja=A->JA, *jp=P->JA, *jac;
1427 
1428  INT *index = (INT *)fasp_mem_calloc(A->col,sizeof(INT));
1429 
1430  INT *iindex = (INT *)fasp_mem_calloc(col,sizeof(INT));
1431 
1432  //for (i=0; i<A->col; ++i) index[i] = -2;
1433  fasp_iarray_set(A->col, index, -2);
1434 
1435  //memcpy(iindex,index,col*sizeof(INT));
1436  fasp_iarray_cp(col, index, iindex);
1437 
1438  jac = (INT*)fasp_mem_calloc(nB,sizeof(INT));
1439 
1440  iac = (INT*)fasp_mem_calloc(row+1,sizeof(INT));
1441 
1442  REAL *temp = (REAL*)fasp_mem_calloc(A->col,sizeof(REAL));
1443 
1444  iac[0] = 0;
1445 
1446  // First loop: form sparsity partern of R*A*P
1447  for (i=0; i < row; ++i) {
1448  // reset istart and length at the begining of each loop
1449  istart = -1; length = 0; i1 = i+1;
1450 
1451  // go across the rows in R
1452  begin_rowR=ir[i]; end_rowR=ir[i1];
1453  for (jj=begin_rowR; jj<end_rowR; ++jj) {
1454  j = jr[jj];
1455  // for each column in A
1456  begin_rowA=ia[j]; end_rowA=ia[j+1];
1457  for (k=begin_rowA; k<end_rowA; ++k) {
1458  if (index[ja[k]] == -2) {
1459  index[ja[k]] = istart;
1460  istart = ja[k];
1461  ++length;
1462  }
1463  }
1464  }
1465 
1466  // book-keeping [reseting length and setting iistart]
1467  count = length; iistart = -1; length = 0;
1468 
1469  // use each column that would have resulted from R*A
1470  for (j=0; j < count; ++j) {
1471  jj = istart;
1472  istart = index[istart];
1473  index[jj] = -2;
1474 
1475  // go across the row of P
1476  begin_row=ip[jj]; end_row=ip[jj+1];
1477  for (k=begin_row; k<end_row; ++k) {
1478  // pull out the appropriate columns of P
1479  if (iindex[jp[k]] == -2) {
1480  iindex[jp[k]] = iistart;
1481  iistart = jp[k];
1482  ++length;
1483  }
1484  } // end for k
1485  } // end for j
1486 
1487  // set B->IA
1488  iac[i1]=iac[i]+length;
1489 
1490  if (iac[i1]>nB) {
1491  nB=nB*2;
1492  jac=(INT*)fasp_mem_realloc(jac, nB*sizeof(INT));
1493  }
1494 
1495  // put the correct columns of p into the column list of the products
1496  begin_row=iac[i]; end_row=iac[i1];
1497  for (j=begin_row; j<end_row; ++j) {
1498  // put the value in B->JA
1499  jac[j] = iistart;
1500  // set istart to the next value
1501  iistart = iindex[iistart];
1502  // set the iindex spot to 0
1503  iindex[jac[j]] = -2;
1504  } // end j
1505 
1506  } // end i: First loop
1507 
1508  jac=(INT*)fasp_mem_realloc(jac,(iac[row])*sizeof(INT));
1509 
1510  acj=(REAL*)fasp_mem_calloc(iac[row],sizeof(REAL));
1511 
1512  INT *BTindex=(INT*)fasp_mem_calloc(col,sizeof(INT));
1513 
1514  // Second loop: compute entries of R*A*P
1515  for (i=0; i<row; ++i) {
1516  i1 = i+1;
1517 
1518  // each col of B
1519  begin_row=iac[i]; end_row=iac[i1];
1520  for (j=begin_row; j<end_row; ++j) {
1521  BTindex[jac[j]]=j;
1522  }
1523 
1524  // reset istart and length at the beginning of each loop
1525  istart = -1; length = 0;
1526 
1527  // go across the rows in R
1528  begin_rowR=ir[i]; end_rowR=ir[i1];
1529  for ( jj=begin_rowR; jj<end_rowR; ++jj ) {
1530  j = jr[jj];
1531 
1532  // for each column in A
1533  begin_rowA=ia[j]; end_rowA=ia[j+1];
1534  for (k=begin_rowA; k<end_rowA; ++k) {
1535  if (index[ja[k]] == -2) {
1536  index[ja[k]] = istart;
1537  istart = ja[k];
1538  ++length;
1539  }
1540  temp[ja[k]]+=aj[k];
1541  }
1542  }
1543 
1544  // book-keeping [resetting length and setting iistart]
1545  // use each column that would have resulted from R*A
1546  for (j=0; j<length; ++j) {
1547  jj = istart;
1548  istart = index[istart];
1549  index[jj] = -2;
1550 
1551  // go across the row of P
1552  begin_row=ip[jj]; end_row=ip[jj+1];
1553  for (k=begin_row; k<end_row; ++k) {
1554  // pull out the appropriate columns of P
1555  acj[BTindex[jp[k]]]+=temp[jj];
1556  }
1557  temp[jj]=0.0;
1558  }
1559 
1560  } // end for i: Second loop
1561 
1562  // setup coarse matrix B
1563  B->row=row; B->col=col;
1564  B->IA=iac; B->JA=jac; B->val=acj;
1565  B->nnz=B->IA[B->row]-B->IA[0];
1566 
1567  fasp_mem_free(temp);
1568  fasp_mem_free(index);
1569  fasp_mem_free(iindex);
1570  fasp_mem_free(BTindex);
1571 }
1572 
1597  dCSRmat *A,
1598  dCSRmat *P,
1599  dCSRmat *Ac)
1600 {
1601  const INT nc=Pt->row, n=Pt->col, nnzP=P->nnz, nnzA=A->nnz;
1602  INT i, maxrpout;
1603 
1604  // shift A from usual to ltz format
1605 #ifdef _OPENMP
1606 #pragma omp parallel for if(n>OPENMP_HOLDS)
1607 #endif
1608  for (i=0;i<=n;++i) { A->IA[i]++; P->IA[i]++; }
1609 
1610 #ifdef _OPENMP
1611 #pragma omp parallel for if(nnzA>OPENMP_HOLDS)
1612 #endif
1613  for (i=0;i<nnzA;++i) { A->JA[i]++; }
1614 
1615 #ifdef _OPENMP
1616 #pragma omp parallel for if(nc>OPENMP_HOLDS)
1617 #endif
1618  for (i=0;i<=nc;++i) { Pt->IA[i]++; }
1619 
1620 #ifdef _OPENMP
1621 #pragma omp parallel for if(nnzP>OPENMP_HOLDS)
1622 #endif
1623  for (i=0;i<nnzP;++i) { P->JA[i]++; Pt->JA[i]++; }
1624 
1625  // compute P' A P
1626  dCSRmat PtAP = fasp_blas_dcsr_rap2(Pt->IA,Pt->JA,Pt->val,A->IA,A->JA,A->val,
1627  Pt->IA,Pt->JA,Pt->val,n,nc,&maxrpout,
1628  P->IA,P->JA);
1629 
1630  Ac->row = PtAP.row;
1631  Ac->col = PtAP.col;
1632  Ac->nnz = PtAP.nnz;
1633  Ac->IA = PtAP.IA;
1634  Ac->JA = PtAP.JA;
1635  Ac->val = PtAP.val;
1636 
1637  // shift A back from ltz format
1638 #ifdef _OPENMP
1639 #pragma omp parallel for if(Ac->row>OPENMP_HOLDS)
1640 #endif
1641  for (i=0;i<=Ac->row;++i) Ac->IA[i]--;
1642 
1643 #ifdef _OPENMP
1644 #pragma omp parallel for if(Ac->nnz>OPENMP_HOLDS)
1645 #endif
1646  for (i=0;i< Ac->nnz;++i) Ac->JA[i]--;
1647 
1648 #ifdef _OPENMP
1649 #pragma omp parallel for if(n>OPENMP_HOLDS)
1650 #endif
1651  for (i=0;i<=n;++i) A->IA[i]--;
1652 
1653 #ifdef _OPENMP
1654 #pragma omp parallel for if(nnzA>OPENMP_HOLDS)
1655 #endif
1656  for (i=0;i<nnzA;++i) A->JA[i]--;
1657 
1658 #ifdef _OPENMP
1659 #pragma omp parallel for if(n>OPENMP_HOLDS)
1660 #endif
1661  for (i=0;i<=n;++i) P->IA[i]--;
1662 
1663 #ifdef _OPENMP
1664 #pragma omp parallel for if(nnzP>OPENMP_HOLDS)
1665 #endif
1666  for (i=0;i<nnzP;++i) P->JA[i]--;
1667 
1668 #ifdef _OPENMP
1669 #pragma omp parallel for if(nc>OPENMP_HOLDS)
1670 #endif
1671  for (i=0;i<=nc;++i) Pt->IA[i]--;
1672 
1673 #ifdef _OPENMP
1674 #pragma omp parallel for if(nnzP>OPENMP_HOLDS)
1675 #endif
1676  for (i=0;i<nnzP;++i) Pt->JA[i]--;
1677 
1678  return;
1679 }
1680 
1699  dCSRmat *A,
1700  dCSRmat *P,
1701  dCSRmat *B,
1702  INT *icor_ysk)
1703 {
1704  INT nthreads = 1, use_openmp = FALSE;
1705 
1706 #ifdef _OPENMP
1707  if ( R->row > OPENMP_HOLDS ) {
1708  use_openmp = TRUE;
1709  nthreads = FASP_GET_NUM_THREADS();
1710  }
1711 #endif
1712 
1713  if (use_openmp) {
1714  const INT row=R->row, col=P->col;
1715  INT *ir=R->IA, *ia=A->IA, *ip=P->IA;
1716  INT *jr=R->JA, *ja=A->JA, *jp=P->JA;
1717  REAL *rj=R->val, *aj=A->val, *pj=P->val;
1718  INT istart, iistart;
1719  INT end_row, end_rowA, end_rowR;
1720  INT i, j, jj, k, length, myid, mybegin, myend;
1721  INT jj_counter, ic, jj_row_begining, jj1, i1, jj2, i2, jj3, i3;
1722  INT *index = NULL;
1723  INT *iindex = NULL;
1724  INT *BTindex = NULL;
1725  REAL *temp = NULL;
1726  INT FiveMyid, min_A, min_P, A_pos, P_pos, FiveIc;
1727  INT minus_one_length_A = icor_ysk[5*nthreads];
1728  INT minus_one_length_P = icor_ysk[5*nthreads+1];
1729  INT minus_one_length = minus_one_length_A + minus_one_length_P;
1730 
1731  INT *iindexs = (INT *)fasp_mem_calloc(minus_one_length+minus_one_length_P, sizeof(INT));
1732 
1733 #if CHMEM_MODE
1734  total_alloc_mem += minus_one_length*sizeof(INT);
1735 #endif
1736  INT *indexs = iindexs + minus_one_length_P;
1737  INT *BTindexs = indexs + minus_one_length_A;
1738 
1739  INT *iac=(INT*)fasp_mem_calloc(row+1,sizeof(INT));
1740 
1741 #if CHMEM_MODE
1742  total_alloc_mem += (row+1)*sizeof(INT);
1743 #endif
1744 
1745  INT *part_end=(INT*)fasp_mem_calloc(2*nthreads+row,sizeof(INT));
1746 
1747 #if CHMEM_MODE
1748  total_alloc_mem += (2*nthreads+row)*sizeof(INT);
1749 #endif
1750 
1751  INT *iac_temp=part_end + nthreads;
1752  INT **iindex_array = (INT **)fasp_mem_calloc(nthreads, sizeof(INT *));
1753  INT **index_array = (INT **)fasp_mem_calloc(nthreads, sizeof(INT *));
1754 
1755  fasp_iarray_set(minus_one_length, iindexs, -2);
1756 
1757 #ifdef _OPENMP
1758 #pragma omp parallel for private(myid, FiveMyid, mybegin, myend, min_A, min_P, index, \
1759  iindex, A_pos, P_pos, ic, FiveIc, jj_counter, \
1760  jj_row_begining, end_rowR, jj1, i1, end_rowA, jj2, i2,\
1761  end_row, jj3, i3)
1762 #endif
1763  for (myid = 0; myid < nthreads; myid ++) {
1764  FiveMyid = myid * 5;
1765  mybegin = icor_ysk[FiveMyid];
1766  if (myid == nthreads-1) {
1767  myend = row;
1768  }
1769  else {
1770  myend = icor_ysk[FiveMyid+5];
1771  }
1772  min_A = icor_ysk[FiveMyid+2];
1773  min_P = icor_ysk[FiveMyid+4];
1774  A_pos = 0;
1775  P_pos = 0;
1776  for (ic = myid-1; ic >= 0; ic --) {
1777  FiveIc = ic * 5;
1778  A_pos += icor_ysk[FiveIc+1];
1779  P_pos += icor_ysk[FiveIc+3];
1780  }
1781  iindex_array[myid] = iindex= iindexs + P_pos - min_P;
1782  index_array[myid] = index = indexs + A_pos - min_A;
1783  jj_counter = 0;
1784  for (ic = mybegin; ic < myend; ic ++) {
1785  iindex[ic] = jj_counter;
1786  jj_row_begining = jj_counter;
1787  jj_counter ++;
1788  end_rowR = ir[ic+1];
1789  for (jj1 = ir[ic]; jj1 < end_rowR; jj1 ++) {
1790  i1 = jr[jj1];
1791  end_rowA = ia[i1+1];
1792  for (jj2 = ia[i1]; jj2 < end_rowA; jj2 ++) {
1793  i2 = ja[jj2];
1794  if (index[i2] != ic) {
1795  index[i2] = ic;
1796  end_row = ip[i2+1];
1797  for (jj3 = ip[i2]; jj3 < end_row; jj3 ++) {
1798  i3 = jp[jj3];
1799  if (iindex[i3] < jj_row_begining) {
1800  iindex[i3] = jj_counter;
1801  jj_counter ++;
1802  }
1803  }
1804  }
1805  }
1806  }
1807  iac_temp[ic+myid] = jj_row_begining;
1808  }
1809  iac_temp[myend+myid] = jj_counter;
1810  part_end[myid] = myend + myid + 1;
1811  }
1812  fasp_iarray_cp(part_end[0], iac_temp, iac);
1813  jj_counter = part_end[0];
1814  INT Ctemp = 0;
1815  for (i1 = 1; i1 < nthreads; i1 ++) {
1816  Ctemp += iac_temp[part_end[i1-1]-1];
1817  for (jj1 = part_end[i1-1]+1; jj1 < part_end[i1]; jj1 ++) {
1818  iac[jj_counter] = iac_temp[jj1] + Ctemp;
1819  jj_counter ++;
1820  }
1821  }
1822  INT *jac=(INT*)fasp_mem_calloc(iac[row],sizeof(INT));
1823 #if CHMEM_MODE
1824  total_alloc_mem += iac[row]*sizeof(INT);
1825 #endif
1826  fasp_iarray_set(minus_one_length, iindexs, -2);
1827 #ifdef _OPENMP
1828 #pragma omp parallel for private(myid, index, iindex, FiveMyid, mybegin, myend, i, istart,\
1829  length, i1, end_rowR, jj, j, end_rowA, k, iistart, end_row)
1830 #endif
1831  for (myid = 0; myid < nthreads; myid ++) {
1832  iindex = iindex_array[myid];
1833  index = index_array[myid];
1834  FiveMyid = myid * 5;
1835  mybegin = icor_ysk[FiveMyid];
1836  if (myid == nthreads-1) {
1837  myend = row;
1838  }
1839  else {
1840  myend = icor_ysk[FiveMyid+5];
1841  }
1842  for (i = mybegin; i < myend; ++ i) {
1843  istart = -1;
1844  length = 0;
1845  i1 = i+1;
1846  // go across the rows in R
1847  end_rowR = ir[i1];
1848  for (jj = ir[i]; jj < end_rowR; ++ jj) {
1849  j = jr[jj];
1850  // for each column in A
1851  end_rowA = ia[j+1];
1852  for (k = ia[j]; k < end_rowA; ++ k) {
1853  if (index[ja[k]] == -2) {
1854  index[ja[k]] = istart;
1855  istart = ja[k];
1856  ++ length;
1857  }
1858  }
1859  }
1860  // book-keeping [resetting length and setting iistart]
1861  //count = length;
1862  iistart = -1;
1863  //length = 0;
1864  // use each column that would have resulted from R*A
1865  //for (j = 0; j < count; ++ j) {
1866  for (j = 0; j < length; ++ j) {
1867  jj = istart;
1868  istart = index[istart];
1869  index[jj] = -2;
1870  // go across the row of P
1871  end_row = ip[jj+1];
1872  for (k = ip[jj]; k < end_row; ++ k) {
1873  // pull out the appropriate columns of P
1874  if (iindex[jp[k]] == -2) {
1875  iindex[jp[k]] = iistart;
1876  iistart = jp[k];
1877  //++length;
1878  }
1879  } // end for k
1880  } // end for j
1881  // put the correct columns of p into the column list of the products
1882  end_row = iac[i1];
1883  for (j = iac[i]; j < end_row; ++ j) {
1884  // put the value in B->JA
1885  jac[j] = iistart;
1886  // set istart to the next value
1887  iistart = iindex[iistart];
1888  // set the iindex spot to 0
1889  iindex[jac[j]] = -2;
1890  } // end j
1891  }
1892  }
1893  // Third loop: compute entries of R*A*P
1894  REAL *acj=(REAL*)fasp_mem_calloc(iac[row],sizeof(REAL));
1895 #if CHMEM_MODE
1896  total_alloc_mem += iac[row]*sizeof(REAL);
1897 #endif
1898  REAL *temps=(REAL*)fasp_mem_calloc(minus_one_length_A,sizeof(REAL));
1899 #if CHMEM_MODE
1900  total_alloc_mem += minus_one_length_A*sizeof(REAL);
1901 #endif
1902 
1903 #ifdef _OPENMP
1904 #pragma omp parallel for private(myid, index, FiveMyid, mybegin, myend, min_A, min_P, \
1905  A_pos, P_pos, ic, FiveIc, BTindex, temp, i, i1, end_row,\
1906  j, istart, length, end_rowR, jj, end_rowA, k)
1907 #endif
1908  for (myid = 0; myid < nthreads; myid ++) {
1909  index = index_array[myid];
1910  FiveMyid = myid * 5;
1911  mybegin = icor_ysk[FiveMyid];
1912  if (myid == nthreads-1) {
1913  myend = row;
1914  }
1915  else {
1916  myend = icor_ysk[FiveMyid+5];
1917  }
1918  min_A = icor_ysk[FiveMyid+2];
1919  min_P = icor_ysk[FiveMyid+4];
1920  A_pos = 0;
1921  P_pos = 0;
1922  for (ic = myid-1; ic >= 0; ic --) {
1923  FiveIc = ic * 5;
1924  A_pos += icor_ysk[FiveIc+1];
1925  P_pos += icor_ysk[FiveIc+3];
1926  }
1927  BTindex = BTindexs + P_pos - min_P;
1928  temp = temps + A_pos - min_A;
1929  for (i = mybegin; i < myend; ++ i) {
1930  i1 = i+1;
1931  // each col of B
1932  end_row = iac[i1];
1933  for (j = iac[i]; j < end_row; ++ j) {
1934  BTindex[jac[j]] = j;
1935  }
1936  // reset istart and length at the beginning of each loop
1937  istart = -1;
1938  length = 0;
1939  // go across the rows in R
1940  end_rowR = ir[i1];
1941  for (jj = ir[i]; jj < end_rowR; ++ jj) {
1942  j = jr[jj];
1943  // for each column in A
1944  end_rowA = ia[j+1];
1945  for (k = ia[j]; k < end_rowA; ++ k) {
1946  if (index[ja[k]] == -2) {
1947  index[ja[k]] = istart;
1948  istart = ja[k];
1949  ++ length;
1950  }
1951  temp[ja[k]] += rj[jj]*aj[k];
1952  }
1953  }
1954  // book-keeping [resetting length and setting iistart]
1955  // use each column that would have resulted from R*A
1956  for (j = 0; j < length; ++ j) {
1957  jj = istart;
1958  istart = index[istart];
1959  index[jj] = -2;
1960  // go across the row of P
1961  end_row = ip[jj+1];
1962  for (k = ip[jj]; k < end_row; ++ k) {
1963  // pull out the appropriate columns of P
1964  acj[BTindex[jp[k]]]+=temp[jj]*pj[k];
1965  }
1966  temp[jj]=0.0;
1967  }
1968  }
1969  }
1970  // setup coarse matrix B
1971  B->row = row;
1972  B->col = col;
1973  B->IA = iac;
1974  B->JA = jac;
1975  B->val = acj;
1976  B->nnz = B->IA[B->row] - B->IA[0];
1977  fasp_mem_free(temps);
1978  fasp_mem_free(iindexs);
1979  fasp_mem_free(part_end);
1980  fasp_mem_free(iindex_array);
1981  fasp_mem_free(index_array);
1982  }
1983  else {
1984  fasp_blas_dcsr_rap (R, A, P, B);
1985  }
1986 }
1987 
2000  INT *bndwith)
2001 {
2002  INT row = A->row;
2003  INT *ia = A->IA;
2004 
2005  INT i, max;
2006  max = 0;
2007 
2008  for (i=0; i<row; ++i) max = MAX(max, ia[i+1]-ia[i]);
2009 
2010  *bndwith = max;
2011 }
2012 
2013 /*---------------------------------*/
2014 /*-- End of File --*/
2015 /*---------------------------------*/
#define TRUE
Definition of logic type.
Definition: fasp_const.h:67
void fasp_blas_dcsr_rap(dCSRmat *R, dCSRmat *A, dCSRmat *P, dCSRmat *RAP)
Triple sparse matrix multiplication B=R*A*P.
Definition: blas_csr.c:866
void fasp_blas_dcsr_ptap(dCSRmat *Pt, dCSRmat *A, dCSRmat *P, dCSRmat *Ac)
Triple sparse matrix multiplication B=P'*A*P.
Definition: blas_csr.c:1596
INT * JA
integer array of column indexes, the size is nnz
Definition: fasp.h:163
#define REAL
Definition: fasp.h:67
void fasp_dcsr_alloc(const INT m, const INT n, const INT nnz, dCSRmat *A)
Allocate CSR sparse matrix memory space.
Definition: sparse_csr.c:125
dCSRmat fasp_blas_dcsr_rap2(INT *ir, INT *jr, REAL *r, INT *ia, INT *ja, REAL *a, INT *ipt, INT *jpt, REAL *pt, INT n, INT nc, INT *maxrpout, INT *ipin, INT *jpin)
Compute R*A*P.
Definition: rap.c:33
REAL * val
nonzero entries of A
Definition: fasp.h:166
void fasp_blas_dcsr_aAxpy_agg(const REAL alpha, dCSRmat *A, REAL *x, REAL *y)
Matrix-vector multiplication y = alpha*A*x + y (the entries of A are all ones)
Definition: blas_csr.c:593
void * fasp_mem_calloc(LONGLONG size, INT type)
1M = 1024*1024
Definition: memory.c:60
#define INT
Definition: fasp.h:64
void fasp_blas_dcsr_rap_agg(dCSRmat *R, dCSRmat *A, dCSRmat *P, dCSRmat *RAP)
Triple sparse matrix multiplication B=R*A*P.
Definition: blas_csr.c:1148
#define FASP_SUCCESS
Definition of return status and error messages.
Definition: fasp_const.h:27
void FASP_GET_START_END(INT procid, INT nprocs, INT n, INT *start, INT *end)
Assign Load to each thread.
Definition: threads.c:83
INT fasp_blas_dcsr_add(dCSRmat *A, const REAL alpha, dCSRmat *B, const REAL beta, dCSRmat *C)
compute C = alpha*A + beta*B in CSR format
Definition: blas_csr.c:48
void fasp_mem_free(void *mem)
Free up previous allocated memory body.
Definition: memory.c:150
void fasp_blas_dcsr_axm(dCSRmat *A, const REAL alpha)
Multiply a sparse matrix A in CSR format by a scalar alpha.
Definition: blas_csr.c:201
void fasp_iarray_cp(const INT n, INT *x, INT *y)
Copy an array to the other y=x.
Definition: array.c:185
#define OPENMP_HOLDS
Definition: fasp_const.h:248
INT nnz
number of nonzero entries
Definition: fasp.h:157
INT col
column of matrix A, n
Definition: fasp.h:154
Main header file for FASP.
void * fasp_mem_realloc(void *oldmem, LONGLONG tsize)
Reallocate, initiate, and check memory.
Definition: memory.c:110
INT row
row number of matrix A, m
Definition: fasp.h:151
Sparse matrix of REAL type in CSR format.
Definition: fasp.h:148
#define ERROR_DATA_STRUCTURE
Definition: fasp_const.h:38
INT count
void fasp_blas_dcsr_mxm(dCSRmat *A, dCSRmat *B, dCSRmat *C)
Sparse matrix multiplication C=A*B.
Definition: blas_csr.c:759
unsigned INT total_alloc_mem
Definition: memory.c:34
void fasp_blas_array_ax(const INT n, const REAL a, REAL *x)
x = a*x
Definition: blas_array.c:35
void fasp_blas_dcsr_aAxpy(const REAL alpha, dCSRmat *A, REAL *x, REAL *y)
Matrix-vector multiplication y = alpha*A*x + y.
Definition: blas_csr.c:479
void fasp_blas_dcsr_rap4(dCSRmat *R, dCSRmat *A, dCSRmat *P, dCSRmat *B, INT *icor_ysk)
Triple sparse matrix multiplication B=R*A*P.
Definition: blas_csr.c:1698
void fasp_blas_dcsr_mxv_agg(dCSRmat *A, REAL *x, REAL *y)
Matrix-vector multiplication y = A*x, where the entries of A are all ones.
Definition: blas_csr.c:423
#define FALSE
Definition: fasp_const.h:68
INT * IA
integer array of row pointers, the size is m+1
Definition: fasp.h:160
void fasp_blas_dcsr_rap_agg1(dCSRmat *R, dCSRmat *A, dCSRmat *P, dCSRmat *B)
Triple sparse matrix multiplication B=R*A*P (nonzero entries of R and P are ones) ...
Definition: blas_csr.c:1413
void fasp_iarray_set(const INT n, INT *x, const INT val)
Set initial value for an array to be x=val.
Definition: array.c:107
void fasp_blas_dcsr_bandwith(dCSRmat *A, INT *bndwith)
Get bandwith of matrix.
Definition: blas_csr.c:1999
#define MAX(a, b)
Definition of max, min, abs.
Definition: fasp.h:72
REAL fasp_blas_dcsr_vmv(dCSRmat *A, REAL *x, REAL *y)
vector-Matrix-vector multiplication alpha = y'*A*x
Definition: blas_csr.c:704
void fasp_blas_dcsr_mxv(dCSRmat *A, REAL *x, REAL *y)
Matrix-vector multiplication y = A*x.
Definition: blas_csr.c:225