SCALE-RM
scale_matrix.F90
Go to the documentation of this file.
1 !-------------------------------------------------------------------------------
9 !-------------------------------------------------------------------------------
10 #ifndef _OPENACC
11 #ifdef USE_CUDALIB
12 #undef USE_CUDALIB
13 #endif
14 #endif
15 
16 #include "scalelib.h"
18  !-----------------------------------------------------------------------------
19  !
20  !++ used modules
21  !
22  use scale_precision
23  use scale_io
24  use scale_prof
25 #ifdef USE_CUDALIB
26  use cusparse
27 #endif
28  !-----------------------------------------------------------------------------
29  implicit none
30  private
31  !-----------------------------------------------------------------------------
32  !
33  !++ Public procedure
34  !
35  public :: matrix_setup
36  public :: matrix_finalize
37 
38  public :: matrix_solver_tridiagonal
42 
43  interface matrix_solver_tridiagonal
44 #ifdef _OPENACC
45  module procedure matrix_solver_tridiagonal_1d_cr
46 #else
47  module procedure matrix_solver_tridiagonal_1d_ta
48 #endif
49  module procedure matrix_solver_tridiagonal_2d
50  module procedure matrix_solver_tridiagonal_2d_block
51  module procedure matrix_solver_tridiagonal_3d
52  end interface matrix_solver_tridiagonal
53 
55 
56  !-----------------------------------------------------------------------------
57  !
58  !++ Public parameters & variables
59  !
60  !-----------------------------------------------------------------------------
61  !
62  !++ Private procedure
63  !
64  !-----------------------------------------------------------------------------
65  !
66  !++ Private parameters & variables
67  !
68 #ifdef USE_CUDALIB
69  type(cusparseHandle) :: handle
70  real(RP), allocatable :: pbuffer(:)
71  !$acc declare device_resident(pbuffer)
72  integer(8) :: bufsize
73  integer :: status
74 #endif
75  !-----------------------------------------------------------------------------
76 contains
77  !-----------------------------------------------------------------------------
79  subroutine matrix_setup
80  use scale_prc, only: &
81  prc_abort
82  implicit none
83 
84 ! namelist /PARAM_MATRIX/ &
85 
86  integer :: ierr
87  !---------------------------------------------------------------------------
88 
89  log_newline
90  log_info("MATRIX_setup",*) 'Setup'
91 
92  !--- read namelist
93 !!$ rewind(IO_FID_CONF)
94 !!$ read(IO_FID_CONF,nml=PARAM_MATRIX,iostat=ierr)
95 !!$ if( ierr < 0 ) then !--- missing
96 !!$ LOG_INFO("MATRIX_setup",*) 'Not found namelist. Default used.'
97 !!$ elseif( ierr > 0 ) then !--- fatal error
98 !!$ LOG_ERROR("MATRIX_setup",*) 'Not appropriate names in namelist PARAM_MATRIX. Check!'
99 !!$ call PRC_abort
100 !!$ endif
101 !!$ LOG_NML(PARAM_MATRIX)
102 
103 #ifdef USE_CUDALIB
104  status = cusparsecreate(handle)
105  if ( status /= cusparse_status_success ) then
106  log_error("MATRIX_setup",*) "cusparseCreate failed: ", status
107  call prc_abort
108  end if
109  bufsize = -1
110 #endif
111  return
112  end subroutine matrix_setup
113 
114  !-----------------------------------------------------------------------------
116  subroutine matrix_finalize
117 
118 #ifdef USE_CUDALIB
119  status = cusparsedestroy(handle)
120  if ( allocated(pbuffer) ) deallocate(pbuffer)
121  bufsize = -1
122 #endif
123 
124  return
125  end subroutine matrix_finalize
126 
127  !-----------------------------------------------------------------------------
129 !OCL SERIAL
130  subroutine matrix_solver_tridiagonal_1d_ta( &
131  KA, KS, KE, &
132 #ifdef _OPENACC
133  work, &
134 #endif
135  ud, md, ld, &
136  iv, &
137  ov )
138  !$acc routine seq
139  implicit none
140  integer, intent(in) :: ka, ks, ke
141 
142  real(rp), intent(in) :: ud(ka) ! upper diagonal
143  real(rp), intent(in) :: md(ka) ! middle diagonal
144  real(rp), intent(in) :: ld(ka) ! lower diagonal
145  real(rp), intent(in) :: iv(ka) ! input vector
146 
147  real(rp), intent(out) :: ov(ka) ! output vector
148 
149 #ifdef _OPENACC
150  real(rp), intent(out) :: work(ks:ke,2)
151 #define c_ta(k) work(k,1)
152 #define d_ta(k) work(k,2)
153 #else
154  real(rp) :: c_ta(ks:ke)
155  real(rp) :: d_ta(ks:ke)
156 #endif
157  real(rp) :: rdenom
158 
159  integer :: k
160  !---------------------------------------------------------------------------
161 
162  ! foward reduction
163  c_ta(ks) = ud(ks) / md(ks)
164  d_ta(ks) = iv(ks) / md(ks)
165  do k = ks+1, ke-1
166  rdenom = 1.0_rp / ( md(k) - ld(k) * c_ta(k-1) )
167  c_ta(k) = ud(k) * rdenom
168  d_ta(k) = ( iv(k) - ld(k) * d_ta(k-1) ) * rdenom
169  enddo
170 
171  ! backward substitution
172  ov(ke) = ( iv(ke) - ld(ke) * d_ta(ke-1) ) / ( md(ke) - ld(ke) * c_ta(ke-1) )
173  do k = ke-1, ks, -1
174  ov(k) = d_ta(k) - c_ta(k) * ov(k+1)
175  enddo
176 
177  return
178  end subroutine matrix_solver_tridiagonal_1d_ta
179 
180  !-----------------------------------------------------------------------------
182 !OCL SERIAL
183  subroutine matrix_solver_tridiagonal_1d_cr( &
184  KA, KS, KE, &
185 #ifdef _OPENACC
186  work, &
187 #endif
188  ud, md, ld, &
189  iv, &
190  ov )
191  !$acc routine vector
192  implicit none
193  integer, intent(in) :: ka, ks, ke
194 
195  real(rp), intent(in) :: ud(ka) ! upper diagonal
196  real(rp), intent(in) :: md(ka) ! middle diagonal
197  real(rp), intent(in) :: ld(ka) ! lower diagonal
198  real(rp), intent(in) :: iv(ka) ! input vector
199 
200  real(rp), intent(out) :: ov(ka) ! output vector
201 
202 #ifdef _OPENACC
203  real(rp), intent(out) :: work(ke-ks+1,4)
204 #define a1_cr(k) work(k,1)
205 #define b1_cr(k) work(k,2)
206 #define c1_cr(k) work(k,3)
207 #define x1_cr(k) work(k,4)
208 #else
209  real(rp) :: a1_cr(ke-ks+1)
210  real(rp) :: b1_cr(ke-ks+1)
211  real(rp) :: c1_cr(ke-ks+1)
212  real(rp) :: x1_cr(ke-ks+1)
213 #endif
214  real(rp) :: f1, f2
215  integer :: st
216  integer :: lmax, kmax
217  integer :: k, k1, k2, l
218 
219  kmax = ke - ks + 1
220  lmax = floor( log(real(kmax,rp)) / log(2.0_rp) ) - 1
221 
222  a1_cr(:) = ld(ks:ke)
223  b1_cr(:) = md(ks:ke)
224  c1_cr(:) = ud(ks:ke)
225  x1_cr(:) = iv(ks:ke)
226 
227  st = 1
228  do l = 1, lmax
229  !$omp parallel do private(k1,k2,f1,f2)
230  !$acc loop private(k1,k2,f1,f2)
231  do k = st*2, kmax, st*2
232  k1 = k - st
233  k2 = k + st
234  f1 = a1_cr(k) / b1_cr(k1)
235  if ( k2 > kmax ) then
236  k2 = k1 ! dummy
237  f2 = 0.0_rp
238  else
239  f2 = c1_cr(k) / b1_cr(k2)
240  end if
241  a1_cr(k) = - a1_cr(k1) * f1
242  c1_cr(k) = - c1_cr(k2) * f2
243  b1_cr(k) = b1_cr(k) - c1_cr(k1) * f1 - a1_cr(k2) * f2
244  x1_cr(k) = x1_cr(k) - x1_cr(k1) * f1 - x1_cr(k2) * f2
245  end do
246  st = st * 2
247  end do
248  if ( kmax / st == 2 ) then
249  ov(ks+st*2-1) = ( a1_cr(st*2) * x1_cr(st) - b1_cr(st) * x1_cr(st*2) ) &
250  / ( a1_cr(st*2) * c1_cr(st) - b1_cr(st) * b1_cr(st*2) )
251  ov(ks+st-1) = ( x1_cr(st) - c1_cr(st) * ov(ks+st*2-1) ) / b1_cr(st)
252  else if ( kmax / st == 3 ) then
253  k = st * 2
254  k1 = st * 2 - st
255  k2 = st * 2 + st
256  f2 = c1_cr(k1) / b1_cr(k)
257  c1_cr(k1) = - c1_cr(k) * f2
258  b1_cr(k1) = b1_cr(k1) - a1_cr(k) * f2
259  x1_cr(k1) = x1_cr(k1) - x1_cr(k) * f2
260 
261  f1 = a1_cr(k2) / b1_cr(k)
262  a1_cr(k2) = - a1_cr(k) * f1
263  b1_cr(k2) = b1_cr(k2) - c1_cr(k) * f1
264  x1_cr(k2) = x1_cr(k2) - x1_cr(k) * f1
265 
266  ov(ks+k2-1) = ( a1_cr(k2) * x1_cr(k1) - b1_cr(k1) * x1_cr(k2) ) &
267  / ( a1_cr(k2) * c1_cr(k1) - b1_cr(k1) * b1_cr(k2) )
268  ov(ks+k1-1) = ( x1_cr(k1) - c1_cr(k1) * ov(ks+k2-1) ) / b1_cr(k1)
269  ov(ks+k-1) = ( x1_cr(k) - a1_cr(k) * ov(ks+k1-1) - c1_cr(k) * ov(ks+k2-1) ) / b1_cr(k)
270  end if
271 
272  do l = 1, lmax
273  st = st / 2
274  !$omp parallel do
275  !$acc loop independent
276  do k = st, kmax, st*2
277  if ( k-st < 1 ) then
278  ov(ks+k-1) = ( x1_cr(k) - c1_cr(k) * ov(ks+k+st-1) ) / b1_cr(k)
279  elseif ( k+st <= kmax ) then
280  ov(ks+k-1) = ( x1_cr(k) - a1_cr(k) * ov(ks+k-st-1) - c1_cr(k) * ov(ks+k+st-1) ) / b1_cr(k)
281  else
282  ov(ks+k-1) = ( x1_cr(k) - a1_cr(k) * ov(ks+k-st-1) ) / b1_cr(k)
283  end if
284  end do
285  end do
286 
287  return
288  end subroutine matrix_solver_tridiagonal_1d_cr
289 
290  !-----------------------------------------------------------------------------
292 !OCL SERIAL
294  KA, KS, KE, &
295 #ifdef _OPENACC
296  work, &
297 #endif
298  ud, md, ld, &
299  iv, &
300  ov )
301  !$acc routine vector
302  implicit none
303  integer, intent(in) :: ka, ks, ke
304 
305  real(rp), intent(in) :: ud(ka) ! upper diagonal
306  real(rp), intent(in) :: md(ka) ! middle diagonal
307  real(rp), intent(in) :: ld(ka) ! lower diagonal
308  real(rp), intent(in) :: iv(ka) ! input vector
309 
310  real(rp), intent(out) :: ov(ka) ! output vector
311 
312 #ifdef _OPENACC
313  real(rp), intent(out) :: work(ke-ks+1,2,4)
314 #define a1_pcr(k,n) work(k,n,1)
315 #define b1_pcr(k,n) work(k,n,2)
316 #define c1_pcr(k,n) work(k,n,3)
317 #define x1_pcr(k,n) work(k,n,4)
318 #else
319  real(rp) :: a1_pcr(ke-ks+1,2)
320  real(rp) :: b1_pcr(ke-ks+1,2)
321  real(rp) :: c1_pcr(ke-ks+1,2)
322  real(rp) :: x1_pcr(ke-ks+1,2)
323 #endif
324  real(rp) :: f1, f2
325  integer :: st
326  integer :: lmax, kmax
327  integer :: iw1, iw2, iws
328  integer :: k, k1, k2, l
329 
330  kmax = ke - ks + 1
331  lmax = ceiling( log(real(kmax,rp)) / log(2.0_rp) )
332 
333  a1_pcr(:,1) = ld(ks:ke)
334  b1_pcr(:,1) = md(ks:ke)
335  c1_pcr(:,1) = ud(ks:ke)
336  x1_pcr(:,1) = iv(ks:ke)
337 
338  st = 1
339  iw1 = 1
340  iw2 = 2
341  do l = 1, lmax
342  !$omp parallel do private(k1,k2,f1,f2)
343  !$acc loop private(k1,k2,f1,f2)
344  do k = 1, kmax
345  k1 = k - st
346  k2 = k + st
347  if ( k1 < 1 ) then
348  k1 = k ! dummy
349  f1 = 0.0_rp
350  else
351  f1 = a1_pcr(k,iw1) / b1_pcr(k1,iw1)
352  end if
353  if ( k2 > kmax ) then
354  k2 = k ! dummy
355  f2 = 0.0_rp
356  else
357  f2 = c1_pcr(k,iw1) / b1_pcr(k2,iw1)
358  end if
359  a1_pcr(k,iw2) = - a1_pcr(k1,iw1) * f1
360  c1_pcr(k,iw2) = - c1_pcr(k2,iw1) * f2
361  b1_pcr(k,iw2) = b1_pcr(k,iw1) - c1_pcr(k1,iw1) * f1 - a1_pcr(k2,iw1) * f2
362  x1_pcr(k,iw2) = x1_pcr(k,iw1) - x1_pcr(k1,iw1) * f1 - x1_pcr(k2,iw1) * f2
363  end do
364  st = st * 2
365  iws = iw2
366  iw2 = iw1
367  iw1 = iws
368  end do
369 
370  !$omp parallel do
371  do k = 1, kmax
372  ov(ks+k-1) = x1_pcr(k,iw1) / b1_pcr(k,iw1)
373  end do
374 
375  return
376  end subroutine matrix_solver_tridiagonal_1d_pcr
377 
378 !OCL SERIAL
379  subroutine matrix_solver_tridiagonal_2d( &
380  KA, KS, KE, &
381  IA, IS, IE, &
382  ud, md, ld, &
383  iv, &
384  ov )
385  use scale_prc, only: &
386  prc_abort
387  implicit none
388  integer, intent(in) :: ka, ks, ke
389  integer, intent(in) :: ia, is, ie
390 
391  real(rp), intent(in) :: ud(ka,ia) ! upper diagonal
392  real(rp), intent(in) :: md(ka,ia) ! middle diagonal
393  real(rp), intent(in) :: ld(ka,ia) ! lower diagonal
394  real(rp), intent(in) :: iv(ka,ia) ! input vector
395 
396  real(rp), intent(out) :: ov(ka,ia) ! output vector
397 
398 #ifdef USE_CUDALIB
399  integer(8) :: bsize
400 #elif defined(_OPENACC)
401  real(rp) :: work(ks:ke,4) ! for CR
402 #else
403  real(rp) :: c(lsize,ks:ke)
404  real(rp) :: d(lsize,ks:ke)
405  real(rp) :: w(lsize,ks:ke)
406  real(rp) :: rdenom
407 #endif
408 
409  integer :: k, i, ii, l
410  !---------------------------------------------------------------------------
411 
412  !$acc data copyin(ud,md,ld,iv) copyout(ov)
413 
414 #ifdef USE_CUDALIB
415  !$acc host_data use_device(ud,md,ld,iv)
416 #ifdef SINGLE
417  status = cusparsesgtsv2stridedbatch_buffersizeext( &
418 #else
419  status = cusparsedgtsv2stridedbatch_buffersizeext( &
420 #endif
421  handle, &
422  ke-ks+1, &
423  ld(ks,is), md(ks,is), ud(ks,is), &
424  iv(ks,is), &
425  ie-is+1, ka, &
426  bsize )
427  !$acc end host_data
428  if ( status /= cusparse_status_success ) then
429  log_error("MATRIX_SOLVER_tridiagonal_2D",*) "cusparseDgtsv2StridedBatch_bufferSizeExt failed: ", status
430  call prc_abort
431  end if
432  if ( bsize > bufsize ) then
433  if ( allocated(pbuffer) ) deallocate( pbuffer )
434  allocate( pbuffer(bsize/rp) )
435  bufsize = bsize
436  end if
437 
438  !$acc kernels
439  ov(:,is:ie) = iv(:,is:ie)
440  !$acc end kernels
441 
442  !$acc host_data use_device(ud,md,ld,ov,pbuffer)
443 #ifdef SINGLE
444  status = cusparsesgtsv2stridedbatch( &
445 #else
446  status = cusparsedgtsv2stridedbatch( &
447 #endif
448  handle, &
449  ke-ks+1, &
450  ld(ks,is), md(ks,is), ud(ks,is), & ! (in)
451  ov(ks,is), & ! (in,out)
452  ie-is+1, ka, &
453  pbuffer )
454  !$acc end host_data
455  if ( status /= cusparse_status_success ) then
456  log_error("MATRIX_SOLVER_tridiagonal_2D",*) "cusparseDgtsv2StridedBatch failed: ", status
457  call prc_abort
458  end if
459 
460 #elif defined(_OPENACC)
461 
462  !$acc kernels
463  !$acc loop independent private(work)
464  do i = is, ie
465  call matrix_solver_tridiagonal_1d_cr( ka, ks, ke, &
466  work(:,:), &
467  ud(:,i), md(:,i), ld(:,i), &
468  iv(:,i), &
469  ov(:,i) )
470  end do
471  !$acc end kernels
472 
473 #else
474 
475  do ii = is, ie, lsize
476 
477  ! foward reduction
478  do l = 1, lsize
479  i = ii + l - 1
480  if ( i <= ie ) then
481  c(l,ks) = ud(ks,i) / md(ks,i)
482  d(l,ks) = iv(ks,i) / md(ks,i)
483  end if
484  end do
485  do k = ks+1, ke-1
486  do l = 1, lsize
487  i = ii + l - 1
488  if ( i <= ie ) then
489  rdenom = 1.0_rp / ( md(k,i) - ld(k,i) * c(l,k-1) )
490  c(l,k) = ud(k,i) * rdenom
491  d(l,k) = ( iv(k,i) - ld(k,i) * d(l,k-1) ) * rdenom
492  end if
493  end do
494  end do
495 
496  ! backward substitution
497  do l = 1, lsize
498  i = ii + l - 1
499  if ( i <= ie ) then
500  w(l,ke) = ( iv(ke,i) - ld(ke,i) * d(l,ke-1) ) / ( md(ke,i) - ld(ke,i) * c(l,ke-1) )
501  end if
502  end do
503  do k = ke-1, ks, -1
504  do l = 1, lsize
505  i = ii + l - 1
506  if ( i <= ie ) then
507  w(l,k) = d(l,k) - c(l,k) * w(l,k+1)
508  end if
509  end do
510  enddo
511 
512  do l = 1, lsize
513  i = ii + l - 1
514  if ( i <= ie ) then
515  do k = ks, ke
516  ov(k,i) = w(l,k)
517  end do
518  end if
519  end do
520 
521  end do
522 
523 #endif
524 
525  !$acc end data
526 
527  return
528  end subroutine matrix_solver_tridiagonal_2d
529 
530 !OCL SERIAL
532  KA, KS, KE, &
533  ud, md, ld, &
534  iv, &
535  ov )
536  !$acc routine vector
537  implicit none
538  integer, intent(in) :: KA, KS, KE
539 
540  real(RP), intent(in) :: ud(KA,LSIZE) ! upper diagonal
541  real(RP), intent(in) :: md(KA,LSIZE) ! middle diagonal
542  real(RP), intent(in) :: ld(KA,LSIZE) ! lower diagonal
543  real(RP), intent(in) :: iv(KA,LSIZE) ! input vector
544 
545  real(RP), intent(out) :: ov(KA,LSIZE) ! output vector
546 
547  real(RP) :: c(LSIZE,KS:KE)
548  real(RP) :: d(LSIZE,KS:KE)
549  real(RP) :: work(LSIZE,KS:KE)
550  real(RP) :: rdenom
551 
552  integer :: k, l
553  !---------------------------------------------------------------------------
554 
555  ! foward reduction
556  do l = 1, lsize
557  c(l,ks) = ud(ks,l) / md(ks,l)
558  d(l,ks) = iv(ks,l) / md(ks,l)
559  end do
560  do k = ks+1, ke-1
561 !OCL NOFULLUNROLL_PRE_SIMD
562  do l = 1, lsize
563  rdenom = 1.0_rp / ( md(k,l) - ld(k,l) * c(l,k-1) )
564  c(l,k) = ud(k,l) * rdenom
565  d(l,k) = ( iv(k,l) - ld(k,l) * d(l,k-1) ) * rdenom
566  end do
567  end do
568 
569  ! backward substitution
570  do l = 1, lsize
571  work(l,ke) = ( iv(ke,l) - ld(ke,l) * d(l,ke-1) ) / ( md(ke,l) - ld(ke,l) * c(l,ke-1) )
572  end do
573  do k = ke-1, ks, -1
574 !OCL NOFULLUNROLL_PRE_SIMD
575  do l = 1, lsize
576  work(l,k) = d(l,k) - c(l,k) * work(l,k+1)
577  end do
578  end do
579 
580  do l = 1, lsize
581  do k = ks, ke
582  ov(k,l) = work(l,k)
583  end do
584  end do
585 
586  return
588 
589 !OCL SERIAL
590  subroutine matrix_solver_tridiagonal_2d_trans( &
591  KA, KS, KE, &
592  ud, md, ld, &
593  iv, &
594  ov )
595  !$acc routine vector
596  implicit none
597  integer, intent(in) :: KA, KS, KE
598 
599  real(RP), intent(in) :: ud(LSIZE,KA) ! upper diagonal
600  real(RP), intent(in) :: md(LSIZE,KA) ! middle diagonal
601  real(RP), intent(in) :: ld(LSIZE,KA) ! lower diagonal
602  real(RP), intent(in) :: iv(LSIZE,KA) ! input vector
603 
604  real(RP), intent(out) :: ov(LSIZE,KA) ! output vector
605 
606  real(RP) :: c(LSIZE,KS:KE)
607  real(RP) :: d(LSIZE,KS:KE)
608  real(RP) :: rdenom
609 
610  integer :: k, l
611  !---------------------------------------------------------------------------
612 
613  ! foward reduction
614  do l = 1, lsize
615  c(l,ks) = ud(l,ks) / md(l,ks)
616  d(l,ks) = iv(l,ks) / md(l,ks)
617  end do
618  do k = ks+1, ke-1
619 !OCL NOFULLUNROLL_PRE_SIMD
620  do l = 1, lsize
621  rdenom = 1.0_rp / ( md(l,k) - ld(l,k) * c(l,k-1) )
622  c(l,k) = ud(l,k) * rdenom
623  d(l,k) = ( iv(l,k) - ld(l,k) * d(l,k-1) ) * rdenom
624  end do
625  end do
626 
627  ! backward substitution
628  do l = 1, lsize
629  ov(l,ke) = ( iv(l,ke) - ld(l,ke) * d(l,ke-1) ) / ( md(l,ke) - ld(l,ke) * c(l,ke-1) )
630  end do
631  do k = ke-1, ks, -1
632 !OCL NOFULLUNROLL_PRE_SIMD
633  do l = 1, lsize
634  ov(l,k) = d(l,k) - c(l,k) * ov(l,k+1)
635  end do
636  end do
637 
638  return
639  end subroutine matrix_solver_tridiagonal_2d_trans
640 
641  !-----------------------------------------------------------------------------
642  subroutine matrix_solver_tridiagonal_3d( &
643  KA, KS, KE, &
644  IA, IS, IE, &
645  JA, JS, JE, &
646  ud, &
647  md, &
648  ld, &
649  iv, &
650  ov, &
651  mask )
652  use scale_prc, only: &
653  prc_abort
654  implicit none
655 
656  integer, intent(in) :: KA, KS, KE ! array size
657  integer, intent(in) :: IA, IS, IE ! array size
658  integer, intent(in) :: JA, JS, JE ! array size
659  real(RP), intent(in) :: ud(KA,IA,JA) ! upper diagonal
660  real(RP), intent(in) :: md(KA,IA,JA) ! middle diagonal
661  real(RP), intent(in) :: ld(KA,IA,JA) ! lower diagonal
662  real(RP), intent(in) :: iv(KA,IA,JA) ! input vector
663  real(RP), intent(out), target :: ov(KA,IA,JA) ! output vector
664 
665  logical, intent(in), optional :: mask(IA,JA)
666 
667 #ifdef USE_CUDALIB
668  real(RP), pointer :: ovl(:,:,:)
669  real(RP), target :: buf(KA,IA,JA)
670  integer(8) :: bsize
671 #elif defined(_OPENACC)
672  real(RP) :: work(KS:KE,4) ! for CR
673 #else
674  real(RP) :: udl(LSIZE,KA)
675  real(RP) :: mdl(LSIZE,KA)
676  real(RP) :: ldl(LSIZE,KA)
677  real(RP) :: ivl(LSIZE,KA)
678  real(RP) :: ovl(LSIZE,KA)
679  integer :: idx(LSIZE)
680  integer :: len
681 #endif
682 
683  integer :: i, j, k, l
684  !---------------------------------------------------------------------------
685 
686 
687  !$acc data copyin(ud,md,ld,iv) copyout(ov)
688  !$acc data copyin(mask) if( present(mask) )
689 
690 #ifdef USE_CUDALIB
691  !$acc host_data use_device(ud,md,ld,iv)
692 #ifdef SINGLE
693  status = cusparsesgtsv2stridedbatch_buffersizeext( &
694 #else
695  status = cusparsedgtsv2stridedbatch_buffersizeext( &
696 #endif
697  handle, &
698  ke-ks+1, &
699  ld(ks,1,1), md(ks,1,1), ud(ks,1,1), &
700  iv(ks,1,1), &
701  ia*ja, ka, &
702  bsize )
703  !$acc end host_data
704  if ( status /= cusparse_status_success ) then
705  log_error("MATRIX_SOLVER_tridiagonal_3D",*) "cusparseDgtsv2StridedBatch_bufferSizeExt failed: ", status
706  call prc_abort
707  end if
708  if ( bsize > bufsize ) then
709  if ( bufsize > 0 ) deallocate( pbuffer )
710  allocate( pbuffer(bsize/rp) )
711  bufsize = bsize
712  end if
713 
714  if ( present(mask) ) then
715  !$acc enter data create(buf)
716  ovl => buf
717  else
718  ovl => ov
719  end if
720  !$acc kernels
721  ovl(:,:,:) = iv(:,:,:)
722  !$acc end kernels
723  !$acc host_data use_device(ud,md,ld,ovl,pbuffer)
724 #ifdef SINGLE
725  status = cusparsesgtsv2stridedbatch( &
726 #else
727  status = cusparsedgtsv2stridedbatch( &
728 #endif
729  handle, &
730  ke-ks+1, &
731  ld(ks,1,1), md(ks,1,1), ud(ks,1,1), & ! (in)
732  ovl(ks,1,1), & ! (in,out)
733  ia*ja, ka, &
734  pbuffer )
735  !$acc end host_data
736  if ( status /= cusparse_status_success ) then
737  log_error("MATRIX_SOLVER_tridiagonal_3D",*) "cusparseDgtsv2StridedBatch failed: ", status
738  call prc_abort
739  end if
740 
741  if ( present(mask) ) then
742  !$acc kernels
743  !$acc loop independent collapse(3)
744  do j = js, je
745  do i = is, ie
746  do k = ks, ke
747  if ( mask(i,j) ) then
748  ov(k,i,j) = ovl(k,i,j)
749  end if
750  end do
751  end do
752  end do
753  !$acc end kernels
754  !$acc exit data delete(buf)
755  end if
756 
757 #elif defined(_OPENACC)
758 
759  if ( present(mask) ) then
760  !$acc kernels
761  !$acc loop independent
762  do j = js, je
763  !$acc loop independent private(work)
764  do i = is, ie
765  if ( mask(i,j) ) then
766  call matrix_solver_tridiagonal_1d_cr( ka, ks, ke, &
767  work(:,:), &
768  ud(:,i,j), md(:,i,j), ld(:,i,j), &
769  iv(:,i,j), &
770  ov(:,i,j) )
771  end if
772  end do
773  end do
774  !$acc end kernels
775  else
776  !$acc kernels
777  !$acc loop independent
778  do j = js, je
779  !$acc loop independent private(work)
780  do i = is, ie
781  call matrix_solver_tridiagonal_1d_cr( ka, ks, ke, &
782  work(:,:), &
783  ud(:,i,j), md(:,i,j), ld(:,i,j), &
784  iv(:,i,j), &
785  ov(:,i,j) )
786  end do
787  end do
788  !$acc end kernels
789  end if
790 
791 #else
792 
793  if ( present(mask) ) then
794  !$omp parallel do schedule(dynamic) &
795  !$omp private(len,udl,mdl,ldl,ivl,ovl,idx)
796  do j = js, je
797  len = 0
798  do i = is, ie
799  if ( mask(i,j) ) then
800  len = len + 1
801  idx(len) = i
802  if ( len == lsize ) then
803  do k = ks, ke
804 !OCL NOFULLUNROLL_PRE_SIMD
805  do l = 1, lsize
806  udl(l,k) = ud(k,idx(l),j)
807  mdl(l,k) = md(k,idx(l),j)
808  ldl(l,k) = ld(k,idx(l),j)
809  ivl(l,k) = iv(k,idx(l),j)
810  end do
811  end do
812  call matrix_solver_tridiagonal_2d_trans( ka, ks, ke, &
813  udl(:,:), mdl(:,:), ldl(:,:), & ! (in)
814  ivl(:,:), & ! (in)
815  ovl(:,:) ) ! (out)
816 !OCL NORECURRENCE
817  do l = 1, lsize
818  do k = ks, ke
819  ov(k,idx(l),j) = ovl(l,k)
820  end do
821  end do
822  len = 0
823  end if
824  end if
825  end do
826  if ( len > 0 ) then
827  do k = ks, ke
828 !OCL NOFULLUNROLL_PRE_SIMD
829  do l = 1, len
830  udl(l,k) = ud(k,idx(l),j)
831  mdl(l,k) = md(k,idx(l),j)
832  ldl(l,k) = ld(k,idx(l),j)
833  ivl(l,k) = iv(k,idx(l),j)
834  end do
835 #if defined DEBUG || defined QUICKDEBUG
836  do l = len+1, lsize
837  udl(l,k) = 0.0_rp
838  mdl(l,k) = 1.0_rp
839  ldl(l,k) = 0.0_rp
840  ivl(l,k) = 0.0_rp
841  end do
842 #endif
843  end do
844  call matrix_solver_tridiagonal_2d_trans( ka, ks, ke, &
845  udl(:,:), mdl(:,:), ldl(:,:), & ! (in)
846  ivl(:,:), & ! (in)
847  ovl(:,:) ) ! (out)
848 !OCL NORECURRENCE
849  do l = 1, len
850  do k = ks, ke
851  ov(k,idx(l),j) = ovl(l,k)
852  end do
853  end do
854  end if
855  end do
856  else
857  !$omp parallel do default(none) OMP_SCHEDULE_ &
858  !$omp shared(JS,JE,IA,IS,IE,KA,KS,KE,ud,md,ld,iv,ov) &
859  !$omp private(j)
860  do j = js, je
861  call matrix_solver_tridiagonal_2d( ka, ks, ke, ia, is, ie, &
862  ud(:,:,j), md(:,:,j), ld(:,:,j), & ! (in)
863  iv(:,:,j), & ! (in)
864  ov(:,:,j) ) ! (out)
865  enddo
866  end if
867 #endif
868 
869  !$acc end data
870  !$acc end data
871 
872  return
873  end subroutine matrix_solver_tridiagonal_3d
874 
875  !-----------------------------------------------------------------------------
876  ! eigenvalue decomposition
877  !-----------------------------------------------------------------------------
878  subroutine matrix_solver_eigenvalue_decomposition(n,a,eival,eivec,simdlen)
879  use scale_prc, only: &
880  prc_abort
881  implicit none
882 
883  integer, intent(in) :: n ! dimension of matrix
884  real(rp), intent(in) :: a (n,n) ! input matrix
885  real(rp), intent(out) :: eival(n) ! eiven values in decending order. i.e. eival(1) is the largest
886  real(rp), intent(out) :: eivec(n,n) ! eiven vectors
887 
888  integer, intent(in), optional :: simdlen
889 
890  real(rp) :: eival_inc
891 
892  real(rp), allocatable :: b (:,:)
893  real(rp), allocatable :: w (:)
894  real(rp), allocatable :: work (:) ! working array, size is lwork = 2*n*n + 6*n + 1
895  integer, allocatable :: iwork(:) ! working array, size is liwork = 5*n + 3
896 
897  integer :: simdlen_
898  integer :: nrank_eff
899  integer :: lda
900  integer :: lwork
901  integer :: liwork
902 
903  integer :: iblk, jblk
904  integer :: imax, jmax
905  integer :: ivec, jvec
906 
907  integer :: i, j, ierr
908  !---------------------------------------------------------------------------
909 
910 #ifdef DA
911  if( present(simdlen) ) then
912  simdlen_ = simdlen
913  else
914  simdlen_ = 64
915  end if
916 
917  lda = n
918 
919  lwork = 2*n*n + 6*n + 1
920  liwork = 5*n + 3
921 
922  allocate( b(lda,n) )
923  allocate( w(lda) )
924  allocate( work(lwork) )
925  allocate( iwork(liwork) )
926 
927  do jblk = 1, n, simdlen_
928  jmax = min( n-jblk+1, simdlen_ )
929  do iblk = 1, n, simdlen_
930  imax = min( n-iblk+1, simdlen_ )
931 
932  do jvec = 1, jmax
933  j = jblk + jvec - 1
934  do ivec = 1, imax
935  i = iblk + ivec - 1
936 
937  b(i,j) = 0.5_rp * ( a(i,j) + a(j,i) )
938  end do
939  end do
940  end do
941  end do
942 
943  ! use the SSYEVD/DSYEVD subroutine in LAPACK
944  if( rp == sp ) then
945  call ssyevd("V","L",n,b,lda,w,work,lwork,iwork,liwork,ierr)
946  else
947  call dsyevd("V","L",n,b,lda,w,work,lwork,iwork,liwork,ierr)
948  end if
949 
950  if( ierr /= 0 ) then
951  log_info('MATRIX_SOLVER_eigenvalue_decomposition',*) 'LAPACK/SYEVD error code is ', ierr
952  log_info('MATRIX_SOLVER_eigenvalue_decomposition',*) 'input a'
953  do j = 1, n
954  log_info('MATRIX_SOLVER_eigenvalue_decomposition',*) j ,a(:,j)
955  enddo
956  log_info('MATRIX_SOLVER_eigenvalue_decomposition',*) 'output eival'
957  log_info('MATRIX_SOLVER_eigenvalue_decomposition',*) w(:)
958  log_info('MATRIX_SOLVER_eigenvalue_decomposition',*) 'output eivec'
959  do j = 1, n
960  log_info('MATRIX_SOLVER_eigenvalue_decomposition',*) j, b(:,j)
961  enddo
962  log_info('MATRIX_SOLVER_eigenvalue_decomposition',*) 'Try to use LAPACK/SYEV ...'
963 
964  ! Temporary treatment because of the instability of SYEVD
965  do jblk = 1, n, simdlen_
966  jmax = min( n-jblk+1, simdlen_ )
967  do iblk = 1, n, simdlen_
968  imax = min( n-iblk+1, simdlen_ )
969 
970  do jvec = 1, jmax
971  j = jblk + jvec - 1
972  do ivec = 1, imax
973  i = iblk + ivec - 1
974 
975  b(i,j) = 0.5_rp * ( a(i,j) + a(j,i) )
976  end do
977  end do
978  end do
979  end do
980 
981  ! use SSYEV/DSYEV subroutine in LAPACK
982  if( rp == sp ) then
983  call ssyev("V","L",n,b,lda,w,work,lwork,ierr)
984  else
985  call dsyev("V","L",n,b,lda,w,work,lwork,ierr)
986  end if
987 
988  if ( ierr /= 0 ) then
989  log_error('MATRIX_SOLVER_eigenvalue_decomposition',*) 'LAPACK/SYEV error code is ', ierr, '! STOP.'
990  call prc_abort
991  endif
992  endif
993 
994  if( w(1) < 0.0_rp ) then
995  eival_inc = w(n) / ( 1.e+5_rp - 1.0_rp )
996  do i = 1, n
997  w(i) = w(i) + eival_inc
998  enddo
999  else if( w(n)/1.e+5_rp > w(1) ) then
1000  eival_inc = ( w(n) - w(1)*1.e+5_rp ) / ( 1.e+5_rp - 1.0_rp )
1001  do i = 1, n
1002  w(i) = w(i) + eival_inc
1003  end do
1004  end if
1005 
1006  ! reverse order
1007  do i = 1, n
1008  eival(i) = w(i)
1009  enddo
1010  do j = 1, n
1011  do i = 1, n
1012  eivec(i,j) = b(i,j)
1013  enddo
1014  enddo
1015 
1016  nrank_eff = n
1017 
1018  if( eival(n) > 0.0_rp ) then
1019  do i = 1, n
1020  if( eival(i) < abs(eival(n))*sqrt(epsilon(eival)) ) then
1021  nrank_eff = nrank_eff - 1
1022  eival(i) = 0.0_rp
1023  eivec(:,i) = 0.0_rp
1024  end if
1025  end do
1026  else
1027  ! check zero
1028  log_error('MATRIX_SOLVER_eigenvalue_decomposition',*) 'All eigenvalues are below 0! STOP.'
1029  call prc_abort
1030  endif
1031 
1032  deallocate( b )
1033  deallocate( w )
1034  deallocate( work )
1035  deallocate( iwork )
1036 #else
1037  log_error('MATRIX_SOLVER_eigenvalue_decomposition',*) 'Binary not compiled for DA! STOP.'
1038  call prc_abort
1039 #endif
1040 
1041  return
1043 
1044 end module scale_matrix
scale_precision::sp
integer, parameter, public sp
Definition: scale_precision.F90:31
scale_matrix::matrix_solver_tridiagonal_1d_ta
subroutine, public matrix_solver_tridiagonal_1d_ta( KA, KS, KE, ifdef _OPENACC
solve tridiagonal matrix with Thomas's algorithm
Definition: scale_matrix.F90:133
scale_prc::prc_abort
subroutine, public prc_abort
Abort Process.
Definition: scale_prc.F90:350
scale_precision
module PRECISION
Definition: scale_precision.F90:14
scale_matrix::matrix_solver_tridiagonal_1d_cr
subroutine, public matrix_solver_tridiagonal_1d_cr( KA, KS, KE, ifdef _OPENACC
solve tridiagonal matrix with Cyclic Reduction method
Definition: scale_matrix.F90:186
scale_matrix::matrix_solver_eigenvalue_decomposition
subroutine, public matrix_solver_eigenvalue_decomposition(n, a, eival, eivec, simdlen)
Definition: scale_matrix.F90:879
scale_prc
module PROCESS
Definition: scale_prc.F90:11
scale_io
module STDIO
Definition: scale_io.F90:10
scale_matrix::matrix_finalize
subroutine, public matrix_finalize
Finalize.
Definition: scale_matrix.F90:117
scale_matrix::matrix_solver_tridiagonal_1d_pcr
subroutine, public matrix_solver_tridiagonal_1d_pcr( KA, KS, KE, ifdef _OPENACC
solve tridiagonal matrix with Parallel Cyclic Reduction method
Definition: scale_matrix.F90:296
scale_prof
module profiler
Definition: scale_prof.F90:11
scale_matrix
module MATRIX
Definition: scale_matrix.F90:17
scale_matrix::matrix_solver_tridiagonal_2d_block
subroutine matrix_solver_tridiagonal_2d_block(KA, KS, KE, ud, md, ld, iv, ov)
Definition: scale_matrix.F90:536
scale_matrix::matrix_setup
subroutine, public matrix_setup
Setup.
Definition: scale_matrix.F90:80