SCALE-RM
mod_launcher.F90
Go to the documentation of this file.
1 !-------------------------------------------------------------------------------
11 !-------------------------------------------------------------------------------
13  !-----------------------------------------------------------------------------
14  !
15  !++ used modules
16  !
17  use scale_precision
18  use scale_io
19  use scale_prof
20 
21  !-----------------------------------------------------------------------------
22  implicit none
23  private
24  !-----------------------------------------------------------------------------
25  !
26  !++ included parameters
27  !
28  !-----------------------------------------------------------------------------
29  !
30  !++ Public procedure
31  !
32  public :: launcher
33 
34  !-----------------------------------------------------------------------------
35  !
36  !++ parameters & variables
37  !
38  !=============================================================================
39 
40 contains
41  subroutine launcher( &
42  EXECUTE_PREPROCESS, &
43  EXECUTE_MODEL )
44  use scale_prc, only: &
46  prc_comm_null, &
47  prc_abort, &
48  prc_mpistart, &
49  prc_mpifinish, &
56  use scale_fpm, only: &
57  fpm_init
58  use mod_rm_prep, only: &
59  rm_prep
60  use mod_rm_driver, only: &
61  rm_driver
62 #ifdef _OPENACC
63  use openacc
64 #endif
65  implicit none
66 
67  logical, intent(in) :: execute_preprocess ! execute preprocess tools?
68  logical, intent(in) :: execute_model ! execute main model?
69 
70  integer :: num_bulkjob = 1 ! number of bulk job
71  integer :: num_bulkjob_once = 1 ! number of bulk job for one iteration
72  logical :: bulkjob = .false.
73  integer :: num_iteration_bulk = 1 ! number of iteration for bulk job
74  integer :: bulkjob_start_dirnum = 0 ! start number of directory for bulk job
75  logical :: add_bulkjob_path = .false. ! add path of the bulk job to files
76  integer :: num_domain = 1 ! number of domains
77  integer :: num_fail_tolerance = 1 ! tolerance number of failure processes
78  integer :: freq_fail_check = 5 ! FPM polling frequency per DT (0: no polling)
79  integer :: prc_domains(prc_domain_nlim) = 0 ! number of total process in each domain
80  character(len=H_LONG) :: conf_files (prc_domain_nlim) = "" ! name of configulation files
81  logical :: abort_all_jobs = .false. ! abort all jobs or not?
82  logical :: log_split = .false. ! log-output for mpi splitting?
83  logical :: color_reorder = .true. ! coloring reorder for mpi splitting?
84  logical :: failure_prc_manage = .false. ! use failure process management?
85 
86  namelist / param_launcher / &
87 ! EXECUTE_PREPROCESS, &
88 ! EXECUTE_MODEL, &
89  num_bulkjob, &
90  bulkjob, &
91  num_iteration_bulk, &
92  bulkjob_start_dirnum, &
93  add_bulkjob_path, &
94  num_domain, &
95  num_fail_tolerance, &
96  freq_fail_check, &
97  prc_domains, &
98  conf_files, &
99  abort_all_jobs, &
100  log_split, &
101  color_reorder, &
102  failure_prc_manage
103 
104  integer :: universal_comm ! universal communicator
105  integer :: universal_nprocs ! number of procs in universal communicator
106  integer :: universal_myrank ! my rank in universal communicator
107  logical :: universal_master ! master process in universal communicator?
108  character(len=H_LONG) :: universal_cnf_fname ! config file for launcher
109 
110  integer :: global_comm ! communicator for each member
111  integer :: global_nprocs ! number of procs in global communicator
112  integer :: prc_bulkjob(prc_domain_nlim) = 0 ! number of procs in each bulk job = global_nprocs
113  integer :: id_bulkjob ! bulk job ID
114 
115  logical :: use_fpm = .false. ! switch for fpm module
116 
117  integer :: local_comm ! assigned local communicator
118  integer :: id_domain ! domain ID
119  integer :: intercomm_parent ! inter communicator with parent
120  integer :: intercomm_child ! inter communicator with child
121  character(len=5) :: path ! path to config file for local domain
122 
123  integer :: itr
124  integer :: fid, ierr
125  !-----------------------------------------------------------
126 
127  ! start MPI
128  call prc_mpistart( universal_comm ) ! [OUT]
129 
130  call prc_universal_setup( universal_comm, & ! [IN]
131  universal_nprocs, & ! [OUT]
132  universal_myrank, & ! [OUT]
133  universal_master ) ! [OUT]
134 
135 #ifdef _OPENACC
136  block
137  integer :: ngpus, gpuid
138  ngpus = acc_get_num_devices(acc_device_nvidia)
139  if( universal_master ) write(*,*) "*** Number of GPUs: ", min(ngpus, universal_nprocs)
140  if ( ngpus > 0 ) then
141  gpuid = mod(universal_myrank, ngpus)
142  call acc_set_device_num(gpuid, acc_device_nvidia)
143  end if
144  end block
145 #endif
146 
147  if( universal_master ) write(*,*) '*** Start Launch System for SCALE-RM'
148 
149  !--- read launcher config
150 
151  universal_cnf_fname = io_arg_getfname( universal_master )
152 
153  fid = io_cnf_open( universal_cnf_fname, & ! [IN]
154  universal_master ) ! [IN]
155 
156  ! set default
157  conf_files(1) = universal_cnf_fname
158 
159  ! read namelist
160  rewind(fid)
161  read(fid,nml=param_launcher,iostat=ierr)
162  if ( ierr < 0 ) then !--- missing
163  ! keep default setting (no members, no nesting)
164  elseif( ierr > 0 ) then !--- fatal error
165  if( universal_master ) write(*,*) 'xxx Not appropriate names in namelist PARAM_LAUNCHER. Check!'
166  call prc_abort
167  endif
168 
169  close(fid)
170 
171  if ( execute_preprocess &
172  .OR. execute_model ) then
173  if( universal_master ) write(*,*) "*** Execute preprocess? : ", execute_preprocess
174  if( universal_master ) write(*,*) "*** Execute model? : ", execute_model
175  else
176  if( universal_master ) write(*,*) 'xxx No execution. please check PARAM_LAUNCHER. STOP'
177  call prc_abort
178  endif
179 
180  !--- split for bulk jobs
181 
182  bulkjob = bulkjob .or. ( num_bulkjob > 1 )
183 
184  global_nprocs = universal_nprocs
185  if ( bulkjob ) then
186 
187  if ( num_bulkjob == 1 ) num_iteration_bulk = 1
188  num_bulkjob_once = ceiling( real(num_bulkjob) / num_iteration_bulk )
189  if ( mod(universal_nprocs,num_bulkjob_once) /= 0 ) then !--- fatal error
190  if( universal_master ) write(*,*) 'xxx Total Num of Processes must be divisible by NUM_BULKJOB/NUM_ITERATION_BULK. Check!'
191  if( universal_master ) write(*,*) 'xxx Total Num of Processes = ', universal_nprocs
192  if( universal_master ) write(*,*) 'xxx NUM_BULKJOB = ', num_bulkjob
193  if( universal_master ) write(*,*) 'xxx NUM_ITERATION_BULK = ', num_iteration_bulk
194  if( universal_master ) write(*,*) 'xxx NUM_BULKJOB / NUM_ITERATION_BULK = ', num_bulkjob_once
195  call prc_abort
196  endif
197 
198  global_nprocs = universal_nprocs / num_bulkjob_once
199 
200  if( universal_master ) write(*,'(1x,A,I5)') "*** TOTAL # of BULK JOBS = ", num_bulkjob
201  if( universal_master ) write(*,'(1x,A,I5)') "*** # of BULK JOB for each iteration = ", num_bulkjob_once
202  if( universal_master ) write(*,'(1x,A,I5)') "*** Total # of PROCESS = ", universal_nprocs
203  if( universal_master ) write(*,'(1x,A,I5)') "*** # of PROCESS of each JOB = ", global_nprocs
204 
205  if ( bulkjob_start_dirnum < 0 ) then
206  if( universal_master ) write(*,*) 'xxx BULKJOB_START_DIRNUM must >=0'
207  call prc_abort
208  end if
209  if ( bulkjob_start_dirnum + num_bulkjob -1 > 9999 ) then
210  if( universal_master ) write(*,*) 'xxx BULKJOB_START_DIRNUM + NUM_BULKJOB must <= 9999'
211  call prc_abort
212  end if
213 
214  if ( failure_prc_manage ) then
215  if( universal_master ) write(*,'(1x,A)') "*** Available: Failure Process Management"
216  use_fpm = .true. !--- available only in bulk job
217  if ( num_fail_tolerance <= 0 ) then !--- fatal error
218  if( universal_master ) write(*,*) 'xxx Num of Failure Processes must be positive number. Check!'
219  if( universal_master ) write(*,*) 'xxx NUM_FAIL_TOLERANCE = ', num_fail_tolerance
220  call prc_abort
221  endif
222 
223  if ( num_fail_tolerance > num_bulkjob ) then !--- fatal error
224  write(*,*) 'xxx NUM_FAIL_TOLERANCE is bigger than # of NUM_BLUKJOB'
225  write(*,*) ' set to be: NUM_FAIL_TOLERANCE <= NUM_BLUKJOB'
226  call prc_abort
227  endif
228 
229  if ( num_domain > 1 ) then !--- avoid error in the current implementation
230  if ( freq_fail_check >= 1 .or. num_fail_tolerance /= num_bulkjob ) then
231  write(*,*) 'xxx Full function of FPM is not available with online nesting.'
232  write(*,*) ' You can use this only to avoid job stop until all members finish.'
233  write(*,*) ' for this purpose, set: FREQ_FAIL_CHECK = 0'
234  write(*,*) ' NUM_FAIL_TOLERANCE == NUM_BULKJOB'
235  call prc_abort
236  endif
237  endif
238  endif
239  endif
240  prc_bulkjob(1:num_bulkjob) = global_nprocs
241 
242  ! communicator split for bulk/ensemble
243  call prc_mpisplit_bulk( universal_comm, & ! [IN]
244  num_bulkjob_once, & ! [IN]
245  prc_bulkjob(:), & ! [IN]
246  log_split, & ! [IN]
247  global_comm, & ! [OUT]
248  id_bulkjob ) ! [OUT]
249 
250  call prc_global_setup( abort_all_jobs, & ! [IN]
251  global_comm ) ! [IN]
252 
253  !--- split for nesting
254 
255  if ( num_domain > 1 ) then
256  if( universal_master ) write(*,'(1x,A,I5)') "*** TOTAL DOMAIN NUMBER = ", num_domain
257  if( universal_master ) write(*,'(1x,A,L5)') "*** Flag of ABORT ALL JOBS = ", abort_all_jobs
258  endif
259 
260  ! communicator split for nesting domains
261  call prc_mpisplit_nest( global_comm, & ! [IN]
262  num_domain, & ! [IN]
263  prc_domains(:), & ! [IN]
264  log_split, & ! [IN]
265  color_reorder, & ! [IN]
266  local_comm, & ! [OUT]
267  id_domain ) ! [OUT]
268 
269  !--- initialize FPM module & error handler
270  call fpm_init( num_fail_tolerance, & ! [IN]
271  freq_fail_check, & ! [IN]
272  universal_comm, & ! [IN]
273  global_comm, & ! [IN]
274  local_comm, & ! [IN]
275  num_bulkjob, & ! [IN]
276  prc_global_root, & ! [IN]
277  use_fpm ) ! [IN]
278 
279  call prc_errhandler_setup( use_fpm, universal_master )
280 
281  !--- start main routine
282 
283  do itr = 1, num_iteration_bulk
284 
285  if ( id_bulkjob > num_bulkjob ) exit
286 
287  if( universal_master .and. num_iteration_bulk > 1 ) then
288  write(*,*)
289  write(*,*) '*** BULK ITERATION COUNT : ', itr, '/', num_iteration_bulk
290  end if
291 
292 
293  call io_set_universalrank( universal_myrank, & ! [IN]
294  id_bulkjob, & ! [IN]
295  id_domain ) ! [IN]
296 
297  if ( bulkjob ) then
298  write(path,'(I4.4,A)') id_bulkjob + bulkjob_start_dirnum, "/"
299  else
300  path = ""
301  endif
302 
303  if ( execute_preprocess ) then
304  call rm_prep( local_comm, & ! [IN]
305  conf_files(id_domain), & ! [IN]
306  path, & ! [IN]
307  add_bulkjob_path ) ! [IN]
308  endif
309 
310  if ( execute_model ) then
311  call rm_driver( local_comm, & ! [IN]
312  conf_files(id_domain), & ! [IN]
313  path, & ! [IN]
314  add_bulkjob_path ) ! [IN]
315  endif
316 
317  id_bulkjob = id_bulkjob + num_bulkjob_once
318 
319  end do
320 
321  ! stop MPI
322  call prc_mpifinish
323 
324  if( universal_master ) write(*,*) '*** End Launch System for SCALE-RM'
325 
326  end subroutine launcher
327 
328 end module mod_launcher
scale_prc::prc_abort
subroutine, public prc_abort
Abort Process.
Definition: scale_prc.F90:350
scale_prc::prc_universal_setup
subroutine, public prc_universal_setup(comm, nprocs, myrank, ismaster)
setup MPI in universal communicator
Definition: scale_prc.F90:154
scale_prc::prc_mpisplit_bulk
subroutine, public prc_mpisplit_bulk(ORG_COMM_WORLD, NUM_BULKJOB, PRC_BULKJOB, debug, SUB_COMM_WORLD, ID_BULKJOB)
MPI Communicator Split (bulk job)
Definition: scale_prc.F90:419
scale_prc::prc_global_setup
subroutine, public prc_global_setup(abortall, comm)
setup MPI in global communicator
Definition: scale_prc.F90:194
scale_io::io_set_universalrank
subroutine, public io_set_universalrank(myrank, jobid, domainid)
Put for error log.
Definition: scale_io.F90:397
scale_precision
module PRECISION
Definition: scale_precision.F90:14
scale_prc::prc_mpisplit_nest
subroutine, public prc_mpisplit_nest(ORG_COMM_WORLD, NUM_DOMAIN, PRC_DOMAIN, debug, color_reorder, SUB_COMM_WORLD, ID_DOMAIN)
MPI Communicator Split (nesting)
Definition: scale_prc.F90:522
mod_launcher::launcher
subroutine, public launcher(EXECUTE_PREPROCESS, EXECUTE_MODEL)
Definition: mod_launcher.F90:44
scale_prc::prc_mpistart
subroutine, public prc_mpistart(comm)
Start MPI.
Definition: scale_prc.F90:128
mod_rm_driver::rm_driver
subroutine, public rm_driver(comm_world, cnf_fname, path, add_path)
Setup.
Definition: mod_rm_driver.F90:59
scale_fpm::fpm_init
subroutine, public fpm_init(max_failure, polling_freq, universal_comm, global_comm, local_comm, num_member, global_root, use_fpm)
Initialize FPM.
Definition: scale_fpm.F90:79
mod_rm_prep
module SCALE-RM prep
Definition: mod_rm_prep.F90:14
scale_prc
module PROCESS
Definition: scale_prc.F90:11
scale_io
module STDIO
Definition: scale_io.F90:10
scale_prc::prc_comm_null
integer, parameter, public prc_comm_null
Definition: scale_prc.F90:69
scale_prc::prc_errhandler_setup
subroutine, public prc_errhandler_setup(use_fpm, master)
Setup MPI error handler.
Definition: scale_prc.F90:313
scale_prc::prc_global_root
integer, dimension(prc_domain_nlim), public prc_global_root
root processes in global members
Definition: scale_prc.F90:86
scale_prof
module profiler
Definition: scale_prof.F90:11
scale_prc::prc_domain_nlim
integer, parameter, public prc_domain_nlim
max depth of domains
Definition: scale_prc.F90:68
mod_launcher
module Launcher(a launcher of main routine)
Definition: mod_launcher.F90:12
mod_rm_driver
module SCALE-RM (a main routine of regional model)
Definition: mod_rm_driver.F90:13
scale_fpm
module FPM
Definition: scale_fpm.F90:10
mod_rm_prep::rm_prep
subroutine, public rm_prep(comm_world, cnf_fname, path, add_path)
Setup.
Definition: mod_rm_prep.F90:60
scale_prc::prc_mpifinish
subroutine, public prc_mpifinish
Stop MPI peacefully.
Definition: scale_prc.F90:366
scale_io::io_arg_getfname
character(len=h_long) function, public io_arg_getfname(is_master, allow_noconf)
get config filename from argument
Definition: scale_io.F90:468
scale_io::io_cnf_open
integer function, public io_cnf_open(fname, is_master)
open config file
Definition: scale_io.F90:500