67 logical,
intent(in) :: EXECUTE_PREPROCESS
68 logical,
intent(in) :: EXECUTE_MODEL
70 integer :: NUM_BULKJOB = 1
71 integer :: NUM_BULKJOB_ONCE = 1
72 logical :: BULKJOB = .false.
73 integer :: NUM_ITERATION_BULK = 1
74 integer :: BULKJOB_START_DIRNUM = 0
75 logical :: ADD_BULKJOB_PATH = .false.
76 integer :: NUM_DOMAIN = 1
77 integer :: NUM_FAIL_TOLERANCE = 1
78 integer :: FREQ_FAIL_CHECK = 5
79 integer :: PRC_DOMAINS(PRC_DOMAIN_nlim) = 0
80 character(len=H_LONG) :: CONF_FILES (PRC_DOMAIN_nlim) =
""
81 logical :: ABORT_ALL_JOBS = .false.
82 logical :: LOG_SPLIT = .false.
83 logical :: COLOR_REORDER = .true.
84 logical :: FAILURE_PRC_MANAGE = .false.
86 namelist / param_launcher / &
92 bulkjob_start_dirnum, &
104 integer :: universal_comm
105 integer :: universal_nprocs
106 integer :: universal_myrank
107 logical :: universal_master
108 character(len=H_LONG) :: universal_cnf_fname
110 integer :: global_comm
111 integer :: global_nprocs
112 integer :: PRC_BULKJOB(PRC_DOMAIN_nlim) = 0
113 integer :: ID_BULKJOB
115 logical :: use_fpm = .false.
117 integer :: local_comm
119 integer :: intercomm_parent
120 integer :: intercomm_child
121 character(len=5) :: path
137 integer :: ngpus, gpuid
138 ngpus = acc_get_num_devices(acc_device_nvidia)
139 if( universal_master )
write(*,*)
"*** Number of GPUs: ", min(ngpus, universal_nprocs)
140 if ( ngpus > 0 )
then
141 gpuid = mod(universal_myrank, ngpus)
142 call acc_set_device_num(gpuid, acc_device_nvidia)
147 if( universal_master )
write(*,*)
'*** Start Launch System for SCALE-RM'
151 universal_cnf_fname = io_arg_getfname( universal_master )
153 fid = io_cnf_open( universal_cnf_fname, &
157 conf_files(1) = universal_cnf_fname
161 read(fid,nml=param_launcher,iostat=ierr)
164 elseif( ierr > 0 )
then
165 if( universal_master )
write(*,*)
'xxx Not appropriate names in namelist PARAM_LAUNCHER. Check!'
171 if ( execute_preprocess &
172 .OR. execute_model )
then
173 if( universal_master )
write(*,*)
"*** Execute preprocess? : ", execute_preprocess
174 if( universal_master )
write(*,*)
"*** Execute model? : ", execute_model
176 if( universal_master )
write(*,*)
'xxx No execution. please check PARAM_LAUNCHER. STOP'
182 bulkjob = bulkjob .or. ( num_bulkjob > 1 )
184 global_nprocs = universal_nprocs
187 if ( num_bulkjob == 1 ) num_iteration_bulk = 1
188 num_bulkjob_once = ceiling( real(num_bulkjob) / num_iteration_bulk )
189 if ( mod(universal_nprocs,num_bulkjob_once) /= 0 )
then
190 if( universal_master )
write(*,*)
'xxx Total Num of Processes must be divisible by NUM_BULKJOB/NUM_ITERATION_BULK. Check!'
191 if( universal_master )
write(*,*)
'xxx Total Num of Processes = ', universal_nprocs
192 if( universal_master )
write(*,*)
'xxx NUM_BULKJOB = ', num_bulkjob
193 if( universal_master )
write(*,*)
'xxx NUM_ITERATION_BULK = ', num_iteration_bulk
194 if( universal_master )
write(*,*)
'xxx NUM_BULKJOB / NUM_ITERATION_BULK = ', num_bulkjob_once
198 global_nprocs = universal_nprocs / num_bulkjob_once
200 if( universal_master )
write(*,
'(1x,A,I5)')
"*** TOTAL # of BULK JOBS = ", num_bulkjob
201 if( universal_master )
write(*,
'(1x,A,I5)')
"*** # of BULK JOB for each iteration = ", num_bulkjob_once
202 if( universal_master )
write(*,
'(1x,A,I5)')
"*** Total # of PROCESS = ", universal_nprocs
203 if( universal_master )
write(*,
'(1x,A,I5)')
"*** # of PROCESS of each JOB = ", global_nprocs
205 if ( bulkjob_start_dirnum < 0 )
then
206 if( universal_master )
write(*,*)
'xxx BULKJOB_START_DIRNUM must >=0'
209 if ( bulkjob_start_dirnum + num_bulkjob -1 > 9999 )
then
210 if( universal_master )
write(*,*)
'xxx BULKJOB_START_DIRNUM + NUM_BULKJOB must <= 9999'
214 if ( failure_prc_manage )
then
215 if( universal_master )
write(*,
'(1x,A)')
"*** Available: Failure Process Management"
217 if ( num_fail_tolerance <= 0 )
then
218 if( universal_master )
write(*,*)
'xxx Num of Failure Processes must be positive number. Check!'
219 if( universal_master )
write(*,*)
'xxx NUM_FAIL_TOLERANCE = ', num_fail_tolerance
223 if ( num_fail_tolerance > num_bulkjob )
then
224 write(*,*)
'xxx NUM_FAIL_TOLERANCE is bigger than # of NUM_BLUKJOB'
225 write(*,*)
' set to be: NUM_FAIL_TOLERANCE <= NUM_BLUKJOB'
229 if ( num_domain > 1 )
then
230 if ( freq_fail_check >= 1 .or. num_fail_tolerance /= num_bulkjob )
then
231 write(*,*)
'xxx Full function of FPM is not available with online nesting.'
232 write(*,*)
' You can use this only to avoid job stop until all members finish.'
233 write(*,*)
' for this purpose, set: FREQ_FAIL_CHECK = 0'
234 write(*,*)
' NUM_FAIL_TOLERANCE == NUM_BULKJOB'
240 prc_bulkjob(1:num_bulkjob) = global_nprocs
255 if ( num_domain > 1 )
then
256 if( universal_master )
write(*,
'(1x,A,I5)')
"*** TOTAL DOMAIN NUMBER = ", num_domain
257 if( universal_master )
write(*,
'(1x,A,L5)')
"*** Flag of ABORT ALL JOBS = ", abort_all_jobs
270 call fpm_init( num_fail_tolerance, &
283 do itr = 1, num_iteration_bulk
285 if ( id_bulkjob > num_bulkjob )
exit
287 if( universal_master .and. num_iteration_bulk > 1 )
then
289 write(*,*)
'*** BULK ITERATION COUNT : ', itr,
'/', num_iteration_bulk
293 call io_set_universalrank( universal_myrank, &
298 write(path,
'(I4.4,A)') id_bulkjob + bulkjob_start_dirnum,
"/"
303 if ( execute_preprocess )
then
305 conf_files(id_domain), &
310 if ( execute_model )
then
312 conf_files(id_domain), &
317 id_bulkjob = id_bulkjob + num_bulkjob_once
324 if( universal_master )
write(*,*)
'*** End Launch System for SCALE-RM'