zenobe.cfg.tmpl

# Platform dependent configuration functions for the 'zenobe' machine
# (zenobe.hpc.cenaero.be)

function configure()
{
    # This function should configure all settings/modules needed to
    # later prepare the EC-Earth run directory and set variables used
    # in the run script

    # SCRATCH is not defined by default on this machine, so define it here
    # and also make sure it is defined when compiling
    export SCRATCH=/SCRATCH/acad/ecearth/${USER}
    # Configure paths for building/running EC-Earth
    ecearth_src_dir=[[[PLT:ACTIVE:ECEARTH_SRC_DIR]]]
    run_dir=[[[PLT:ACTIVE:RUN_DIR]]]
    ini_data_dir=[[[PLT:ACTIVE:INI_DATA_DIR]]]
    archive_dir=/SCRATCH/acad/ecearth/${USER}/archive/${exp_name}

    # File for standard output.
    # NOTE: This will be modified for restart jobs!
    stdout_file=${PBS_O_WORKDIR-$PWD}/${PBS_JOBNAME-"local"}_${PBS_JOBID-"id"}.log

    # Resubmit this job for automatic restarts? [true/false]
    # Also, add options for the resubmit command here.
    resubmit_job=[[[PLT:ACTIVE:RESUBMIT_JOB]]]
    resubmit_opt="[[[PLT:ACTIVE:RESUBMIT_OPT]]]"
    # Configure GRIBEX paths
    export LOCAL_DEFINITION_TEMPLATES=[[[PLT:ACTIVE:GRIBEX_DEFINITION_PATH]]]
    export ECMWF_LOCAL_TABLE_PATH=[[[PLT:ACTIVE:GRIBEX_DEFINITION_PATH]]]

    # Configure grib api paths
    export GRIB_DEFINITION_PATH=[[[PLT:ACTIVE:ECEARTH_SRC_DIR]]]/util/grib_table_126:[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_DEFINITION_SUBDIR]]]
    export GRIB_SAMPLES_PATH=[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_SAMPLES_SUBDIR]]]
    export GRIB_BIN_PATH=[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_BIN_SUBDIR]]]

    # Configure number of processors per node
    proc_per_node=[[[PLT:ACTIVE:PROC_PER_NODE]]]
    # Configure and load modules
    pre_load_modules_cmd="[[[PLT:ACTIVE:PRE_LOAD_MODULES_CMD]]]"
    module_list="[[[PLT:ACTIVE:MODULE_LIST]]]"
    if [ -n "${pre_load_modules_cmd}" ]
    then
        ${pre_load_modules_cmd}
    fi
    if [ -n "${module_list}" ]
    then
        module load ${module_list}
    fi

    # Add directories to the shared library search path
    if [ -n "[[[PLT:ACTIVE:ADD_TO_LD_LIBRARY_PATH]]]" ]
    then
        export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}"[[[PLT:ACTIVE:ADD_TO_LD_LIBRARY_PATH]]]"
    fi
    # Use machinefiles or not
    [[ $(echo "$use_machinefile" | tr '[:upper:]' '[:lower:]') == true ]] && use_machinefile=true || use_machinefile=false
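    # Note: any case-insensitive spelling of "true" (TRUE, True, ...) enables the
    # machinefile; every other value, including an empty variable, disables it.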

    ulimit -s unlimited

    # Load specific IMPI environment configuration
    configure_impi
}

function configure_python()
{
    # Specific setup for python + eccodes, used for OSM pre/post-processing.
    # It would be simple to do the following in configure:
    # module load eccodes/2.8.0 python/2.7.13
    module load eccodes/2.8.0 python/2.7.13

    unset GRIB_DEFINITION_PATH
    unset GRIB_SAMPLES_PATH
    unset GRIB_BIN_PATH
    export GRIB_BIN_PATH=/apps/ECCODES/2.8.0/INTEL/bin
}

function configure_impi()
{
    [ -z "${OMP_NUM_THREADS-}" ] && export OMP_NUM_THREADS=1
    export I_MPI_DEBUG=5
}

function get_hosts()
{
    # This function uses a scheduler command to get the hosts allocated for the current job
    hosts=($(scontrol show hostname | paste -s))
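    # The result is one array element per allocated node, e.g. (hypothetical names):
    # hosts=(node101 node102 node103)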
}

function machinefile_config()
{
    # User configuration starts here
    # hard-coded c4mip configurations, must use the proper _numproc settings
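    # The *_ppn values set how many ranks of each component are placed per node;
    # note, for instance, that nem_ppn + xio_ppn = 43 + 5 = 48 (46 + 2 = 48 with TM5),
    # so NEMO and XIOS ranks can fill a 48-core node together.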
    if has_config ifs nemo pisces rnfmapper xios lpjg ; then
        if ! has_config tm5 ; then
            ifs_ppn=48 ; [[ ${ifs_numproc} != 336 ]] && info "wrong numproc setting for ifs in machinefile_config" || true
            nem_ppn=43 ; [[ ${nem_numproc} != 380 ]] && info "wrong numproc setting for nemo in machinefile_config" || true
            xio_ppn=5 ; [[ ${xio_numproc} != 5 ]] && info "wrong numproc setting for xios in machinefile_config" || true
            lpjg_ppn=5 ; [[ ${lpjg_numproc} != 40 ]] && info "wrong numproc setting for lpjg in machinefile_config" || true
        else
            ifs_ppn=48 ; [[ ${ifs_numproc} != 256 ]] && info "wrong numproc setting for ifs in machinefile_config" || true
            nem_ppn=46 ; [[ ${nem_numproc} != 192 ]] && info "wrong numproc setting for nemo in machinefile_config" || true
            xio_ppn=2 ; [[ ${xio_numproc} != 2 ]] && info "wrong numproc setting for xios in machinefile_config" || true
            lpjg_ppn=2 ; [[ ${lpjg_numproc} != 8 ]] && info "wrong numproc setting for lpjg in machinefile_config" || true
            tm5_ppn=4 ; [[ ${tm5_numproc} != 4 ]] && info "wrong numproc setting for tm5 in machinefile_config" || true
        fi
    else
        # Add any new exclusive binary here
        ifs_exc=TRUE
        nem_exc=TRUE
        xio_exc=TRUE
        lpjg_exc=TRUE
        tm5_exc=TRUE

        # Modify the allocation to each binary using more than one process here
        ifs_ppn=48
        nem_ppn=48
        xio_ppn=48
        lpjg_ppn=48
        tm5_ppn=45
    fi
}

function machinefile_init()
{
    # Get max processes per node from the platform variable
    max_ppn=$proc_per_node

    components=( ifs nem xio rnf amip lpjg )
    if has_config tm5
    then
        components=( "${components[@]}" "tm5" )
    fi

    # Default for every component: not exclusive, one process per node
    # (machinefile_config then overrides these)
    for component in "${components[@]}"
    do
        eval ${component}_exc=FALSE
        eval ${component}_ppn=1
    done

    # Call user configuration and get_hosts functions
    machinefile_config
    get_hosts

    # Declare array to store the processes as they are assigned
    declare -a -g processes_hosts
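    # processes_hosts[i] holds the number of MPI ranks already placed on hosts[i]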
    for n in $(seq 0 ${#hosts[@]})
    do
        processes_hosts[$n]=0
    done

    # Create an empty machinefile
    > machinefile
    current_hostid=0
}

machinefile_find_available_node()
{
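    # Advance current_hostid until the current host has room for ${!ppn} more
    # ranks; ppn is set by the caller (machinefile_add) to e.g. ifs_ppn or nem_ppn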
    while [ $((${processes_hosts[$current_hostid]} + ${!ppn})) -gt $max_ppn ]
    do
        let "current_hostid += 1"
    done
}

machinefile_add()
{
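    # Arguments: $1 = executable as ./name, $2 = total number of MPI ranks to place for it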
    total_proc=$2

    # Iterate through all the possible binaries
    for component in "${components[@]}"
    do
        binary="${component}_exe_file"
        exclusive="${component}_exc"

        # Check if the current binary matches the input executable
        if [ ./$(basename ${!binary}) = "$1" ]
        then
            ppn="${component}_ppn"

            # Exclusive mode: start allocation at the first empty node
            if [[ ${!exclusive} == "TRUE" ]]
            then
                while [ ${processes_hosts[$current_hostid]} -gt 0 ]
                do
                    let "current_hostid += 1"
                done
            # Shared mode: start allocation in the first node with enough free cores.
            # Note that only the first node is checked, so if a previous binary had
            # "exc=TRUE", allocation space is not ensured in subsequent nodes.
            else
                current_hostid=0
                machinefile_find_available_node
            fi

            # Allocate ppn cores in each of the subsequent nodes till there are no more processes to assign
            count=0
            while [ ${total_proc} -gt 0 ]
            do
                if [ ${current_hostid} -ge ${#hosts[@]} ]
                then
                    echo "Not enough computing nodes"
                    exit 1
                fi
                current_hostname=${hosts[$current_hostid]}
                while [[ ${total_proc} -gt 0 && ${count} -lt ${!ppn} ]]
                do
                    echo ${hosts[$current_hostid]} >> machinefile
                    let "count += 1"
                    let "processes_hosts[$current_hostid] += 1"
                    let "total_proc -= 1" || true
                done
                if [ ${count} -eq ${!ppn} ]
                then
                    let "current_hostid += 1"
                    machinefile_find_available_node
                    count=0
                fi
            done
        fi
    done
}
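
# Illustration of machinefile_add above (hypothetical 2-node job, 48 cores per node):
# "machinefile_add ./ifsmaster 96" with ifs_ppn=48 appends the first host name 48 times
# and then the second host name 48 times to ./machinefile, so "mpirun -machinefile machinefile"
# places ranks 0-47 on the first node and ranks 48-95 on the second.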

function launch()
{
    cmd="mpirun"

    if [ "$use_machinefile" = "true" ]
    then
        cmd="mpirun -machinefile machinefile"
        machinefile_init
    fi

    while (( "$#" ))
    do
        # Get number of MPI ranks and executable name
        nranks=$1
        executable=./$(basename $2)

        if [ "$use_machinefile" = "true" ]
        then
            machinefile_add $executable $nranks
        fi

        shift
        shift

        cmd+=" -np $nranks $executable"

        # Add any arguments to executable
        while (( "$#" )) && [ "$1" != "--" ]
        do
            cmd+=" $1"
            shift
        done
        shift || true

        # Add colon if more executables follow
        (( "$#" )) && cmd+=" :"
    done
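
    # Illustration (hypothetical arguments): "launch 336 ifsmaster -v ecmwf -e ECE3 -- 380 nemo.exe"
    # builds: mpirun -np 336 ./ifsmaster -v ecmwf -e ECE3 : -np 380 ./nemo.exe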

    #export OMP_NUM_THREADS=1
    #export I_MPI_ADJUST_BCAST=3
    #export PSM2_MTU=8196
    #export PSM2_MEMORY=large
    #export PSM2_MQ_RNDV_HFI_THRESH=1
    #export I_MPI_DEBUG=5
    #export I_MPI_FABRIC=tmi

    pwd
    echo $cmd
    #exit
    $cmd
}

function finalise()
{
    # This function should execute any post-run functionality, e.g.
    # platform dependent cleaning or a resubmit
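
    # Resubmit only while the end date of the current leg is still before the
    # end date of the whole run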
    if ${resubmit_job} && [ $(date -d "${leg_end_date}" +%s) -lt $(date -d "${run_end_date}" +%s) ]
    then
        info "Resubmitting job for leg $((leg_number+1))"

        # Go to the submit dir
        cd ${PBS_O_WORKDIR} # same as ${start_dir}

        # Patch the job script with sed: point the PBS output file to the next
        # leg's log and switch special_restart off before resubmitting
        log=$(basename ${stdout_file}).$(printf %03d $((leg_number+1)))
        cp ./${PBS_JOBNAME} ./${PBS_JOBNAME}.$$
        sed "s:#PBS -o out/.*:#PBS -o out/${log}:" \
            <./${PBS_JOBNAME}.$$ \
            >./${PBS_JOBNAME}
        cp -f ./${PBS_JOBNAME} ./${PBS_JOBNAME}.$$
        sed "s:special_restart=true:special_restart=false:" \
            <./${PBS_JOBNAME}.$$ \
            >./${PBS_JOBNAME}
        \rm -f ./${PBS_JOBNAME}.$$

        # Submit command
        set -x
        qsub ./${PBS_JOBNAME}
        set +x
    else
        info "Not resubmitting."
    fi
}