- # Platform-dependent configuration functions for the 'HPC2020' cluster
- # (ECMWF, BOLOGNA)
- function configure()
- {
- # This function should configure all settings/modules needed to
- # later prepare the EC-Earth run directory and set variables used
- # in the run script
- # Configure paths for building/running EC-Earth
- ecearth_src_dir=[[[PLT:ACTIVE:ECEARTH_SRC_DIR]]]
- run_dir=[[[PLT:ACTIVE:RUN_DIR]]]
- ini_data_dir=[[[PLT:ACTIVE:INI_DATA_DIR]]]
- # File for standard output.
- # NOTE: This will be modified for restart jobs!
- stdout_file=${start_dir}/out/[[[MOD:GENERAL:EXP_NAME]]].$$.out
- # Resubmit this job for automatic restarts? [true/false]
- # Also, add options for the resubmit command here.
- resubmit_job=[[[PLT:ACTIVE:RESUBMIT_JOB]]]
- resubmit_opt="[[[PLT:ACTIVE:RESUBMIT_OPT]]]"
- # Configure ecCodes / GRIB API paths
- export ECCODES_DEFINITION_PATH=[[[PLT:ACTIVE:ECEARTH_SRC_DIR]]]/util/grib_table_126:[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_DEFINITION_SUBDIR]]]
- export ECCODES_SAMPLES_PATH=[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_SAMPLES_SUBDIR]]]
- export GRIB_BIN_PATH=[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_BIN_SUBDIR]]]
- # Configure number of processors per node
- proc_per_node=[[[PLT:ACTIVE:PROC_PER_NODE]]]
- # Configure and load modules
- pre_load_modules_cmd="[[[PLT:ACTIVE:PRE_LOAD_MODULES_CMD]]]"
- module_list="[[[PLT:ACTIVE:MODULE_LIST]]]"
- if [ -n "${module_list}" ]
- then
- set +u
- if [ -n "${pre_load_modules_cmd}" ]
- then
- ${pre_load_modules_cmd}
- fi
- for m in ${module_list}
- do
- module load $m
- done
- set -u
- fi
-
- # Add directories to the shared library search path
- if [ -n "[[[PLT:ACTIVE:ADD_TO_LD_LIBRARY_PATH]]]" ]
- then
- export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}"[[[PLT:ACTIVE:ADD_TO_LD_LIBRARY_PATH]]]"
- fi
- # Use a machinefile or not (normalise the flag to lowercase true/false)
- [[ $(echo "$use_machinefile" | tr '[:upper:]' '[:lower:]') == true ]] && use_machinefile=true || use_machinefile=false
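- # One OpenMP thread per CPU allocated by SLURM to each task (defaults to 1 if unset)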
- export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
- ulimit -s unlimited
- ulimit -n 2048
- ulimit -c unlimited
- ulimit -a
- }
- machinefile_config()
- {
- # LPJG is memory hungry (limit is somewhere between 24 and 32 cores per node)
- lpjg_exc=true
- lpjg_maxppn=10
- # Let only nemo/xios/runoff_mapper share nodes:
- tm5_exc=true
- ifs_exc=true
-
- # # Enforced C-cycle configurations
- # if has_config ifs nemo pisces rnfmapper xios lpjg
- # then
- # if ! has_config tm5
- # then
- # # c-driven
- # ...
- # else
- # # e-driven
- # ...
- # fi
- # fi
- }
- machinefile_init()
- {
- # Get max processes per node from the platform variable
- max_ppn=$proc_per_node
- components=( ifs nem xio rnf amip lpjg tm5 )
-
- # Default to sharing nodes and to using between 1 and all procs on a node
- for component in ${components[@]}
- do
- eval ${component}_exc=false
- eval ${component}_minppn=1
- eval ${component}_maxppn=$max_ppn
- done
-
- # Call user configuration
- machinefile_config
- # List of hosts allocated for the current job
- hosts=($(scontrol show hostname | paste -s))
- nhosts=${#hosts[@]}
- nhosts0=$(( ${#hosts[@]} - 1 ))
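- # Append a sentinel entry one past the last real host; machinefile_add raises an error if the node search ever reaches it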
- hosts[$nhosts]=dummy
-
- # Array to store processes as they are assigned
- declare -a -g processes_hosts
- declare -Ag has_exclusive
- for n in $(seq 0 $nhosts)
- do
- processes_hosts[$n]=0
- has_exclusive[${hosts[$n]}]=false
- done
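- # Start from an empty machinefile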
- > machinefile
- }
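- # Find the first node that has no processes assigned to it yet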
- find_empty_node()
- {
- current_hostid=0
- while (( ${processes_hosts[$current_hostid]} > 0 ))
- do
- (( current_hostid += 1 ))
- done
- }
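- # Find the first node that is not reserved exclusively and still has room for at least the component's minimum number of processes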
- find_unfilled_shared_node()
- {
- current_hostid=0
- h=${hosts[$current_hostid]}
- while (( ${processes_hosts[$current_hostid]} + ${!minppn} > $max_ppn )) || ${has_exclusive[$h]}
- do
- (( current_hostid += 1 ))
- h=${hosts[$current_hostid]}
- done
- }
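- # machinefile_add <executable> <nranks>
- # Append <nranks> entries for <executable> to the machinefile, respecting the
- # per-component exclusivity and min/max processes-per-node settings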
- machinefile_add()
- {
- total_proc=$2
-
- # Iterate through all the possible binaries
- for component in ${components[@]}
- do
- binary="${component}_exe_file"
- exclusive="${component}_exc"
- minppn="${component}_minppn"
- maxppn="${component}_maxppn"
-
- # Check if the current binary matches the input executable
- if [ "./$(basename ${!binary})" = "$1" ]
- then
-
- # Allocate up to maxppn cores on each subsequent node until there are no more processes to assign
- while [ ${total_proc} -gt 0 ]
- do
- ${!exclusive} && find_empty_node || find_unfilled_shared_node
- [[ ${current_hostid} -gt $nhosts0 ]] && error "Not enough computing nodes"
- current_hostname=${hosts[$current_hostid]}
- nodecount=${processes_hosts[$current_hostid]}
- modelcount=0
- ${!exclusive} && has_exclusive[$current_hostname]=true
-
- while [[ ${total_proc} -gt 0 && $modelcount -lt ${!maxppn} && $nodecount -lt $max_ppn ]]
- do
- echo ${hosts[$current_hostid]} >> machinefile
- (( modelcount += 1 ))
- (( nodecount += 1 ))
- let "processes_hosts[$current_hostid] += 1"
- let "total_proc -= 1" || true
- done
- done
- fi
- done
- }
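- # Compare the number of nodes actually used in the machinefile with the SLURM allocation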
- check_used_nodes()
- {
- local id nnodes
- nnodes=0
- for id in $(seq 0 $nhosts0)
- do
- (( ${processes_hosts[$id]} > 0 )) && (( nnodes+=1 ))
- done
-
- if (( $SLURM_NNODES > $nnodes ))
- then
- error "Too many NODES allocated, resubmit with only $nnodes nodes"
- fi
- }
- launch()
- {
- # Launch all components with a single srun call, with or without a machinefile
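- # Usage: launch <nranks> <executable> [args ...] [-- <nranks> <executable> [args ...]] ...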
- if $use_machinefile
- then
- machinefile_init
- export SLURM_HOSTFILE=machinefile
- fi
-
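- # Build the srun --multi-prog configuration file (conf.txt): one line per executable,
- # of the form '<first task>-<last task> <executable> [args]'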
- rm -f conf.txt
- _task1=-1
- NBTASKS=0
- while (( "$#" ))
- do
- nranks=$1
- executable=./$(basename $2)
- shift
- shift
- $use_machinefile && machinefile_add $executable $nranks
-
- _task0=$((_task1+1))
- _task1=$((_task0+nranks-1))
- cmd="${_task0}-${_task1} ${executable}"
- NBTASKS=$((NBTASKS+nranks))
-
- while (( "$#" )) && [ "$1" != "--" ]
- do
- cmd+=" $1"
- shift
- done
- echo ${cmd} >>conf.txt
- shift || true
- done
-
- echo '-------A-conf.txt------'
- cat conf.txt
- echo '-------E-conf.txt------'
- if $use_machinefile
- then
- echo '-------A-machinefile------'
- cat machinefile
- echo '-------E-machinefile------'
- check_used_nodes
- fi
-
- cmd="srun --kill-on-bad-exit=1"
- CONF_FILE=conf.txt
- echo "$cmd --ntasks=$NBTASKS -l --multi-prog $CONF_FILE"
- /usr/bin/time $cmd --ntasks=$NBTASKS -l --multi-prog $CONF_FILE
- }
- finalise()
- {
- # This function should execute any post-run functionality, e.g.
- # platform-dependent cleaning or a resubmit
- if ${resubmit_job} && [ $(date -d "${leg_end_date}" +%s) -lt $(date -d "${run_end_date}" +%s) ]
- then
- info "Resubmitting job for leg $((leg_number+1))"
-
- # Need to go to start_dir to find the run script
- cd ${start_dir}
- mkdir -p out
- unset SLURM_HOSTFILE
- # Resubmit the same script, overwriting only the log filename.
- # Note: this does not work if you specify a job name with sbatch -J jobname!
- sbatch \
- -o out/${exp_name}.$(printf %03d $((leg_number+1))) \
- -e out/${exp_name}.$(printf %03d $((leg_number+1))) \
- ${resubmit_opt} \
- ./${SLURM_JOB_NAME}
- fi
- }
- postprocess()
- {
- local cwd=$PWD
- local islast=$(( $(date -ud "${leg_end_date}" +%s) == $(date -ud "${run_end_date}" +%s) ))
- local mess prev_leg_nb model
- # Trigger cmorization of the previous leg, and of the current leg as well if this is the last one.
- cd WHERE-SCRIPT-TO-TRIGGER-ECE2CMOR3-RESIDES #ADAPT
- if (( leg_number > 1 )) || (( islast ))
- then
- prev_leg_nb=$(( leg_number - 1 ))
- for model in ifs nemo tm5 lpjg
- do
- if [ ${prev_leg_nb} -gt 0 ]
- then
- mess=${exp_name}-${model}-$(printf %03d ${prev_leg_nb})
- info "Submit cmorization $mess"
- sbatch --output=log/${exp_name}/${mess}.out --job-name=$mess SUBCMORSCRIPT.sh ${exp_name} ${model} ${prev_leg_nb} #ADAPT
- fi
- if (( islast ))
- then
- mess=${exp_name}-${model}-$(printf %03d ${leg_number})
- info "Submit cmorization $mess"
- sbatch --output=log/${exp_name}/${mess}.out --job-name=$mess SUBCMORSCRIPT.sh ${exp_name} ${model} ${leg_number} #ADAPT
- fi
- done
- fi
- cd ${cwd}
- }