# Platform dependent configuration functions for the 'HPC2020' cluster
# (ECMWF, BOLOGNA)

function configure()
{
    # This function should configure all settings/modules needed to
    # later prepare the EC-Earth run directory and set variables used
    # in the run script

    # Configure paths for building/running EC-Earth
    ecearth_src_dir=[[[PLT:ACTIVE:ECEARTH_SRC_DIR]]]
    run_dir=[[[PLT:ACTIVE:RUN_DIR]]]
    ini_data_dir=[[[PLT:ACTIVE:INI_DATA_DIR]]]

    # File for standard output.
    # NOTE: This will be modified for restart jobs!
    stdout_file=${start_dir}/out/[[[MOD:GENERAL:EXP_NAME]]].$$.out

    # Resubmit this job for automatic restarts? [true/false]
    # Also, add options for the resubmit command here.
    resubmit_job=[[[PLT:ACTIVE:RESUBMIT_JOB]]]
    resubmit_opt="[[[PLT:ACTIVE:RESUBMIT_OPT]]]"

    # Configure grib api paths
    export ECCODES_DEFINITION_PATH=[[[PLT:ACTIVE:ECEARTH_SRC_DIR]]]/util/grib_table_126:[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_DEFINITION_SUBDIR]]]
    export ECCODES_SAMPLES_PATH=[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_SAMPLES_SUBDIR]]]
    export GRIB_BIN_PATH=[[[PLT:ACTIVE:GRIBAPI_BASE_DIR]]]/[[[PLT:ACTIVE:GRIBAPI_BIN_SUBDIR]]]

    # Configure number of processors per node
    proc_per_node=[[[PLT:ACTIVE:PROC_PER_NODE]]]

    # Configure and load modules
    pre_load_modules_cmd="[[[PLT:ACTIVE:PRE_LOAD_MODULES_CMD]]]"
    module_list="[[[PLT:ACTIVE:MODULE_LIST]]]"
    if [ -n "${module_list}" ]
    then
        set +u
        if [ -n "${pre_load_modules_cmd}" ]
        then
            ${pre_load_modules_cmd}
        fi
        for m in "${module_list}"
        do
            module load $m
        done
        set -u
    fi

    # Add directories to the shared library search path
    if [ -n "[[[PLT:ACTIVE:ADD_TO_LD_LIBRARY_PATH]]]" ]
    then
        export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}"[[[PLT:ACTIVE:ADD_TO_LD_LIBRARY_PATH]]]"
    fi

    # Use machinefiles or not
    [[ `echo "$use_machinefile" | tr '[:upper:]' '[:lower:]'` == true ]] && use_machinefile=true || use_machinefile=false

    export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}

    ulimit -s unlimited
    ulimit -n 2048
    ulimit -c unlimited
    ulimit -a
}

machinefile_config()
{
    # LPJG is memory hungry (limit is somewhere between 24 and 32 cores per node)
    lpjg_exc=true
    lpjg_maxppn=10

    # Let only nemo/xios/runoff_mapper share nodes:
    tm5_exc=true
    ifs_exc=true

    # # Enforced C-cycle configurations
    # if has_config ifs nemo pisces rnfmapper xios lpjg
    # then
    #     if ! has_config tm5
    #     then
    #         # c-driven
    #         ...
    #     else
    #         # e-driven
    #         ...
    #     fi
    # fi
}
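
# Illustrative only: node placement is steered per component through the
# <component>_exc (exclusive nodes), <component>_minppn and <component>_maxppn
# variables that machinefile_init() initialises and machinefile_config() may
# override. The settings below are a hypothetical sketch, not recommended
# values for HPC2020:
#
#   nem_exc=true      # give NEMO dedicated nodes
#   nem_maxppn=64     # and place at most 64 NEMO ranks on each of them
#   xio_minppn=4      # only put XIOS on shared nodes with at least 4 free cores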

machinefile_init()
{
    # Get max processes per node from the platform variable
    max_ppn=$proc_per_node

    components=( ifs nem xio rnf amip lpjg tm5 )

    # Default to sharing nodes and to using between 1 and all procs on a node
    for component in ${components[@]}
    do
        eval ${component}_exc=false
        eval ${component}_minppn=1
        eval ${component}_maxppn=$max_ppn
    done

    # Call user configuration
    machinefile_config

    # List of hosts allocated for the current job
    hosts=(`scontrol show hostname | paste -s`)
    nhosts=${#hosts[@]}
    nhosts0=$(( ${#hosts[@]} - 1 ))
    hosts[$nhosts]=dummy   # sentinel; placement on it triggers the "Not enough computing nodes" error

    # Array to store processes as they are assigned
    declare -a -g processes_hosts
    declare -Ag has_exclusive
    for n in `seq 0 $nhosts`
    do
        processes_hosts[$n]=0
        has_exclusive[${hosts[$n]}]=false
    done > machinefile   # the redirection also creates/empties the machinefile
}

find_empty_node()
{
    # Set current_hostid to the first node with no processes assigned yet
    current_hostid=0
    while (( ${processes_hosts[$current_hostid]} > 0 ))
    do
        (( current_hostid += 1 ))
    done
}

find_unfilled_shared_node()
{
    # Set current_hostid to the first non-exclusive node that can still take
    # at least ${!minppn} more processes (minppn is set by machinefile_add)
    current_hostid=0
    h=${hosts[$current_hostid]}
    while (( ${processes_hosts[$current_hostid]} + ${!minppn} > $max_ppn )) || ${has_exclusive[$h]}
    do
        (( current_hostid += 1 ))
        h=${hosts[$current_hostid]}
    done
}

machinefile_add()
{
    total_proc=$2

    # Iterate through all the possible binaries
    for component in ${components[@]}
    do
        binary="${component}_exe_file"
        exclusive="${component}_exc"
        minppn="${component}_minppn"
        maxppn="${component}_maxppn"

        # Check if the current binary matches the input executable
        if [ ./$(basename ${!binary}) = "$1" ]
        then
            # Allocate up to maxppn cores on each subsequent node until there
            # are no more processes to assign
            while [ ${total_proc} -gt 0 ]
            do
                ${!exclusive} && find_empty_node || find_unfilled_shared_node

                [[ ${current_hostid} -gt $nhosts0 ]] && error "Not enough computing nodes"

                current_hostname=${hosts[$current_hostid]}
                nodecount=${processes_hosts[$current_hostid]}
                modelcount=0
                ${!exclusive} && has_exclusive[$current_hostname]=true

                while [[ ${total_proc} -gt 0 && $modelcount -lt ${!maxppn} && $nodecount -lt $max_ppn ]]
                do
                    echo ${hosts[$current_hostid]} >> machinefile
                    (( modelcount += 1 ))
                    (( nodecount += 1 ))
                    let "processes_hosts[$current_hostid] += 1"
                    let "total_proc -= 1" || true   # "|| true": let returns non-zero when the result is 0
                done
            done
        fi
    done
}

check_used_nodes()
{
    # Abort if the job was allocated more nodes than the machinefile actually uses
    local id nnodes
    nnodes=0
    for id in `seq 0 $nhosts0`
    do
        (( ${processes_hosts[$id]} > 0 )) && (( nnodes+=1 ))
    done
    if (( $SLURM_NNODES > $nnodes ))
    then
        error "Too many NODES allocated, resubmit with only $nnodes nodes"
    fi
}

launch()
{
    # Version using srun, with or without a machinefile
    if $use_machinefile
    then
        machinefile_init
        export SLURM_HOSTFILE=machinefile
    fi

    rm -f conf.txt

    # Build one conf.txt line per executable group: "<first task>-<last task> <exe> [args]"
    _task1=-1
    NBTASKS=0
    while (( "$#" ))
    do
        nranks=$1
        executable=./$(basename $2)
        shift
        shift

        $use_machinefile && machinefile_add $executable $nranks

        _task0=$((_task1+1))
        _task1=$((_task0+nranks-1))
        cmd="${_task0}-${_task1} ${executable}"

        NBTASKS=$((NBTASKS+nranks))

        while (( "$#" )) && [ "$1" != "--" ]
        do
            cmd+=" $1"
            shift
        done
        echo ${cmd} >>conf.txt
        shift || true
    done

    echo '-------A-conf.txt------'
    cat conf.txt
    echo '-------E-conf.txt------'

    if $use_machinefile
    then
        echo '-------A-machinefile------'
        cat machinefile
        echo '-------E-machinefile------'
        check_used_nodes
    fi

    # Launch all binaries in a single MPMD srun call
    cmd="srun --kill-on-bad-exit=1"
    CONF_FILE=conf.txt

    echo "$cmd --ntasks=$NBTASKS -l --multi-prog $CONF_FILE"
    /usr/bin/time $cmd --ntasks=$NBTASKS -l --multi-prog $CONF_FILE
}
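
# Usage sketch (illustrative): launch() expects groups of
# "<nranks> <executable> [args ...]" separated by "--", which is what the
# conf.txt parsing loop above consumes. The *_numproc variable names below are
# hypothetical; only the <component>_exe_file names are assumed elsewhere
# (by machinefile_add).
#
#   launch \
#       ${ifs_numproc} ${ifs_exe_file} [ifs args ...] -- \
#       ${nem_numproc} ${nem_exe_file} -- \
#       ${xio_numproc} ${xio_exe_file}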

finalise()
{
    # This function should execute any post-run functionality, e.g.
    # platform dependent cleaning or a resubmit

    if ${resubmit_job} && [ $(date -d "${leg_end_date}" +%s) -lt $(date -d "${run_end_date}" +%s) ]
    then
        info "Resubmitting job for leg $((leg_number+1))"
        # Need to go to start_dir to find the run script
        cd ${start_dir}
        mkdir -p out
        unset SLURM_HOSTFILE
        # Resubmit the same script, overwriting only the log filename.
        # Note: This does not work if you specify a job name with sbatch -J jobname!
        sbatch \
            -o out/${exp_name}.$(printf %03d $((leg_number+1))) \
            -e out/${exp_name}.$(printf %03d $((leg_number+1))) \
            ${resubmit_opt} \
            ./${SLURM_JOB_NAME}
    fi
}

postprocess()
{
    local cwd=$PWD
    local islast=$(( $(date -ud "${leg_end_date}" +%s) == $(date -ud "${run_end_date}" +%s) ))
    local mess prev_leg_nb model

    # Trigger the cmorization of the previous leg, and of the current leg if this is the last one.
    cd WHERE-SCRIPT-TO-TRIGGER-ECE2CMOR3-RESIDES #ADAPT

    if (( leg_number > 1 )) || (( islast ))
    then
        prev_leg_nb=$(( leg_number - 1 ))
        for model in ifs nemo tm5 lpjg
        do
            if [ ${prev_leg_nb} -gt 0 ]
            then
                mess=${exp_name}-${model}-$(printf %03d ${prev_leg_nb})
                info "Submit cmorization $mess"
                sbatch --output=log/${exp_name}/${mess}.out --job-name=$mess SUBCMORSCRIPT.sh ${exp_name} ${model} ${prev_leg_nb} #ADAPT
            fi
            if (( islast ))
            then
                mess=${exp_name}-${model}-$(printf %03d ${leg_number})
                info "Submit cmorization $mess"
                sbatch --output=log/${exp_name}/${mess}.out --job-name=$mess SUBCMORSCRIPT.sh ${exp_name} ${model} ${leg_number} #ADAPT
            fi
        done
    fi

    cd ${cwd}
}
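
# Submission note (illustrative): finalise() resubmits "./${SLURM_JOB_NAME}",
# so the initial job should be submitted from start_dir without "sbatch -J"
# (otherwise SLURM_JOB_NAME no longer matches the script name). A hypothetical
# first submission, with example experiment and run script names:
#
#   sbatch -o out/EXP.000 -e out/EXP.000 ./my-runscript.sh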