#!/bin/bash
# Platform dependent configuration functions for the 'zenobe' machine
# (zenobe.hpc.cenaero.be)
function configure()
{
    # Configure all settings/modules needed to later prepare the EC-Earth
    # run directory, and set the variables used in the run script.
    #
    # Reads:  USER, HOME, PWD, exp_name, MODULEPATH, EBROOTECCODES (set by
    #         the ecCodes module), and optionally use_machinefile and the
    #         SLURM_SUBMIT_DIR / SLURM_JOB_NAME / SLURM_JOB_ID variables.
    # Sets:   SCRATCH, ecearth_src_dir, run_dir, ini_data_dir, archive_dir,
    #         stdout_file, resubmit_job, resubmit_opt, proc_per_node,
    #         use_machinefile, GRIB_DEFINITION_PATH, GRIB_SAMPLES_PATH,
    #         GRIB_BIN_PATH.
    # Calls:  module, ulimit, configure_mpi.

    # Define SCRATCH here so that it is also defined when compiling
    export SCRATCH=/gpfs/scratch/acad/ecearth/${USER}

    # Configure paths for building/running EC-Earth
    ecearth_src_dir=${HOME}/models/ecearth_3.3.4.2/sources
    run_dir=${SCRATCH}/ecearth/run/${exp_name}
    ini_data_dir=/gpfs/scratch/acad/ecearth/data/bsc32/v3.3.4/inidata
    archive_dir=${SCRATCH}/ecearth/archive/${exp_name}

    # File for standard output.
    # NOTE: This will be modified for restart jobs!
    stdout_file=${SLURM_SUBMIT_DIR-$PWD}/${SLURM_JOB_NAME-local}_${SLURM_JOB_ID-id}.log

    # Resubmit this job for automatic restarts? [true/false]
    # Also, add options for the resubmit command here.
    resubmit_job=true
    resubmit_opt=""

    module load EasyBuild/2023a
    export MODULEPATH=$MODULEPATH:/gpfs/projects/acad/ecearth/softs/easybuild/2023a/modules/all
    module load netCDF-Fortran/4.6.1-iompi-2023a
    module load CDO/2.2.2-iompi-2023a
    module load NCO/5.1.3-iomkl-2023a
    module load ecCodes/2.31.0-iompi-2023a

    # Configure grib api paths
    export GRIB_DEFINITION_PATH=${ecearth_src_dir}/util/grib_table_126:${EBROOTECCODES}/share/eccodes/definitions
    export GRIB_SAMPLES_PATH=${EBROOTECCODES}/share/eccodes/ifs_samples/grib1
    export GRIB_BIN_PATH=${EBROOTECCODES}/bin

    # Configure number of processors per node
    proc_per_node=128

    # Use machinefiles or not: normalise any spelling of "true" to true,
    # everything else (including an unset variable) to false.
    local machinefile_request=${use_machinefile-}
    if [[ "${machinefile_request,,}" == true ]]
    then
        use_machinefile=true
    else
        use_machinefile=false
    fi

    ulimit -s unlimited

    # Load specific MPI environment configuration
    configure_mpi
}
function configure_python()
{
    # Python + ecCodes environment, used for OSM pre/post-processing.
    # (The same module load could simply be done in configure instead.)
    module load eccodes/2.8.0 python/2.7.13

    # Discard any GRIB settings currently in the environment ...
    unset GRIB_DEFINITION_PATH GRIB_SAMPLES_PATH GRIB_BIN_PATH

    # ... and export only the location of the ecCodes 2.8.0 binaries.
    export GRIB_BIN_PATH=/apps/ECCODES/2.8.0/INTEL/bin
}
function configure_mpi()
{
    # Set up the MPI/OpenMP runtime environment.
    #
    # Exports OMP_NUM_THREADS=1 unless the variable already holds a
    # non-empty value. Always returns 0.
    #
    # An explicit `if` is used on purpose: with `[ test ] && export ...` as
    # the last statement, the function would return 1 whenever
    # OMP_NUM_THREADS is already set, which aborts a caller running under
    # `set -e`.
    if [[ -z "${OMP_NUM_THREADS-}" ]]
    then
        export OMP_NUM_THREADS=1
    fi

    # MPI tuning variables kept for reference (all disabled):
    #export I_MPI_DEBUG=5
    #export I_MPI_ADJUST_BCAST=3
    #export PSM2_MTU=8196
    #export PSM2_MEMORY=large
    #export PSM2_MQ_RNDV_HFI_THRESH=1
    #export I_MPI_FABRIC=tmi
    #unset I_MPI_PMI_LIBRARY
    #export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
    #export I_MPI_FABRICS=shm:ofi
}
function get_hosts()
{
    # Ask the scheduler for the hosts allocated to the current job and
    # store them, one array element per host, in the global `hosts`.
    local host_line

    # One tab-separated line holding every host name
    host_line=$(scontrol show hostname | paste -s)

    # Intentionally unquoted: word splitting turns the line into elements
    hosts=( ${host_line} )
}
function machinefile_config()
{
    # User configuration of the machinefile layout.
    #
    # Sets the <component>_ppn variables (processes placed per node) and,
    # for the generic case, the <component>_exc flags (TRUE = the binary
    # starts on an empty node). Uses has_config to select the layout and
    # info to report a *_numproc value that does not match the hard-coded
    # c4mip layout.
    local entry comp cores expected label numproc_ref
    local -a c4mip_layout

    if has_config ifs nemo pisces rnfmapper xios lpjg
    then
        # Hard-coded c4mip configurations, must use the proper _numproc
        # settings. Entry format:
        #   <component>:<procs per node>:<expected numproc>:<name in message>
        if has_config tm5
        then
            c4mip_layout=( ifs:48:256:ifs nem:46:192:nemo xio:2:2:xios lpjg:2:8:lpjg tm5:4:4:tm5 )
        else
            c4mip_layout=( ifs:48:336:ifs nem:43:380:nemo xio:5:5:xios lpjg:5:40:lpjg )
        fi

        for entry in "${c4mip_layout[@]}"
        do
            IFS=: read -r comp cores expected label <<< "${entry}"
            printf -v "${comp}_ppn" '%s' "${cores}"

            # Only warn on a mismatch; the layout is applied regardless
            numproc_ref="${comp}_numproc"
            if [[ ${!numproc_ref} != "${expected}" ]]
            then
                info "wrong numproc setting for ${label} in machinefile_config" || true
            fi
        done
    else
        # Generic case. Add any new exclusive binary here, and modify the
        # allocation to each binary using more than one process here.
        ifs_exc=TRUE;  ifs_ppn=48
        nem_exc=TRUE;  nem_ppn=48
        xio_exc=TRUE;  xio_ppn=48
        lpjg_exc=TRUE; lpjg_ppn=48
        tm5_exc=TRUE;  tm5_ppn=45
    fi
}
function machinefile_init()
{
    # Initialise the state used to build the MPI machinefile.
    #
    # Reads:  proc_per_node
    # Sets:   max_ppn, components, <component>_exc, <component>_ppn,
    #         hosts (via get_hosts), processes_hosts, current_hostid
    # Calls:  has_config, machinefile_config, get_hosts
    # Files:  creates/truncates ./machinefile
    local component host_index

    # Get max processes per node from the platform variable
    max_ppn=$proc_per_node

    components=( ifs nem xio rnf amip lpjg )
    # Call has_config directly: running its output through $( ) would try
    # to execute whatever the helper prints.
    if has_config tm5
    then
        components+=( tm5 )
    fi

    # Defaults: shared nodes, one process per node
    for component in "${components[@]}"
    do
        printf -v "${component}_exc" '%s' FALSE
        printf -v "${component}_ppn" '%s' 1
    done

    # Call user configuration and get_host functions
    machinefile_config
    get_hosts

    # Array storing the number of processes assigned to each host so far.
    # It deliberately gets one slot more than there are hosts: the
    # node-search loops may read the entry just past the last host, and
    # that entry must exist and be 0.
    declare -a -g processes_hosts
    for (( host_index = 0; host_index <= ${#hosts[@]}; host_index++ ))
    do
        processes_hosts[host_index]=0
    done

    : > machinefile

    current_hostid=0
}
machinefile_find_available_node()
{
    # Move current_hostid forward until the node it points at can take
    # another block of processes, i.e. until the processes already placed
    # there plus the per-node count named by `ppn` fit within max_ppn.
    # `ppn` holds the NAME of the variable with that count, hence ${!ppn}.
    while (( ${processes_hosts[$current_hostid]} + ${!ppn} > ${max_ppn} ))
    do
        current_hostid=$(( current_hostid + 1 ))
    done
}
machinefile_add()
{
    # Append the hosts for one executable to ./machinefile.
    #
    # Arguments:
    #   $1 - executable, in the form ./<basename>
    #   $2 - total number of processes to place for it
    # Reads:  components, hosts, <component>_exe_file, <component>_exc,
    #         <component>_ppn
    # Updates: processes_hosts, current_hostid, total_proc, count, ppn
    # Exits the shell with status 1 when the hosts run out.
    local component binary exclusive exe_path

    total_proc=$2

    # Iterate through all the possible binaries
    for component in "${components[@]}"
    do
        binary="${component}_exe_file"
        exclusive="${component}_exc"

        # A component without an executable configured can never match;
        # skip it rather than calling basename with no operand.
        exe_path=${!binary-}
        [[ -n "${exe_path}" ]] || continue

        # Check if the current binary matches the input executable
        if [[ "./$(basename -- "${exe_path}")" == "$1" ]]
        then
            # ppn holds the NAME of the per-node count variable; it is
            # also read by machinefile_find_available_node.
            ppn="${component}_ppn"

            if [[ "${!exclusive-}" == "TRUE" ]]
            then
                # Exclusive mode: start allocation at the first empty node
                while (( ${processes_hosts[$current_hostid]:-0} > 0 ))
                do
                    current_hostid=$(( current_hostid + 1 ))
                done
            else
                # Shared mode: start allocation in the first node with
                # enough free cores. Notice that only the first node is
                # checked: if a previous binary had "exc=TRUE", allocation
                # space is not ensured in subsequent nodes.
                current_hostid=0
                machinefile_find_available_node
            fi

            # Allocate ppn cores in each of the subsequent nodes till
            # there are no more processes to assign
            count=0
            while (( total_proc > 0 ))
            do
                if (( current_hostid >= ${#hosts[@]} ))
                then
                    echo "Not enough computing nodes"
                    exit 1
                fi

                while (( total_proc > 0 && count < ${!ppn} ))
                do
                    echo "${hosts[$current_hostid]}" >> machinefile
                    count=$(( count + 1 ))
                    processes_hosts[current_hostid]=$(( ${processes_hosts[$current_hostid]} + 1 ))
                    total_proc=$(( total_proc - 1 ))
                done

                # This node received its full share: move to the next one
                if (( count == ${!ppn} ))
                then
                    current_hostid=$(( current_hostid + 1 ))
                    machinefile_find_available_node
                    count=0
                fi
            done
        fi
    done
}
function launch()
{
    # Build and run the MPMD mpirun command for the coupled model.
    #
    # Arguments: one or more groups, separated by "--", of the form
    #   <nranks> <executable> [arguments for the executable ...]
    # Reads:  ifs_numproc, nem_numproc, xio_numproc, rnf_numproc (for the
    #         log only), use_machinefile
    # Sets:   nranks, executable, proc_id
    # Calls:  info, machinefile_init and machinefile_add (machinefile mode
    #         only), mpirun
    # Files:  rewrites ./prog.conf with one "<rank> <executable>" line per
    #         MPI rank
    # Returns the exit status of mpirun, or 1 if a group has no executable.
    #
    # The command is kept in an array so that every argument reaches
    # mpirun exactly as it was passed in (no re-splitting or globbing).
    local -a mpi_cmd=( mpirun )
    local rank

    # Compute and check the node distribution
    info "======================="
    info "Node/proc distribution:"
    info "-----------------------"
    info "IFS: ${ifs_numproc}"
    info "NEMO: ${nem_numproc}"
    info "XIOS: ${xio_numproc}"
    info "RUNOFF: ${rnf_numproc}"
    info "======================="

    : > prog.conf
    proc_id=0

    if [[ "${use_machinefile-}" == "true" ]]
    then
        mpi_cmd+=( -machinefile machinefile )
        machinefile_init
    fi

    while (( $# ))
    do
        if (( $# < 2 ))
        then
            echo "launch: expected '<nranks> <executable>', got only '$1'" >&2
            return 1
        fi

        # Get number of MPI ranks and executable name
        nranks=$1
        executable=./$(basename -- "$2")
        shift 2

        if [[ "${use_machinefile-}" == "true" ]]
        then
            machinefile_add "${executable}" "${nranks}"
        fi

        mpi_cmd+=( -n "${nranks}" "${executable}" )

        # Add any arguments to executable
        while (( $# )) && [[ "$1" != "--" ]]
        do
            mpi_cmd+=( "$1" )
            shift
        done

        # Drop the "--" separator, if there is one
        if (( $# ))
        then
            shift
        fi

        for (( rank = 0; rank < nranks; rank++ ))
        do
            echo "${proc_id} ${executable}" >> prog.conf
            proc_id=$(( proc_id + 1 ))
        done

        # Add colon if more executables follow
        if (( $# ))
        then
            mpi_cmd+=( : )
        fi
    done

    #cmd="srun --kill-on-bad-exit=1 --multi-prog prog.conf"
    pwd
    echo "${mpi_cmd[*]}"
    "${mpi_cmd[@]}"
}
function finalise()
{
    # Execute any post-run functionality, e.g. platform dependent cleaning
    # or a resubmit.
    #
    # The job is resubmitted when resubmit_job is "true" and leg_end_date
    # is earlier than run_end_date.
    #
    # Reads:  resubmit_job, resubmit_opt, leg_end_date, run_end_date,
    #         leg_number, start_dir, run_dir, stdout_file,
    #         SLURM_JOB_NUM_NODES, SLURM_JOB_ID, SLURM_JOB_NAME
    # Calls:  info, sbatch
    # Returns 0 when nothing has to be resubmitted, 1 when start_dir cannot
    # be entered, otherwise the exit status of sbatch.
    local next_leg next_log

    if [[ "${resubmit_job}" == true ]] \
        && [[ "$(date -d "${leg_end_date}" +%s)" -lt "$(date -d "${run_end_date}" +%s)" ]]
    then
        next_leg=$(( leg_number + 1 ))
        info "Resubmitting job for leg ${next_leg}"

        # Need to go to start_dir to find the run script. Do not submit
        # from anywhere else if that fails.
        if ! cd "${start_dir}"
        then
            echo "finalise: cannot cd to '${start_dir}', job not resubmitted" >&2
            return 1
        fi

        # stdout and stderr of the next leg share one log file:
        # <run_dir>/<stdout file name>.<3-digit leg number>
        printf -v next_log '%s/%s.%03d' \
            "${run_dir}" "$(basename -- "${stdout_file}")" "${next_leg}"

        # Submit command
        # Note: This does not work if you specify a job name with sbatch -J jobname!
        # resubmit_opt is left unquoted on purpose so that it can hold
        # several options.
        sbatch -N "${SLURM_JOB_NUM_NODES}" \
               -o "${next_log}" \
               -e "${next_log}" \
               -d "${SLURM_JOB_ID}" \
               ${resubmit_opt} \
               "./${SLURM_JOB_NAME}"
    fi
}
|