#!/bin/bash
#
# Job options
#
#SBATCH --job-name=NE4_00
#SBATCH --time=12:00:00
#SBATCH --account=ecearth
#
#SBATCH --nodes=6
#SBATCH --exclusive
##SBATCH --ntasks=1320
#SBATCH --ntasks-per-node=100
#SBATCH --partition=batch
#
# Run script for a NEMO 4.2 + XIOS (detached server) experiment under SLURM.
# The run is split into legs (restart chunks); at the end of each leg the
# output/restart/log files are archived and, if the simulation is not
# finished, the script resubmits itself.
#
set -ueo pipefail
#
# Fallback values used when the script is run outside of SLURM.
LOCAL_NODES=6
LOCAL_TASKS=600
#
# Send stdout to a per-job log file (stderr intentionally left alone so that
# SLURM still captures it separately).
stdout_file=${SLURM_SUBMIT_DIR-$PWD}/${SLURM_JOB_NAME-"local"}_${SLURM_JOB_ID-"id"}.log
exec > "${stdout_file}"
echo "------------------ Job Info --------------------"
echo "jobid : ${SLURM_JOB_ID-"id"}"
echo "jobname : ${SLURM_JOB_NAME-"local"}"
echo "nodename : ${SLURMD_NODENAME-"nlocal"}"
echo "# nodes : ${SLURM_JOB_NUM_NODES-$LOCAL_NODES}"
echo "# tasks : ${SLURM_NTASKS-$LOCAL_TASKS}"
echo "submit dir : ${SLURM_SUBMIT_DIR-$PWD}"
set -ue
#
# Cluster variables
#
NB_CORES_PER_NODES=128
MAX_CORES_PER_NODES=100
LIST_CORES_SOCKET=$(seq -s',' 0 $((NB_CORES_PER_NODES-1)))
#
# Experiment options
#
exp_name=NE4_00
run_start_date="1979-01-01"
run_duration="1 year"
rst_freq="1 month"
run_num_legs=12
# Optional "special restart": bootstrap this run from another experiment's
# restart files instead of initial conditions.
special_restart=false
special_restart_from=EXP0
special_restart_date="1959-01-01"
# Time steps (seconds): ORCA025=1350 - ORCA1=2700 - ORCA2=5400
#                       1-23360       1-11680      1-5840
nem_time_step_sec=1350
lim_time_step_sec=1350
nem_restart_offset=0
nem_config_name=ORCA025_ICE
info_file="nemo.info"
start_dir=${SLURM_SUBMIT_DIR-$PWD}
run_dir="/gpfs/scratch/acad/ecearth/$USER/nemo/run/${exp_name}"
archive_dir="/gpfs/scratch/acad/ecearth/$USER/nemo/archive/${exp_name}"
#
# Program configuration
#
# NEMO proc counts tried so far: 192 - 230 - 460 - 1150
#debug nem_numproc=360
nem_numproc=592
xio_numproc=8
#debug xio_numproc=24
# XIOS proc counts tried so far: 4 - 4 - 6 - 14 (max 26)
nemo_src_dir=${HOME}/modeles/nemo_4.2.0
shared_dir=${nemo_src_dir}/cfgs/SHARED
nem_exe=nemo.exe
nem_exe_file=${start_dir}/../BLD/bin/nemo.exe
xio_exe=xios_server.exe
xio_exe_file=${nemo_src_dir}/ext/xios-trunk-2482_gnu/bin/xios_server.exe
all_proc=$(($nem_numproc+$xio_numproc))
# Sanity check: requested MPI tasks must match the SLURM allocation.
if [[ "${SLURM_JOB_NAME-"local"}" != "local" ]] ; then
  if (( $all_proc != ${SLURM_NTASKS-$LOCAL_TASKS} ))
  then
    echo "XIOS procs + NEMO procs do not fit with SLURM requirements."
    #exit 0
  fi
fi
#
# Data configuration
#
nem_grid=ORCA025L121
#
ini_data_dir=/gpfs/scratch/acad/ecearth/pbarriat/data/nemo
#
# Each entry is either "file" (linked under the same name) or
# "file => linkname" (linked under linkname).
ic_subdir=initial
ic_files=(
  "Goutorbe_ghflux.nc"
  "eORCA025_ghflux_v2.0_c3.0_weights_bilin_nohls.nc => weights_ghflux_bilinear.nc"
  "eORCA025_iwm_b0.2_v1.0_nohls.nc => zdfiwm_forcing.nc"
  "eORCA025.L121_domain_cfg_b0.5_c3.0_d1.0_nohls_clean.nc => domain_cfg.nc"
  "eORCA025_runoff_b0.2_v0.0_nohls.nc => runoff.nc"
  "eORCA025_calving_b0.2_v2.3_nohls.nc => calving.nc"
  "eORCA025_ttv_b0.2_v0.0_nohls.nc => boost_tidal_velocity.nc"
  "eORCA025_bfr2d_v0.2_nohls.nc => bfr_coef.nc"
  "eORCA025_shlat2d_v0.2_nohls.nc => shlat2d.nc"
  "eORCA025_distcoast_b0.2_v0.0_nohls.nc => distcoast.nc"
  "eORCA025.L121-empc_nohls.nc => empc.nc"
)
#
# Horizontal resolution extracted from the grid name (e.g. ORCA025L121 -> 025).
nem_res_hor=$(echo ${nem_grid} | sed 's:ORCA\([0-9]\+\)L[0-9]\+:\1:')
#
clim_subdir=climatology
clim_files=(
  "eORCA025.L121_WOA2018_c3.0_d1.0_v19812010.5.2_nohls.nc => woce_monthly_init.nc"
  "chlorophyl_v0.0.nc => chlorophyl.nc"
  "eORCA025_chlorophyl_v0.0_c3.0_weights_bilin_nohls.nc => chlorophyl_weights_bilin.nc"
  "eORCA025_sss_WOA2018_c3.0_v19812010.5.1_nohls.nc => sss_absolute_salinity.nc"
  "eORCA025_seaice_c3.0_v19802004.0_nohls.nc => seaice.nc"
)
#
forcing_subdir=forcing
#nem_forcing_set=ERA5
nem_forcing_set=JRA55
forcing_files=(
  "* => ."
)
#
shared_files=(
  "namelist_ice_ref"
  "namelist_ref"
  "domain_def_nemo.xml"
  "axis_def_nemo.xml"
  "field_def_nemo-ice.xml"
  "field_def_nemo-oce.xml"
  "grid_def_nemo.xml"
)
#
# Script logic
#
# leap_days START END
#   Print the number of leap days (Feb 29) contained in the interval between
#   the two dates (any format understood by GNU date).  Relies on
#   'date -ud YYYY-02-29' failing for non-leap years.
function leap_days() {
  local ld=0
  local frstYYYY=$(date -ud "$1" +%Y)
  local lastYYYY=$(date -ud "$2" +%Y)
  set +e
  # Feb 29 of the first year, if it exists and lies inside the interval.
  # NOTE(review): the second condition compares against lastYYYY-02-28, same
  # as the last-year test below — looks copy-pasted but preserved as-is.
  date -ud "${frstYYYY}-02-29" > /dev/null 2>&1 \
    && (( $(date -ud "$1" +%s) < $(date -ud "${frstYYYY}-03-01" +%s) )) \
    && (( $(date -ud "$2" +%s) > $(date -ud "${lastYYYY}-02-28" +%s) )) \
    && (( ld++ ))
  # Every complete year strictly between the first and the last one.
  for (( y=frstYYYY+1; y<=lastYYYY-1; y++ ))
  do
    date -ud "$y-02-29" > /dev/null 2>&1 && (( ld++ ))
  done
  # Feb 29 of the last year, if different from the first year.
  (( $lastYYYY > $frstYYYY )) \
    && date -ud "${lastYYYY}-02-29" > /dev/null 2>&1 \
    && (( $(date -ud "$1" +%s) < $(date -ud "${frstYYYY}-03-01" +%s) )) \
    && (( $(date -ud "$2" +%s) > $(date -ud "${lastYYYY}-02-28" +%s) )) \
    && (( ld++ ))
  set -e
  echo "$ld"
}

# "$*" (not bare $@) so the pattern match sees one joined string.
[[ "$*" == *verbose* ]] && set -x

#module purge
module load craype-x86-milan
module load PrgEnv-gnu/8.3.3
module load netCDF-Fortran/4.6.0-gompi-2022a
module load Perl/.5.34.1-GCCcore-11.3.0

if [ ! -d ${run_dir:?} ]
then
  # First leg: populate a fresh run directory.
  mkdir -p ${run_dir}
  #
  if $special_restart
  then
    # Clone the donor experiment's run dir (minus its outputs/restarts/logs),
    # truncate its info file at the restart year and pick up the leg counter.
    rsync -av --delete ${run_dir}/../${special_restart_from}/ \
      --exclude log --exclude output --exclude restart \
      --exclude="${special_restart_from}_*" --exclude="ocean*" \
      --exclude="restart_*" --exclude="debug.*" --exclude="output.*" \
      ${run_dir}
    cp -f ${nem_exe_file} ${run_dir}
    cp -f ${xio_exe_file} ${run_dir}
    special_year=${special_restart_date:0:4}
    sed -i "/$special_year/q" ${run_dir}/${info_file}
    . ${run_dir}/${info_file}
    special_restart_leg=$(printf %03d $((leg_number+1)))
    # Copy the donor restart files, renaming them to this experiment.
    cd ${run_dir}/../../archive/${special_restart_from}/restart/${special_restart_leg}
    for f in *.nc; do
      nf=${exp_name}${f:4}
      cp $f ${run_dir}/$nf
    done
    cd -
    # Link them under the generic restart names NEMO expects.
    cd ${run_dir}
    for f in ${exp_name}_????????_restart_???_????.nc; do
      nf=${f:14}
      ln -s $f $nf
    done
    cd -
  fi
  cd ${start_dir}
  cp context_nemo.xml file_def_nemo-ice.xml file_def_nemo-oce.xml iodef.xml \
     namelist_ice_cfg* build_namelist_cfg* ${run_dir}
  cd ${run_dir}
  cp ${xio_exe_file} ${xio_exe}
  cp ${nem_exe_file} ${nem_exe}
  # Seed the freshwater-budget file read by NEMO's EMP correction.
  # TODO(review): column spacing may have been collapsed by reformatting —
  # confirm the expected fixed-width layout.
  [[ ! -f EMPave_old.dat ]] && \
    echo " 0 0.0000000000000000E+00 0.0000000000000000E+00" > EMPave_old.dat
  # Link input data.  The $(sed ...) substitution is deliberately unquoted:
  # "file => linkname" expands to two words (link target and link name).
  for file in "${ic_files[@]}"; do
    [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${ic_subdir}/${nem_grid}/$file")
  done
  for file in "${ic_files[@]}"; do
    [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${ic_subdir}/$file")
  done
  for file in "${clim_files[@]}"; do
    [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${clim_subdir}/${nem_grid}/$file")
  done
  for file in "${clim_files[@]}"; do
    [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${clim_subdir}/$file")
  done
  for file in "${forcing_files[@]}"; do
    [[ ! -e ${file#*> } || "$file" == \** ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${forcing_subdir}/${nem_forcing_set}/$file")
  done
  for file in "${shared_files[@]}"; do
    [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${shared_dir}/$file")
  done
else
  # Continuation leg: clear leftovers from a possibly crashed previous leg.
  cd ${run_dir}
  shopt -s nullglob
  for v in grid_U grid_V grid_W grid_T icemod SBC SBC_scalar diaptr2D diaptr3D
  do
    for f in ${exp_name}_??_????????_????????_${v}_????.nc; do rm -f "$f"; done
    for f in ${exp_name}_??_????????_????????_${v}.nc;      do rm -f "$f"; done
    for f in ${exp_name}_??_${v}.nc;                        do rm -f "$f"; done
  done
  for f in ocean.output time.step ; do rm -f "${f}"; done
  shopt -u nullglob
fi

# Normalise the run dates and compute the epoch bounds of the simulation.
run_start_date=$(date -uR -d "${run_start_date}")
run_end_date="${run_start_date} + ${run_duration:?}"
run_end_date=$(date -uR -d "${run_end_date}")
run_start_epoch=$(date -u -d"${run_start_date}" +%s)
run_end_epoch=$(date -u -d"${run_end_date}" +%s)

# Main leg loop: at most run_num_legs legs per job submission.
for (( ; run_num_legs>0 ; run_num_legs-- ))
do
  # Pick up leg_number/leg_end_date from the previous leg, if any.
  [[ -r "${info_file:?}" ]] && source "${info_file:?}"
  leg_start_date=${leg_end_date:-$run_start_date}
  leg_number=$((${leg_number:=0}+1))
  leg_start_epoch=$(date -u -d "${leg_start_date}" +%s)
  leg_end_epoch=$(date -u -d "${leg_start_date:?} + ${rst_freq:=$run_duration}" +%s)
  leg_end_date=$(date -uR -d@"${leg_end_epoch}")
  leg_length_sec=$(( leg_end_epoch - leg_start_epoch ))
  leg_start_sec=$(( leg_start_epoch - run_start_epoch ))
  leg_end_sec=$(( leg_end_epoch - run_start_epoch ))
  leg_start_date_yyyymmdd=$(date -u -d "${leg_start_date}" +%Y%m%d)
  # NEMO uses a no-leap-aware step count: remove leap days from the spans.
  leg_length_sec=$(( leg_length_sec - $(leap_days "${leg_start_date}" "${leg_end_date}")*24*3600 ))
  leg_start_sec=$(( leg_start_sec - $(leap_days "${run_start_date}" "${leg_start_date}")*24*3600 ))
  leg_end_sec=$(( leg_end_sec - $(leap_days "${run_start_date}" "${leg_end_date}")*24*3600 ))
  (( leg_number > 1 )) && leg_is_restart=true || leg_is_restart=false
  # Clamp the last leg to the end of the simulation.
  # FIX(review): original assigned ${run_end_epoch} (an integer) to
  # leg_end_date; the date string is what every other use expects.
  (( leg_end_epoch > run_end_epoch )) && leg_end_date=${run_end_date}
  if (( leg_start_epoch >= run_end_epoch ))
  then
    echo "Leg start date equal to or after end of simulation."
    echo "Nothing left to do. Cleaning and exiting."
    # NOTE(review): this cleanup block was garbled in the retrieved source;
    # reconstructed from the identical cleanup at the script's second exit
    # point (see FIXME below) — TODO confirm against the original script.
    for (( n=0 ; n<nem_numproc ; n++ ))
    do
      np=$(printf %04d ${n})
      rm -f "restart_ice_in_${np}.nc" "restart_in_${np}.nc"
    done
    exit 0
  fi
  # Generate the leg namelist.
  # NOTE(review): the command producing namelist_cfg was garbled in the
  # retrieved source; build_namelist_cfg is copied into the run dir above,
  # so sourcing it is the most plausible reading — TODO confirm.
  source ./build_namelist_cfg > namelist_cfg
  # Restart step counter encoded in the restart file names.
  ns=$(printf %08d $(( leg_start_sec / nem_time_step_sec - nem_restart_offset )))
  echo "ns=$ns"
  if ((leg_start_sec > 0 )); then
    # NOTE(review): loop body garbled in the retrieved source; linking the
    # per-rank restart files for this leg is the standard pattern — TODO
    # confirm file-name details.
    for (( n=0 ; n<nem_numproc ; n++ ))
    do
      np=$(printf %04d ${n})
      ln -fs ${exp_name}_${ns}_restart_${np}.nc restart_in_${np}.nc
      ln -fs ${exp_name}_${ns}_restart_ice_${np}.nc restart_ice_in_${np}.nc
    done
  fi
  time_begin=$(date +%s)
  # Launcher incantations used on other clusters, kept for reference:
  #echo "<cluster>2019: I_MPI_FABRICS=ofi FI_PROVIDER=tcp mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}"
  #echo "Nic5: I_MPI_HYDRA_TOPOLIB=ipl I_MPI_FABRICS=ofi mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}"
  #echo "Zenobe: mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}"
  #echo "Cyclone: I_MPI_FABRICS=tcp mpirun -np "${xio_numproc:?}" "./${xio_exe:?}" : -np "${nem_numproc:?}" "./${nem_exe:?}"
  #echo "LUMI: srun --multi-prog prog.conf (SLURM_JOB_NUM_NODES:${SLURM_JOB_NUM_NODES-$LOCAL_NODES} SLURM_CPUS_ON_NODE:${SLURM_CPUS_ON_NODE-$NB_CORES_PER_NODES})"
  echo "LUCIA: srun srun_wrapper.sh (SLURM_JOB_NUM_NODES:${SLURM_JOB_NUM_NODES-$LOCAL_NODES} SLURM_CPUS_ON_NODE:${SLURM_CPUS_ON_NODE-$NB_CORES_PER_NODES})"
  export OMP_NUM_THREADS=1
  #export MKL_NUM_THREADS=1
  #export PMI_NO_PREINITIALIZE=y
  export TIME="launch timing : %e elapsed %U user %S system"
  # --- Split XIOS ranks across nodes (disabled) ---
  #cat /dev/null > prog.conf
  #nem_numproc_slice=$(($nem_numproc/${SLURM_JOB_NUM_NODES-$LOCAL_NODES}))
  #nem_numproc_slice_0=$(($nem_numproc_slice-1))
  #xio_numproc_slice_0=$(($xio_numproc/${SLURM_JOB_NUM_NODES-$LOCAL_NODES}-1))
  #xio_numproc_slice=$(($nem_numproc_slice+$xio_numproc_slice_0))
  #proc_id=0
  #for i in $(eval echo "{1..${SLURM_JOB_NUM_NODES-$LOCAL_NODES}}")
  #do
  #  for j in $(eval echo "{0..$nem_numproc_slice_0}")
  #  do
  #    echo "$proc_id ./${nem_exe}" >> prog.conf
  #    proc_id=$(($proc_id+1))
  #  done
  #  for j in $(eval echo "{$nem_numproc_slice..$xio_numproc_slice}")
  #  do
  #    echo "$proc_id ./${xio_exe}" >> prog.conf
  #    proc_id=$(($proc_id+1))
  #  done
  #done
  # --- Group XIOS ranks after the NEMO ranks ---
  cat /dev/null > prog.conf
  proc_id=0
  for i in $(eval echo "{1..${nem_numproc}}")
  do
    echo "$proc_id ./${nem_exe}" >> prog.conf
    proc_id=$(($proc_id+1))
  done
  for i in $(eval echo "{1..${xio_numproc}}")
  do
    echo "$proc_id ./${xio_exe}" >> prog.conf
    proc_id=$(($proc_id+1))
  done
  #echo "LUMI: srun --kill-on-bad-exit=1 --multi-prog prog.conf"
  #cat /dev/null > ./ztask_file.conf
  #echo "0-$(($xio_numproc-1)) ./${xio_exe}" >> ./ztask_file.conf
  #echo "$xio_numproc-$(($xio_numproc+$nem_numproc-1)) ./${nem_exe}" >> ./ztask_file.conf
  #BINDING=map_cpu:$LIST_CORES_SOCKET
  #echo "LUMI: srun --kill-on-bad-exit=1 --mpi=pmi2 -m cyclic --cpu_bind=$BINDING --multi-prog ./ztask_file.conf"
  #exit
  echo $time_begin
  # Launch the coupled NEMO/XIOS job.
  #mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}
  srun --kill-on-bad-exit=1 --multi-prog prog.conf
  #srun --kill-on-bad-exit=1 --mpi=pmi2 -m cyclic --cpu_bind=$BINDING --multi-prog ./ztask_file.conf
  #srun --kill-on-bad-exit=1 ./${xio_exe} : ./${nem_exe}
  time_end=$(date +%s)
  echo $time_end
  # Archive this leg's output, restart and log files.
  formatted_leg_number=$(printf %03d $((leg_number)))
  outdir="${archive_dir:?}/output/${formatted_leg_number}"
  mkdir -p "${outdir}"
  shopt -s nullglob
  for v in grid_U grid_V grid_W grid_T icemod SBC SBC_scalar diaptr2D diaptr3D
  do
    for f in ${exp_name}_??_????????_????????_${v}_????.nc; do mv "$f" "$outdir/"; done
    for f in ${exp_name}_??_????????_????????_${v}.nc;      do mv "$f" "$outdir/"; done
    for f in ${exp_name}_??_${v}.nc;                        do mv "$f" "$outdir/"; done
  done
  outdir="$archive_dir/restart/${formatted_leg_number}"
  mkdir -p "${outdir}"
  for f in ${exp_name}_${ns}_restart_???_????.nc
  do
    [ -f "$f" ] && mv "$f" "${outdir}"
  done
  outdir="$archive_dir/log/${formatted_leg_number}"
  mkdir -p "${outdir}"
  for f in ocean.output time.step ; do mv "${f}" "${outdir}"; done
  cp -f namelist_ice_ref namelist_ice_cfg namelist_ref namelist_cfg ${archive_dir}
  [[ -f ${start_dir}/${SLURM_JOB_NAME-"run"}.sh ]] && \
    cp -f ${start_dir}/${SLURM_JOB_NAME-"run"}.sh ${archive_dir}
  shopt -u nullglob
  # Append leg bookkeeping to the info file (also echoed to the log).
  tr=$(date -d "0 -$time_begin sec + $time_end sec" +%T)
  current_date=$(date +'%F %T')
  {
    echo "#"
    echo "# Finished leg at ${current_date} after ${tr} (hh:mm:ss)"
    echo "leg_number=${leg_number}"
    echo "leg_start_date=\"${leg_start_date}\""
    echo "leg_end_date=\"${leg_end_date}\""
  } | tee -a "${info_file}"
  special_restart=false
done
cd - >/dev/null

[[ "$*" == *noresubmit* ]] && exit 0

if (( leg_end_epoch < run_end_epoch )) ; then
  echo "Leg end earlier than end of simulation."
  echo "Submitting another job."
  if [[ "$*" == *"run"* ]] ; then
    # Interactive mode: re-exec this script in place.
    exec "$0" "$@"
  elif hash sbatch 2>/dev/null; then
    # Need to go to start_dir to find the run script
    cd ${start_dir}
    echo "sbatch -N ${SLURM_JOB_NUM_NODES-"1"} -o ${run_dir}/$(basename ${stdout_file}).$(printf %03d $((leg_number+1))) -e ${run_dir}/$(basename ${stdout_file}).$(printf %03d $((leg_number+1))) -d ${SLURM_JOB_ID-"id"} ./${SLURM_JOB_NAME-"run"}.sh"
    # Submit command
    # Note: This does not work if you specify a job name with sbatch -J jobname!
    sbatch -N ${SLURM_JOB_NUM_NODES-"1"} \
      -o ${run_dir}/$(basename ${stdout_file}).$(printf %03d $((leg_number+1))) \
      -e ${run_dir}/$(basename ${stdout_file}).$(printf %03d $((leg_number+1))) \
      -d ${SLURM_JOB_ID-"id"} \
      ./${SLURM_JOB_NAME-"run"}.sh
  else
    # PBS fallback.
    cd ${start_dir}
    echo "qsub ${PBS_JOBNAME}.sh"
    qsub ./${PBS_JOBNAME}.sh
  fi
else
  echo "Nothing left to do. Cleaning and exiting."
  # FIXME Factorize this (we have two exit points)
  # NOTE(review): the tail of the script was truncated in the retrieved
  # source; reconstructed to mirror the first exit point — TODO confirm.
  for (( n=0 ; n<nem_numproc ; n++ ))
  do
    np=$(printf %04d ${n})
    rm -f "restart_ice_in_${np}.nc" "restart_in_${np}.nc"
  done
  exit 0
fi