#!/bin/bash
#
# Job options
#
#SBATCH --job-name=NE4_00
#SBATCH --time=12:00:00
#SBATCH --account=ecearth
#
#SBATCH --nodes=6
#SBATCH --exclusive
##SBATCH --ntasks=1320
#SBATCH --ntasks-per-node=100
#SBATCH --partition=batch
#
# Strict mode: abort on first error, on use of an unset variable,
# and on a failure anywhere inside a pipeline.
set -ueo pipefail
#
# Fallback values used when the script runs outside of SLURM (local test).
LOCAL_NODES=6
LOCAL_TASKS=600
#
# Send all stdout to a per-job log file in the submit directory.
stdout_file=${SLURM_SUBMIT_DIR-$PWD}/${SLURM_JOB_NAME-"local"}_${SLURM_JOB_ID-"id"}.log
exec > "${stdout_file}"
echo "------------------ Job Info --------------------"
echo "jobid : ${SLURM_JOB_ID-"id"}"
echo "jobname : ${SLURM_JOB_NAME-"local"}"
echo "nodename : ${SLURMD_NODENAME-"nlocal"}"
echo "# nodes : ${SLURM_JOB_NUM_NODES-$LOCAL_NODES}"
echo "# tasks : ${SLURM_NTASKS-$LOCAL_TASKS}"
echo "submit dir : ${SLURM_SUBMIT_DIR-$PWD}"
#
# Cluster variables
#
NB_CORES_PER_NODES=128
MAX_CORES_PER_NODES=100
# Comma-separated core list "0,1,...,127" for the optional CPU binding below.
LIST_CORES_SOCKET=$(seq -s',' 0 $((NB_CORES_PER_NODES-1)))
#
# Experiment options
#
exp_name=NE4_00
run_start_date="1979-01-01"
run_duration="1 year"
rst_freq="1 month"
run_num_legs=12
special_restart=false
special_restart_from=EXP0
special_restart_date="1959-01-01"
# Time step (s) per configuration: ORCA025=1350 - ORCA1=2700 - ORCA2=5400
# Valid step-count ranges:         1-23360      - 1-11680    - 1-5840
nem_time_step_sec=1350
lim_time_step_sec=1350
nem_restart_offset=0
nem_config_name=ORCA025_ICE
info_file="nemo.info"
start_dir=${SLURM_SUBMIT_DIR-$PWD}
run_dir="/gpfs/scratch/acad/ecearth/$USER/nemo/run/${exp_name}"
archive_dir="/gpfs/scratch/acad/ecearth/$USER/nemo/archive/${exp_name}"
#
# Program configuration
#
#192 - 230 - 460 - 1150
#debug nem_numproc=360
nem_numproc=592
xio_numproc=8
#debug xio_numproc=24
#4 - 4 - 6 - 14 (max 26)
nemo_src_dir=${HOME}/modeles/nemo_4.2.0
shared_dir=${nemo_src_dir}/cfgs/SHARED
nem_exe=nemo.exe
nem_exe_file=${start_dir}/../BLD/bin/nemo.exe
xio_exe=xios_server.exe
xio_exe_file=${nemo_src_dir}/ext/xios-trunk-2482_gnu/bin/xios_server.exe
# Total MPI tasks requested: NEMO ranks plus XIOS server ranks.
all_proc=$((nem_numproc+xio_numproc))
if [[ "${SLURM_JOB_NAME-"local"}" != "local" ]] ; then
  if (( all_proc != ${SLURM_NTASKS-$LOCAL_TASKS} ))
  then
    # Warn only: the job is allowed to proceed with a mismatched layout.
    echo "XIOS procs + NEMO procs do not fit with SLURM requirements."
    #exit 0
  fi
fi
#
# Data configuration
#
nem_grid=ORCA025L121
#
ini_data_dir=/gpfs/scratch/acad/ecearth/pbarriat/data/nemo
#
# Input lists below use the form "source_file => link_name"; entries without
# "=>" are linked under their own name (see the linking loops further down).
ic_subdir=initial
ic_files=(
    "Goutorbe_ghflux.nc"
    "eORCA025_ghflux_v2.0_c3.0_weights_bilin_nohls.nc => weights_ghflux_bilinear.nc"
    "eORCA025_iwm_b0.2_v1.0_nohls.nc => zdfiwm_forcing.nc"
    "eORCA025.L121_domain_cfg_b0.5_c3.0_d1.0_nohls_clean.nc => domain_cfg.nc"
    "eORCA025_runoff_b0.2_v0.0_nohls.nc => runoff.nc"
    "eORCA025_calving_b0.2_v2.3_nohls.nc => calving.nc"
    "eORCA025_ttv_b0.2_v0.0_nohls.nc => boost_tidal_velocity.nc"
    "eORCA025_bfr2d_v0.2_nohls.nc => bfr_coef.nc"
    "eORCA025_shlat2d_v0.2_nohls.nc => shlat2d.nc"
    "eORCA025_distcoast_b0.2_v0.0_nohls.nc => distcoast.nc"
    "eORCA025.L121-empc_nohls.nc => empc.nc"
)
#
# Horizontal resolution extracted from the grid name (e.g. ORCA025L121 -> 025).
nem_res_hor=$(echo ${nem_grid} | sed 's:ORCA\([0-9]\+\)L[0-9]\+:\1:')
#
clim_subdir=climatology
clim_files=(
    "eORCA025.L121_WOA2018_c3.0_d1.0_v19812010.5.2_nohls.nc => woce_monthly_init.nc"
    "chlorophyl_v0.0.nc => chlorophyl.nc"
    "eORCA025_chlorophyl_v0.0_c3.0_weights_bilin_nohls.nc => chlorophyl_weights_bilin.nc"
    "eORCA025_sss_WOA2018_c3.0_v19812010.5.1_nohls.nc => sss_absolute_salinity.nc"
    "eORCA025_seaice_c3.0_v19802004.0_nohls.nc => seaice.nc"
)
#
forcing_subdir=forcing
#nem_forcing_set=ERA5
nem_forcing_set=JRA55
forcing_files=(
    "* => ."
)
#
# Reference namelists and XIOS XML definitions shared by all configurations.
shared_files=(
    "namelist_ice_ref"
    "namelist_ref"
    "domain_def_nemo.xml"
    "axis_def_nemo.xml"
    "field_def_nemo-ice.xml"
    "field_def_nemo-oce.xml"
    "grid_def_nemo.xml"
)
#
# Script logic
#
#######################################
# Count the leap days (Feb 29) contained in the interval [$1, $2].
# Used to convert Gregorian leg lengths into noleap-calendar seconds.
# Relies on GNU date: "date -ud YYYY-02-29" succeeds only in leap years.
# Arguments: $1 - interval start date, $2 - interval end date
# Outputs:   number of leap days on stdout
#######################################
function leap_days()
{
    local ld=0
    local frstYYYY=$(date -ud "$1" +%Y)
    local lastYYYY=$(date -ud "$2" +%Y)
    set +e
    # Leap day of the first year, if it lies inside the interval.
    # (Fixed: the original compared against lastYYYY here, which
    # miscounted for intervals spanning several years.)
    if date -ud "${frstYYYY}-02-29" > /dev/null 2>&1 \
       && (( $(date -ud "$1" +%s) < $(date -ud "${frstYYYY}-03-01" +%s) )) \
       && (( $(date -ud "$2" +%s) > $(date -ud "${frstYYYY}-02-28" +%s) ))
    then
        (( ld++ ))
    fi
    # Every year strictly between the first and last year contributes
    # its leap day in full.
    for (( y=frstYYYY+1; y<=lastYYYY-1; y++ ))
    do
        date -ud "$y-02-29" > /dev/null 2>&1 && (( ld++ ))
    done
    # Leap day of the last year (only when distinct from the first year).
    if (( lastYYYY > frstYYYY )) \
       && date -ud "${lastYYYY}-02-29" > /dev/null 2>&1 \
       && (( $(date -ud "$1" +%s) < $(date -ud "${lastYYYY}-03-01" +%s) )) \
       && (( $(date -ud "$2" +%s) > $(date -ud "${lastYYYY}-02-28" +%s) ))
    then
        (( ld++ ))
    fi
    set -e
    echo "$ld"
}
# Enable shell tracing when "verbose" appears anywhere on the command line.
[[ "$*" == *verbose* ]] && set -x
#module purge
module load craype-x86-milan
module load PrgEnv-gnu/8.3.3
module load netCDF-Fortran/4.6.0-gompi-2022a
module load Perl/.5.34.1-GCCcore-11.3.0
# First leg: create and populate the run directory.  Subsequent legs
# (run_dir already exists) only remove partial output left by a
# previous, possibly crashed, attempt.
if [ ! -d "${run_dir:?}" ]
then
    mkdir -p "${run_dir}"
    #
    # Optionally bootstrap this run from another experiment's restarts.
    if $special_restart
    then
        rsync -av --delete ${run_dir}/../${special_restart_from}/ --exclude log --exclude output --exclude restart --exclude="${special_restart_from}_*" --exclude="ocean*" --exclude="restart_*" --exclude="debug.*" --exclude="output.*" ${run_dir}
        cp -f ${nem_exe_file} ${run_dir}
        cp -f ${xio_exe_file} ${run_dir}
        special_year=${special_restart_date:0:4}
        # Truncate the info file after the first line mentioning the restart
        # year, then source it to recover leg_number at that point in time.
        sed -i "/$special_year/q" ${run_dir}/${info_file}
        . ${run_dir}/${info_file}
        special_restart_leg=$(printf %03d $((leg_number+1)))
        # Copy the donor experiment's restart files, renaming them to this
        # experiment (the donor name is the first 4 characters of each file).
        cd ${run_dir}/../../archive/${special_restart_from}/restart/${special_restart_leg}
        for f in *.nc; do
            nf=${exp_name}${f:4}
            cp $f ${run_dir}/$nf
        done
        cd -
        cd ${run_dir}
        # Link the generic restart names NEMO expects onto the copied files
        # (strip the 14-character "<exp>_<yyyymmdd>_" prefix).
        for f in ${exp_name}_????????_restart_???_????.nc; do
            nf=${f:14}
            ln -s $f $nf
        done
        cd -
    fi
    cd ${start_dir}
    cp context_nemo.xml file_def_nemo-ice.xml file_def_nemo-oce.xml iodef.xml namelist_ice_cfg* build_namelist_cfg* ${run_dir}
    cd ${run_dir}
    cp ${xio_exe_file} ${xio_exe}
    cp ${nem_exe_file} ${nem_exe}

    # Seed the water-budget file expected by the free-surface scheme.
    [[ ! -f EMPave_old.dat ]] && echo " 0 0.0000000000000000E+00 0.0000000000000000E+00" > EMPave_old.dat
    # Link input data.  Entries look like "source => linkname"; the sed turns
    # the arrow into a space, so the deliberately unquoted expansion feeds
    # ln -sf two words: the target and the link name.  Grid-specific
    # directories are tried before the generic ones.
    for file in "${ic_files[@]}"; do
        [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${ic_subdir}/${nem_grid}/$file")
    done
    for file in "${ic_files[@]}"; do
        [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${ic_subdir}/$file")
    done
    for file in "${clim_files[@]}"; do
        [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${clim_subdir}/${nem_grid}/$file")
    done
    for file in "${clim_files[@]}"; do
        [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${clim_subdir}/$file")
    done
    for file in "${forcing_files[@]}"; do
        [[ ! -e ${file#*> } || "$file" == \** ]] && ln -sf $(sed 's/ *=> */ /' <<< "${ini_data_dir}/${forcing_subdir}/${nem_forcing_set}/$file")
    done
    for file in "${shared_files[@]}"; do
        [[ ! -e ${file#*> } ]] && ln -sf $(sed 's/ *=> */ /' <<< "${shared_dir}/$file")
    done
else
    cd ${run_dir}
    # Remove any partial diagnostics from an interrupted leg so the new
    # leg starts from a clean slate.
    shopt -s nullglob
    for v in grid_U grid_V grid_W grid_T icemod SBC SBC_scalar diaptr2D diaptr3D
    do
        for f in ${exp_name}_??_????????_????????_${v}_????.nc; do rm -f "$f"; done
        for f in ${exp_name}_??_????????_????????_${v}.nc; do rm -f "$f"; done
        for f in ${exp_name}_??_${v}.nc; do rm -f "$f"; done
    done
    for f in ocean.output time.step ; do rm -f "${f}"; done
    shopt -u nullglob
fi
# ----------------------------------------------------------------------
# Time loop: run the experiment in legs of length ${rst_freq}, at most
# ${run_num_legs} legs in this job submission.
# ----------------------------------------------------------------------
run_start_date=$(date -uR -d "${run_start_date}")
run_end_date="${run_start_date} + ${run_duration:?}"
run_end_date=$(date -uR -d "${run_end_date}")
run_start_epoch=$(date -u -d"${run_start_date}" +%s)
run_end_epoch=$(date -u -d"${run_end_date}" +%s)
for (( ; run_num_legs>0 ; run_num_legs-- ))
do
    # Resume bookkeeping (leg_number, leg_end_date) from the info file.
    [[ -r "${info_file:?}" ]] && source "${info_file:?}"
    leg_start_date=${leg_end_date:-$run_start_date}
    leg_number=$((${leg_number:=0}+1))
    leg_start_epoch=$(date -u -d "${leg_start_date}" +%s)
    leg_end_epoch=$(date -u -d "${leg_start_date:?} + ${rst_freq:=$run_duration}" +%s)
    leg_end_date=$(date -uR -d@"${leg_end_epoch}")
    leg_length_sec=$(( leg_end_epoch - leg_start_epoch ))
    leg_start_sec=$(( leg_start_epoch - run_start_epoch ))
    leg_end_sec=$(( leg_end_epoch - run_start_epoch ))
    leg_start_date_yyyymmdd=$(date -u -d "${leg_start_date}" +%Y%m%d)
    # NEMO runs on a noleap calendar: subtract the leap days contained in
    # each Gregorian interval.
    leg_length_sec=$(( leg_length_sec - $(leap_days "${leg_start_date}" "${leg_end_date}")*24*3600 ))
    leg_start_sec=$(( leg_start_sec - $(leap_days "${run_start_date}" "${leg_start_date}")*24*3600 ))
    leg_end_sec=$(( leg_end_sec - $(leap_days "${run_start_date}" "${leg_end_date}")*24*3600 ))
    (( leg_number > 1 )) && leg_is_restart=true || leg_is_restart=false
    # Clamp the final leg to the run end.  Fixed: the original assigned the
    # raw epoch number (${run_end_epoch}) to leg_end_date, which breaks any
    # later "date -d" parse of that variable; keep an RFC-2822 date string.
    (( leg_end_epoch > run_end_epoch )) && leg_end_date=${run_end_date}
    if (( leg_start_epoch >= run_end_epoch ))
    then
        echo "Leg start date equal to or after end of simulation."
        echo "Nothing left to do. Cleaning and exiting."
        for (( n=0 ; n<nem_numproc ; n++ ))
        do
            np=$(printf %04d ${n})
            rm -f "restart_oce_${np}.nc"
            rm -f "restart_ice_${np}.nc"
            rm -f "restart_icb_${np}.nc"
        done
        exit 0
    fi
    # Generate this leg's namelist and compute the restart step counter
    # used in the restart file names.
    source build_namelist_cfg.sh > namelist_cfg
    ns=$(printf %08d $(( leg_start_sec / nem_time_step_sec - nem_restart_offset )))
    echo "ns=$ns"
    if (( leg_start_sec > 0 )); then
        # Restart leg: make sure the per-rank restart files are present
        # (fetch from the archive if needed) and link the generic names.
        for (( n=0 ; n<nem_numproc ; n++ ))
        do
            np=$(printf %04d ${n})
            formatted_leg_number=$(printf %03d $((leg_number)))
            [[ -f "${exp_name:?}_${ns}_restart_oce_${np}.nc" ]] || { cp $archive_dir/restart/${formatted_leg_number}/*oce* . ; }
            [[ -f "${exp_name:?}_${ns}_restart_oce_${np}.nc" ]] || { echo "Error: restart file not found." ; exit 2 ; }
            ln -fs "${exp_name:?}_${ns}_restart_oce_${np}.nc" "restart_oce_${np}.nc"
            [[ -f "${exp_name:?}_${ns}_restart_ice_${np}.nc" ]] || { cp $archive_dir/restart/${formatted_leg_number}/*ice* . ; }
            [[ -f "${exp_name:?}_${ns}_restart_ice_${np}.nc" ]] || { echo "Error: restart file not found." ; exit 2 ; }
            ln -fs "${exp_name:?}_${ns}_restart_ice_${np}.nc" "restart_ice_${np}.nc"
            [[ -f "${exp_name:?}_${ns}_restart_icb_${np}.nc" ]] || { cp $archive_dir/restart/${formatted_leg_number}/*icb* . ; }
            [[ -f "${exp_name:?}_${ns}_restart_icb_${np}.nc" ]] || { echo "Error: restart file not found." ; exit 2 ; }
            ln -fs "${exp_name:?}_${ns}_restart_icb_${np}.nc" "restart_icb_${np}.nc"
        done
    fi
    # With "preponly" on the command line, stop after preparing the leg.
    [[ "$*" == *preponly* ]] && exit 0
    time_begin=$(date +%s)
    ulimit -s unlimited
    if [[ "${SLURM_JOB_NAME-"local"}" == "local" ]] ; then
        echo "!!! Local RUN !!!"
        #xio_numproc=2
        #nem_numproc=24
    fi
    #
    echo "run dir : $run_dir"
    echo "leg_number : $leg_number"
    #echo "ulimit -s unlimited"
    #echo "Lemaitre3-2018: I_MPI_FABRICS=tcp mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}"
    #echo "Lemaitre3>2019: I_MPI_FABRICS=ofi FI_PROVIDER=tcp mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}"
    #echo "Nic5: I_MPI_HYDRA_TOPOLIB=ipl I_MPI_FABRICS=ofi mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}"
    #echo "Zenobe: mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}"
    #echo "LUMI: srun --multi-prog prog.conf (SLURM_JOB_NUM_NODES:${SLURM_JOB_NUM_NODES-$LOCAL_NODES} SLURM_CPUS_ON_NODE:${SLURM_CPUS_ON_NODE-$NB_CORES_PER_NODES})"
    echo "LUCIA: srun srun_wrapper.sh (SLURM_JOB_NUM_NODES:${SLURM_JOB_NUM_NODES-$LOCAL_NODES} SLURM_CPUS_ON_NODE:${SLURM_CPUS_ON_NODE-$NB_CORES_PER_NODES})"
    export OMP_NUM_THREADS=1
    #export MKL_NUM_THREADS=1
    #export PMI_NO_PREINITIALIZE=y
    export TIME="launch timing : %e elapsed %U user %S system"
    # Group XIOS nodes: write one "<rank> <binary>" line per MPI task,
    # NEMO ranks first, then the XIOS servers.
    cat /dev/null > prog.conf
    proc_id=0
    for (( i=0 ; i<nem_numproc ; i++ ))
    do
        echo "$proc_id ./${nem_exe}" >> prog.conf
        proc_id=$((proc_id+1))
    done
    for (( i=0 ; i<xio_numproc ; i++ ))
    do
        echo "$proc_id ./${xio_exe}" >> prog.conf
        proc_id=$((proc_id+1))
    done
    #echo "LUMI: srun --kill-on-bad-exit=1 --multi-prog prog.conf"
    #cat /dev/null > ./ztask_file.conf
    #echo "0-$(($xio_numproc-1)) ./${xio_exe}" >> ./ztask_file.conf
    #echo "$xio_numproc-$(($xio_numproc+$nem_numproc-1)) ./${nem_exe}" >> ./ztask_file.conf
    #BINDING=map_cpu:$LIST_CORES_SOCKET
    #echo "LUMI: srun --kill-on-bad-exit=1 --mpi=pmi2 -m cyclic --cpu_bind=$BINDING --multi-prog ./ztask_file.conf"
    #exit
    echo $time_begin
    # Launch the coupled NEMO + XIOS job.
    #mpirun -np ${xio_numproc} ./${xio_exe} : -np ${nem_numproc} ./${nem_exe}
    srun --kill-on-bad-exit=1 --multi-prog prog.conf
    #srun --kill-on-bad-exit=1 --mpi=pmi2 -m cyclic --cpu_bind=$BINDING --multi-prog ./ztask_file.conf
    #srun --kill-on-bad-exit=1 ./${xio_exe} : ./${nem_exe}
    time_end=$(date +%s)
    echo $time_end
    # Archive this leg's output, restarts and logs.
    formatted_leg_number=$(printf %03d $((leg_number)))
    outdir="${archive_dir:?}/output/${formatted_leg_number}"
    mkdir -p "${outdir}"
    shopt -s nullglob
    for v in grid_U grid_V grid_W grid_T icemod SBC SBC_scalar diaptr2D diaptr3D
    do
        for f in ${exp_name}_??_????????_????????_${v}_????.nc; do mv "$f" "$outdir/"; done
        for f in ${exp_name}_??_????????_????????_${v}.nc; do mv "$f" "$outdir/"; done
        for f in ${exp_name}_??_${v}.nc; do mv "$f" "$outdir/"; done
    done
    outdir="$archive_dir/restart/${formatted_leg_number}"
    mkdir -p "${outdir}"
    for f in ${exp_name}_${ns}_restart_???_????.nc
    do
        [ -f "$f" ] && mv "$f" "${outdir}"
    done
    outdir="$archive_dir/log/${formatted_leg_number}"
    mkdir -p "${outdir}"
    for f in ocean.output time.step ; do mv "${f}" "${outdir}"; done
    cp -f namelist_ice_ref namelist_ice_cfg namelist_ref namelist_cfg ${archive_dir}
    [[ -f ${start_dir}/${SLURM_JOB_NAME-"run"}.sh ]] && cp -f ${start_dir}/${SLURM_JOB_NAME-"run"}.sh ${archive_dir}
    shopt -u nullglob
    # Append this leg's bookkeeping to the info file (also echoed to the log).
    tr=$(date -d "0 -$time_begin sec + $time_end sec" +%T)
    current_date=$(date +'%F %T')
    {
        echo "#"
        echo "# Finished leg at ${current_date} after ${tr} (hh:mm:ss)"
        echo "leg_number=${leg_number}"
        echo "leg_start_date=\"${leg_start_date}\""
        echo "leg_end_date=\"${leg_end_date}\""
    } | tee -a "${info_file}"
    special_restart=false
done
cd - >/dev/null
# With "noresubmit" on the command line, stop after this job's legs.
[[ "$*" == *noresubmit* ]] && exit 0
if (( leg_end_epoch < run_end_epoch )) ; then
    echo "Leg end earlier than end of simulation."
    echo "Submitting another job."
    if [[ "$*" == *"run"* ]] ; then
        # Local/interactive mode: simply re-exec this script.
        exec "$0" "$@"
    elif hash sbatch 2>/dev/null; then
        # Need to go to start_dir to find the run script
        cd ${start_dir}
        echo "sbatch -N ${SLURM_JOB_NUM_NODES-"1"} -o ${run_dir}/$(basename ${stdout_file}).$(printf %03d $((leg_number+1))) -e ${run_dir}/$(basename ${stdout_file}).$(printf %03d $((leg_number+1))) -d ${SLURM_JOB_ID-"id"} ./${SLURM_JOB_NAME-"run"}.sh"
        # Submit command
        # Note: This does not work if you specify a job name with sbatch -J jobname!
        sbatch -N ${SLURM_JOB_NUM_NODES-"1"} \
            -o ${run_dir}/$(basename ${stdout_file}).$(printf %03d $((leg_number+1))) \
            -e ${run_dir}/$(basename ${stdout_file}).$(printf %03d $((leg_number+1))) \
            -d ${SLURM_JOB_ID-"id"} \
            ./${SLURM_JOB_NAME-"run"}.sh
        #
    else
        # Fall back to PBS when SLURM is not available.
        cd ${start_dir}
        echo "qsub ${PBS_JOBNAME}.sh"
        qsub ./${PBS_JOBNAME}.sh
    fi
else
    echo "Nothing left to do. Cleaning and exiting." # FIXME Factorize this (we have two exit points)
    for (( n=0 ; n<nem_numproc ; n++ ))
    do
        np=$(printf %04d ${n})
        rm -f "restart_oce_${np}.nc"
        rm -f "restart_ice_${np}.nc"
        rm -f "restart_icb_${np}.nc"
    done
fi
exit 0