#! /bin/bash
#
# Coral - collaborative job submission script manager for NEMO
#
# Usage: coral <command> [<arguments>]
#
# Every command is implemented as a shell function of the same name; the
# dispatcher at the bottom of this file validates $1 and calls it with the
# remaining arguments.
#
# The helpers `doc` and `die` are not defined here; they are expected to come
# from utils/tools.inc (sourced below). From the call sites in this file:
#   doc "$@" <<EOT ... EOT   receives the command's arguments and its help text
#                            on stdin (presumably prints it on -h; TODO confirm)
#   die <code> <message>     presumably prints <message> and exits with <code>
#                            (TODO confirm against tools.inc)
#

# Name of the generated job submission script (written to the current directory).
output_script="run.sh"

CORAL_HOME="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ; export CORAL_HOME
HOSTNAME="$(hostname -s)" ; export HOSTNAME
FQDN=$(hostname) ; export FQDN
# First e-mail address found in the user's passwd entry (empty if none).
# '-' is placed last inside the brackets so it is a literal, not a range.
USER_EMAIL=$(getent passwd "${USER}" \
  | grep -Eo '[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.-]+\.[a-z]+' \
  | head -n 1) ; export USER_EMAIL
MAINTAINER_EMAIL="damien.francois@uclouvain.be" ; export MAINTAINER_EMAIL

source "${CORAL_HOME}"/utils/tools.inc

#######################################
# Populate the current directory from a template.
# Globals:   CORAL_HOME, HOSTNAME, output_script (read)
# Arguments: $1 - template name (optional, defaults to ${HOSTNAME})
# Outputs:   progress messages on stdout; writes .coral_template
# Returns:   dies with 1 if the template is missing, 2 if the directory
#            already contains coral files
#######################################
init() {
  doc "$@" <<EOT
init [<template name>] - Initialize current directory with files from template <template name>

Templates contain configuration files, namelists, and xios configurations.
By default, the template name is the machine short name (as given by hostname -s)
EOT

  local template_name=${1:-${HOSTNAME}}
  local template_path=${CORAL_HOME}/templates/${template_name}

  [[ -d ${template_path} ]] || die 1 "Error: Template '${template_name}' not found."

  # Refuse to overwrite anything a previous init/build may have created.
  [[ -f experiment.cfg ]] && die 2 "Error: Directory not empty. Remove all *.cfg files."
  [[ -d namelists ]] && die 2 "Error: Directory not empty. Remove directory namelists."
  [[ -d xios_config ]] && die 2 "Error: Directory not empty. Remove directory xios_config."
  [[ -f ${output_script} ]] && die 2 "Error: Directory not empty. Remove ${output_script}"

  echo -e "Importing '${template_name}' template in the current directory..."
  # files.txt is a '|'-separated table shared by all templates.
  column -t -s'|' < "${template_path}"/../files.txt

  cp "${template_path}"/*cfg .
  cp -r "${template_path}"/xios_config .
  cp -r "${template_path}"/namelists .
  echo "${template_name}" > .coral_template

  echo -e "Done. Modify them to your needs and then run 'coral build' to build and"
  echo -e "review the submission script and 'coral submit' to submit the job."
}

#######################################
# Assemble ${output_script} from the *.cfg files in the current directory and
# the skeleton script shipped with coral.
# Globals:   CORAL_HOME, USER_EMAIL, output_script (read)
# Arguments: none (forwarded to doc)
# Outputs:   progress messages on stdout; (re)writes ${output_script}
# Returns:   dies with 1 if experiment.cfg is missing, 3 if generation fails
#######################################
build() {
  doc "$@" <<EOT
build - Build submission script from the configuraiton files in the current directory.
EOT

  [[ -f experiment.cfg ]] || die 1 "Error: configuration files not found. Run 'coral init' first"

  echo -e "Generating job submission script '${output_script}'..."

  echo "#!/bin/bash" > "${output_script}"
  (
    set -e
    source experiment.cfg
    export exp_name
    export nem_config_name
    source programs.cfg
    export nem_numproc
    export xio_numproc
    export total_nb_cpu=$((nem_numproc + xio_numproc))
    export email=$USER_EMAIL
    # The scheduler is inferred from the directives used in job.cfg.
    submit_command=$(grep -q SBATCH job.cfg && echo sbatch || echo qsub) ; export submit_command

    echo -e "\n#\n# Job options\n#"
    grep -Ev '^(#|$)' < job.cfg \
      | sed "s/SBATCH/#SBATCH/" \
      | sed "s/PBS -/#PBS -/" \
      | envsubst '$exp_name,$email,$total_nb_cpu'

    echo -e "\n#\n# Experiment options\n#"
    # grep exits 1 when it selects no line; that is not an error here.
    grep -Ev '^(#|$)' < experiment.cfg || [[ $? -eq 1 ]]

    echo -e "\n#\n# Data configuration\n#"
    cat < data.cfg

    echo -e "\n#\n# Program configuration\n#"
    grep -Ev '^(#|$)' < programs.cfg || [[ $? -eq 1 ]]

    echo -e "\n#\n# Script logic\n#"
    grep -Ev '^[[:space:]]*#' < "${CORAL_HOME}"/templates/scripts/skeleton.sh \
      | envsubst '$submit_command'
  ) >>"${output_script}" 2>/dev/null
  # The status is checked on a separate line on purpose: writing
  # "( set -e; ... ) || die" would run the subshell in a context where
  # errexit is ignored, so failures in the middle would go unnoticed.
  local rc=$?
  (( rc == 0 )) || die 3 "Error: Syntax error in configuration files. Please review them."

  echo -e "Done. Run 'coral submit' to submit the job, or submit it manually."
}

#######################################
# Submit (or locally run) the script produced by build.
# Globals:   output_script, USER (read)
# Arguments: free-form option words (local, preponly, noresubmit,
#            fromscratch, verbose); forwarded to the script / scheduler
# Outputs:   scheduler output on stdout; appends the job id to
#            ${run_dir}/.coral_jobs
# Returns:   dies with 1 if ${output_script} does not exist
#######################################
submit() {
  doc "$@" <<EOT
submit [<options>] - Submit the job based on the submission script present in the current directory.

options can include:
 * local: run the script locally rather than submitting it to the job scheduler
 * preponly: run only the part of the script that prepares the run (copies files, links data, etc.)
 * noresubmit: run only one job and prevent submission of continuation jobs.
 * fromscratch: restart computations from scratch ignoring checkpoint files. Use with care.
 * verbose: show everything during submission
EOT

  [[ -f ${output_script} ]] || die 1 "Error: submission script not found. Run 'coral build' first"

  echo Making script executable and submitting it...
  chmod +x "${output_script}"
  (
    set -e
    source experiment.cfg
    # NOTE(review): eval executes every job.cfg line mentioning run_dir;
    # job.cfg must be trusted.
    eval "$(grep run_dir job.cfg)"

    # "$*" (not "$@") so that all options are matched as one string.
    if [[ "$*" == *fromscratch* ]]; then
      rm -rf "${run_dir:?}"
    fi
    if [[ "$*" == *local* ]]; then
      ./"${output_script}" "$@"
      exit 0
    fi

    if command -v sbatch &> /dev/null; then
      jobid="$(sbatch "${output_script}" "$@")"
      sleep 2
      jobid=${jobid##* }   # keep the last word of sbatch's output
      mkdir -p "${run_dir:?}"
      echo "${jobid}" >> "${run_dir}"/.coral_jobs
      #squeue --start -j "${jobid}" # FIXME
      squeue -j "${jobid}"
      exit 0
    fi

    if command -v qsub &> /dev/null; then
      if [[ $USER == "vsc"* ]]; then
        # For vsc* users the arguments are used as the account for qsub -A.
        credits="$*"
        if [[ -z "$credits" ]]; then
          credits="laerocloud"
        fi
        mkdir -p tmp
        echo "${output_script}" > tmp/script_name
        echo "${PWD}" > tmp/address
        qsub -A "$credits" "${output_script}" | tee -a coral_jobs
        sleep 2
        jobid=$(<coral_jobs)
        rm -f coral_jobs
        jobid=${jobid%%.*}
        mkdir -p "${run_dir:?}"
        echo "${jobid}" >> "${run_dir}"/.coral_jobs
        qstat
      else
        # "$*" keeps all options inside the single PBS_OPTIONS value; "$@"
        # would turn the 2nd and later options into extra qsub arguments.
        qsub -v PBS_OPTIONS="$*" "${output_script}" | tee -a coral_jobs
        sleep 2
        jobid=$(<coral_jobs)
        rm -f coral_jobs
        jobid=${jobid%.*}
        mkdir -p "${run_dir:?}"
        echo "${jobid}" >> "${run_dir}"/.coral_jobs
        qstat -J "${jobid}"
      fi
      exit 0
    fi
  )
  echo "Done."
}

#######################################
# Copy the configuration of the current directory into a template.
# Globals:   CORAL_HOME (read)
# Arguments: $1 - template name (optional, defaults to the name stored in
#                 .coral_template)
# Outputs:   progress messages on stdout; rewrites .coral_template
# Returns:   dies with 1 if no name can be determined or files are missing
#######################################
save() {
  doc "$@" <<EOT
save [<name>] - Save configuration files in the current template or create a new template if <name> is given.
The new template can consequently be used with 'coral init <name>' in another directory'
EOT

  local target
  local current_template=""
  local template

  # .coral_template is optional when a name is passed explicitly.
  if [[ -f .coral_template ]]; then
    current_template=$(<.coral_template)
  fi
  template=${1-$current_template}
  target=$CORAL_HOME/templates/$template

  [[ -z ${template} ]] && die 1 "Error: No template name defined."
  [[ -f experiment.cfg ]] || die 1 "Error: No configuration files found."
  [[ -d namelists ]] || die 1 "Error: namelist directory not found."
  [[ -d xios_config ]] || die 1 "Error: xios_config directory not found."

  echo "Saving template ${template} to ${target}..."
  mkdir -p "${target}"
  rsync -q -va -- *.cfg namelists xios_config "${target}"
  echo "${template}" > .coral_template
  echo Done.
}

#######################################
# Make a template readable by other users of the machine.
# Globals:   CORAL_HOME, HOME (read)
# Arguments: $1 - template name (optional, defaults to the name stored in
#                 .coral_template)
# Outputs:   progress messages and import instructions on stdout
# Returns:   dies with 1 if .coral_template is missing
#######################################
share() {
  doc "$@" <<EOT
share [<name>] - Make template files for template '<name>' world readable and display instructions on how to import the template.'
EOT

  local curr
  local target
  local current_template
  local template

  [[ -f .coral_template ]] || die 1 "No coral template found in current directory."
  current_template=$(<.coral_template)
  template=${1-$current_template}
  target=$CORAL_HOME/templates/${template}

  [[ -d ${target} ]] || save "${template}"

  echo "Making sure the template is world readable..."
  chmod o+rx "${target}"
  # Open every parent directory for traversal, stopping at $HOME (or / when
  # the template does not live under $HOME).
  curr=${target}
  while curr="$(dirname "${curr}")"; do
    chmod o+x "${curr}"
    if [[ "${curr}" == "$HOME" || "${curr}" == / ]]; then
      break
    fi
  done

  echo -e "Done. Other users can now use template '${template}' by issuing \n coral import $target"
}

#######################################
# Copy a template directory into the local coral installation.
# Globals:   CORAL_HOME (read)
# Arguments: $1 - path of the template directory to import
# Outputs:   progress messages on stdout
# Returns:   dies with 1 if no path is given or it is not a directory
#######################################
import() {
  doc "$@" <<EOT
import <path> - Import template located at <path> into the local coral installation.
EOT

  [[ -n $1 ]] || die 1 "Usage: $(basename "$0") import <path>"

  local source_dir
  local template_name
  # rsync copies the *contents* of a source ending in '/', which would spill
  # the template files directly into templates/; strip trailing slashes.
  source_dir=$1
  while [[ ${source_dir} == */ && ${source_dir} != / ]]; do
    source_dir=${source_dir%/}
  done
  [[ -d ${source_dir} ]] || die 1 "Error: '$1' is not a directory."
  template_name=$(basename "${source_dir}")

  echo "Importing template in $1..."
  rsync -q -va -- "${source_dir}" "$CORAL_HOME"/templates
  echo "Done. You can test it with 'coral init ${template_name}'"
}

#######################################
# Commit a template to the local Hg repository and e-mail the maintainer.
# Globals:   CORAL_HOME, MAINTAINER_EMAIL, HOSTNAME, FQDN, USER (read);
#            template, target (exported)
# Arguments: $1 - template name (optional, defaults to the name stored in
#                 .coral_template)
# Outputs:   progress messages on stdout
# Returns:   dies with 1 if .coral_template is missing, 3 if mail fails
#######################################
publish() {
  doc "$@" <<EOT
publish [<template>] - Submits template to the central repository manager for inclusion in the main coral repository.
EOT

  [[ -f .coral_template ]] || die 1 "Error: No template name defined."

  local current_template
  current_template=$(<.coral_template)
  export template=${1-$current_template}
  export target=$CORAL_HOME/templates/${template}

  [[ -d $target ]] || save "${template}"

  echo "Commiting template to local Hg repository and contacting repository master..."
  cd "$CORAL_HOME" && (
    # Add the template if Hg reports it as untracked, then commit any change.
    hg status | grep -q "^?.*$template" && hg -q add "${target}"
    hg status | grep -q "$template" && hg -q commit "${target}" -m"Commit template ${template}"
  )

  # Only announce success when mail actually accepted the message.
  if envsubst '$HOSTNAME,$FQDN,$USER,$CORAL_HOME' \
      < "${CORAL_HOME}"/templates/scripts/repomaster.txt \
      | mail -s "Coral template publish request" "$MAINTAINER_EMAIL"; then
    echo "Email to $MAINTAINER_EMAIL sent."
  else
    die 3 "Error: could not send email to $MAINTAINER_EMAIL."
  fi
}

#######################################
# Show the state of the simulation whose run directory is the current one.
# Arguments: none (forwarded to doc)
# Outputs:   tail of nemo.info, squeue entry of the last job, and a live
#            view of time.step, each only when available
# Returns:   dies with 2 if .coral_jobs is missing
#######################################
status() {
  doc "$@" <<EOT
status - Displays the current status of the simulation.
EOT

  [[ -f .coral_jobs ]] || die 2 "Error: no job information found in current directory. Make sure you run 'coral status' in the run directory."

  local jobid
  jobid=$(tail -n 1 .coral_jobs)

  if [[ -f nemo.info ]]; then
    tail -n 4 nemo.info
  fi

  if command -v squeue &> /dev/null && squeue -j "${jobid}" &> /dev/null; then
    echo "Current job:"
    squeue -j "${jobid}"
  fi

  if [[ -f time.step ]]; then
    echo "Current timestep: (hit CTRL-C to stop)"
    # Redraw the same terminal line once per second while the file exists.
    (
      while [ -f time.step ]; do
        echo -en "$(cat time.step)"
        sleep 1
        echo -en "\e[0K\r"
      done
    )
  fi
}

#######################################
# Pull and update the coral installation from its Hg repository.
#######################################
update() {
  doc "$@" <<EOT
update - undocumented. Do not use.
EOT
  ( cd "${CORAL_HOME}" && hg pull --update ; )
}

#######################################
# List the templates of the local coral installation.
# Outputs: `ls -l` of the templates directory without files.txt and scripts
#######################################
list() {
  doc "$@" <<EOT
list - List all templates available in local coral install
EOT
  # Human-readable listing only; the output is not parsed.
  ls -l "${CORAL_HOME}"/templates | grep -v files.txt | grep -v scripts
}

#######################################
# Print the general usage message on stdout.
#######################################
help() {
  cat <<RTFM
Usage: $0 <command> [<arguments>], where command is one of:

  init    - Initialize template in current directory. Default template is the machine name.
  build   - Build submission script from template in current directory.
  submit  - Submit job from submission script in current directory.
  status  - Displays the current status of the simulation.

  save    - Save modifications to template in current directory
  share   - Share template with other users on the same machine
  import  - Import template from other user on the same machine
  publish - Submit template for inclusing in central coral repository
  list    - List all available templates

Use 'coral <command> -h' for more information about a specific command.
RTFM
}

#######################################
# Print a bash completion snippet for the coral command on stdout.
#######################################
completion() {
  #FIXME
  cat <<'EOTCOMPLETION'
_coral ()   # By convention, the function name
{           #+ starts with an underscore.
  _get_comp_words_by_ref cur prev words cword
  _split_long_opt
  COMPREPLY=( $(compgen -W "init commit submit build edit share save import publish update list status" -- $cur) )
}
complete -F _coral coral
EOTCOMPLETION
}

# Dispatch: $1 must be a known command name AND a function that is actually
# defined (here or in tools.inc). The second test prevents listed-but-missing
# commands from falling through to "command not found" or to an unrelated
# external program of the same name.
if [[ ! "$1" =~ ^(init|commit|submit|build|edit|share|save|import|publish|update|list|status|completion)$ ]] \
    || ! declare -F "$1" > /dev/null; then
  help >&2
  exit 1
else
  "$@"
fi