coral 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. #! /bin/bash
  2. #
  3. # Coral - collaborative job submission script manager for NEMO
  4. #
  5. output_script="run.sh"
  6. CORAL_HOME="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ; export CORAL_HOME
  7. HOSTNAME="$(hostname -s)" ; export HOSTNAME
  8. FQDN=$(hostname) ; export FQDN
  9. USER_EMAIL=$(getent passwd "${USER}" | egrep -o '[a-zA-Z0-9_-.]*@[]*\.[a-z]+') ; export USER_EMAIL
  10. MAINTAINER_EMAIL="damien.francois@uclouvain.be" ; export MAINTAINER_EMAIL
  11. source "${CORAL_HOME}"/utils/tools.inc
  12. init() {
  13. doc "$@" <<EOT
  14. init [<template name>] - Initialize current directory with files from template <template name>
  15. Templates contain configuration files, namelists, and xios configurations.
  16. By default, the template name is the machine short name (as given by hostname -s)
  17. EOT
  18. local template_name=${1:-${HOSTNAME}}
  19. local template_path=${CORAL_HOME}/templates/${template_name}
  20. [[ ! -d ${template_path} ]] && die 1 "Error: Template '${template_name}' not found."
  21. [[ -f experiment.cfg ]] && die 2 "Error: Directory not empty. Remove all *.cfg files."
  22. [[ -d namelists ]] && die 2 "Error: Directory not empty. Remove directory namelists."
  23. [[ -d xios_config ]] && die 2 "Error: Directory not empty. Remove directory xios_config."
  24. [[ -f ${output_script} ]] && die 2 "Error: Directory not empty. Remove ${output_script}"
  25. echo -e "Importing '${template_name}' template in the current directory..."
  26. < "${template_path}"/../files.txt column -t -s\|
  27. cp "${template_path}"/*cfg .
  28. cp -r "${template_path}"/xios_config .
  29. cp -r "${template_path}"/namelists .
  30. echo "${template_name}" > .coral_template
  31. echo -e "Done. Modify them to your needs and then run 'coral build' to build and"
  32. echo -e "review the submission script and 'coral submit' to submit the job."
  33. }
  34. build() {
  35. doc "$@" <<EOT
  36. build - Build submission script from the configuraiton files in the current directory.
  37. EOT
  38. [[ -f experiment.cfg ]] || die 1 "Error: configuration files not found. Run 'coral init' first"
  39. echo -e "Generating job submission script '${output_script}'..."
  40. echo "#!/bin/bash" >${output_script} ; (
  41. set -e
  42. source experiment.cfg
  43. export exp_name
  44. export nem_config_name
  45. source programs.cfg
  46. export nem_numproc
  47. export xio_numproc
  48. export total_nb_cpu=$((nem_numproc + xio_numproc))
  49. export email=$USER_EMAIL
  50. submit_command=$(grep -q SBATCH job.cfg && echo sbatch || echo qsub) ; export submit_command
  51. echo -e "\n#\n# Job options\n#"
  52. <job.cfg egrep -v "(^\#|^$)" | sed "s/SBATCH/#SBATCH/" | sed "s/PBS -/#PBS -/" | envsubst \$exp_name,\$email,\$total_nb_cpu
  53. echo -e "\n#\n# Experiment options\n#"
  54. <experiment.cfg egrep -v "(^\#|^$)"
  55. echo -e "\n#\n# Data configuration\n#"
  56. <data.cfg cat
  57. echo -e "\n#\n# Program configuration\n#"
  58. <programs.cfg egrep -v "(^\#|^$)"
  59. echo -e "\n#\n# Script logic\n#"
  60. <"${CORAL_HOME}"/templates/scripts/skeleton.sh egrep -v '^[[:space:]]*\#' | envsubst \$submit_command
  61. ) >>${output_script} 2>/dev/null \
  62. || die 3 "Error: Syntax error in configuration files. Please review them."
  63. echo -e "Done. Run 'coral submit' to submit the job, or submit it manually."
  64. }
  65. submit() {
  66. doc "$@" <<EOT
  67. submit [<options>] - Submit the job based on the submission script present in the current directory.
  68. options can include:
  69. * local: run the script locally rather than submitting it to the job scheduler
  70. * preponly: run only the part of the script that prepares the run (copies files, links data, etc.)
  71. * noresubmit: run only one job and prevent submission of continuation jobs.
  72. * fromscratch: restart computations from scratch ignoring checkpoint files. Use with care.
  73. * verbose: show everything during submission
  74. EOT
  75. [[ -f ${output_script} ]] || die 1 "Error: submission script not found. Run 'coral build' first"
  76. echo Making script executable and submitting it...
  77. chmod +x ${output_script}
  78. (
  79. set -e
  80. source experiment.cfg
  81. eval "$(grep run_dir job.cfg)"
  82. [[ "$@" == *fromscratch* ]] && rm -rf "${run_dir:?}"
  83. [[ "$@" == *local* ]] && { ./"${output_script}" "$@" ; exit 0; }
  84. which sbatch &> /dev/null && {
  85. jobid="$(sbatch "${output_script}" "$@")"
  86. sleep 2
  87. jobid=${jobid##* }
  88. mkdir -p "${run_dir:?}"
  89. echo "${jobid}" >> "${run_dir}"/.coral_jobs ;
  90. #squeue --start -j "${jobid}" # FIXME
  91. squeue -j "${jobid}"
  92. exit 0;
  93. }
  94. which qsub &> /dev/null && {
  95. #qsub "${output_script}" "$@" | tee -a "${run_dir}"/.coral_jobs;
  96. qsub -v PBS_OPTIONS="$@" "${output_script}" | tee -a coral_jobs;
  97. sleep 2
  98. jobid=`cat coral_jobs`
  99. rm -f coral_jobs
  100. jobid=${jobid%.*}
  101. mkdir -p "${run_dir:?}"
  102. echo "${jobid}" >> "${run_dir}"/.coral_jobs ;
  103. qstat -J "${jobid}"
  104. exit 0;
  105. }
  106. )
  107. echo "Done."
  108. }
  109. save() {
  110. doc "$@" <<EOT
  111. save [<name>] - Save configuration files in the current template or create a new template if <name> is given.
  112. The new template can consequently be used with 'coral init <name>' in another directory'
  113. EOT
  114. local target
  115. local current_template
  116. local template
  117. local target
  118. current_template=$(<.coral_template)
  119. template=${1-$current_template}
  120. target=$CORAL_HOME/templates/$template
  121. [[ -z ${template} ]] && die 1 "Error: No template name defined."
  122. [[ -f experiment.cfg ]] || die 1 "Error: No configuration files found."
  123. [[ -d namelists ]] || die 1 "Error: namelist directory not found."
  124. [[ -d xios_config ]] || die 1 "Error: xios_config directory not found."
  125. echo "Saving template ${1-$current_template} to ${target}..."
  126. mkdir -p "${target}"
  127. rsync -q -va -- *.cfg namelists xios_config "${target}"
  128. echo "${template}" > .coral_template
  129. echo Done.
  130. }
  131. share() {
  132. doc "$@" <<EOT
  133. share [<name>] - Make template files for template '<name>' world readable and display
  134. instructions on how to import the template.'
  135. EOT
  136. local curr
  137. local target
  138. local current_template
  139. [[ -f .coral_template ]] || die 1 "No coral template found in current directory."
  140. current_template=$(<.coral_template)
  141. template=${1-$current_template}
  142. target=$CORAL_HOME/templates/${template}
  143. [[ -d ${target} ]] || save "${template}"
  144. echo "Making sure the template is world readable..."
  145. chmod o+rx "${target}"
  146. export curr=${target}
  147. while curr="$(dirname "${curr}")" ; do
  148. chmod o+x "${curr}"
  149. [[ "${curr}" == "$HOME" || "${curr}" == / ]] && break ;
  150. done
  151. echo -e "Done. Other users can now use template '${template}' by issuing \n coral import $target"
  152. }
  153. import() {
  154. doc "$@" <<EOT
  155. import <path> - Import template located at <path> into the local coral installation.
  156. EOT
  157. [[ -n $1 ]] || die 1 "Usage: $(basename "$0") import <path>"
  158. local template_name
  159. template_name=$(basename "$1")
  160. echo "Importing template in $1..."
  161. rsync -q -va "$1" "$CORAL_HOME"/templates
  162. echo "Done. You can test it with 'coral init ${template_name}'"
  163. }
  164. publish() {
  165. doc "$@" <<EOT
  166. publish [<template>] - Submits template to the central repository manager for inclusion
  167. in the main coral repository.
  168. EOT
  169. [[ -f .coral_template ]] || die 1 "Error: No template name defined."
  170. local current_template
  171. current_template=$(<.coral_template)
  172. export template=${1-$current_template}
  173. export target=$CORAL_HOME/templates/${template}
  174. [[ -d $target ]] || save "${template}"
  175. echo "Commiting template to local Hg repository and contacting repository master..."
  176. cd "$CORAL_HOME" && (
  177. hg status | grep -q "^?.*$template" && hg -q add "${target}"
  178. hg status | grep -q "$template" && hg -q commit "${target}" -m"Commit template ${template}" )
  179. <"${CORAL_HOME}"/templates/scripts/repomaster.txt envsubst \$HOSTNAME,\$FQDN,\$USER,\$CORAL_HOME |\
  180. mail -s "Coral template publish request" $MAINTAINER_EMAIL
  181. echo "Email to $MAINTAINER_EMAIL sent."
  182. }
  183. status() {
  184. doc "$@" <<EOT
  185. status - Displays the current status of the simulation.
  186. EOT
  187. [[ -f .coral_jobs ]] || die 2 "Error: no job information found in current directory. Make sure you run 'coral status' in the run directory."
  188. local jobid
  189. jobid=$(tail -1 .coral_jobs)
  190. [[ -f nemo.info ]] && {
  191. tail -4 nemo.info
  192. }
  193. which squeue &> /dev/null && squeue -j "${jobid}" &>/dev/null && {
  194. echo "Current job:"
  195. squeue -j "${jobid}"
  196. }
  197. [[ -f time.step ]] && {
  198. echo "Current timestep: (hit CTRL-C to stop)"
  199. (while [ -f time.step ] ;do echo -en "$(cat time.step)" ; sleep 1 ; echo -en "\e[0K\r" ; done)
  200. }
  201. }
  202. update() {
  203. doc "$@" <<EOT
  204. update - undocumented. Do not use.
  205. EOT
  206. ( cd "${CORAL_HOME}" && hg pull --update ; )
  207. }
  208. list() {
  209. doc "$@" <<EOT
  210. list - List all templates available in local coral install
  211. EOT
  212. ls -l "${CORAL_HOME}"/templates | grep -v files.txt | grep -v scripts
  213. }
  214. help() {
  215. cat <<RTFM
  216. Usage: $0 <command> [<arguments>], where command is one of:
  217. init - Initialize template in current directory. Default template is the machine name.
  218. build - Build submission script from template in current directory.
  219. submit - Submit job from submission script in current directory.
  220. status - Displays the current status of the simulation.
  221. save - Save modifications to template in current directory
  222. share - Share template with other users on the same machine
  223. import - Import template from other user on the same machine
  224. publish - Submit template for inclusing in central coral repository
  225. list - List all available templates
  226. Use 'coral <command> -h' for more information about a specific command.
  227. RTFM
  228. }
  229. completion() { #FIXME
  230. cat <<'EOTCOMPLETION'
  231. _coral () # By convention, the function name
  232. { #+ starts with an underscore.
  233. _get_comp_words_by_ref cur prev words cword
  234. _split_long_opt
  235. COMPREPLY=( $(compgen -W "init commit submit build edit share save import publish update list status" -- $cur) )
  236. }
  237. complete -F _coral coral
  238. EOTCOMPLETION
  239. }
  240. if [[ ! "$1" =~ ^(init|commit|submit|build|edit|share|save|import|publish|update|list|status|completion)$ ]]; then
  241. help >&2
  242. exit 1
  243. else
  244. "$@"
  245. fi