#!/bin/bash
#
# Coral - collaborative job submission script manager for NEMO
#
# Globals defined here (the upper-case ones are exported so that child
# processes such as envsubst can see them):
#   output_script     name of the generated submission script
#   CORAL_HOME        absolute directory containing this script
#   HOSTNAME          short host name (hostname -s); default template name
#   FQDN              host name as printed by plain `hostname`
#   USER_EMAIL        e-mail address found in the user's passwd entry, if any
#   MAINTAINER_EMAIL  recipient of template publish requests
#

output_script="run.sh"

CORAL_HOME="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
export CORAL_HOME
HOSTNAME="$(hostname -s)"
export HOSTNAME
FQDN="$(hostname)"
export FQDN

# Print every e-mail address found on stdin, one per line.
# The previous pattern contained the reversed range `_-.` (rejected by GNU
# grep as "Invalid range end") and an unusable domain part, so nothing was
# ever matched. A literal '-' must sit last inside a bracket expression.
_extract_email() {
  grep -E -o '[a-zA-Z0-9_.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]+'
}

USER_EMAIL="$(getent passwd "${USER}" | _extract_email)"
export USER_EMAIL
MAINTAINER_EMAIL="damien.francois@uclouvain.be"
export MAINTAINER_EMAIL

# Helper functions used by the commands of this script (doc, die) are
# expected to come from this file.
# shellcheck source=/dev/null
source "${CORAL_HOME}"/utils/tools.inc
  12. init() {
  13. doc "$@" <<EOT
  14. init [<template name>] - Initialize current directory with files from template <template name>
  15. Templates contain configuration files, namelists, and xios configurations.
  16. By default, the template name is the machine short name (as given by hostname -s)
  17. EOT
  18. local template_name=${1:-${HOSTNAME}}
  19. local template_path=${CORAL_HOME}/templates/${template_name}
  20. [[ ! -d ${template_path} ]] && die 1 "Error: Template '${template_name}' not found."
  21. [[ -f experiment.cfg ]] && die 2 "Error: Directory not empty. Remove all *.cfg files."
  22. [[ -d namelists ]] && die 2 "Error: Directory not empty. Remove directory namelists."
  23. [[ -d xios_config ]] && die 2 "Error: Directory not empty. Remove directory xios_config."
  24. [[ -f ${output_script} ]] && die 2 "Error: Directory not empty. Remove ${output_script}"
  25. echo -e "Importing '${template_name}' template in the current directory..."
  26. < "${template_path}"/../files.txt column -t -s\|
  27. cp "${template_path}"/*cfg .
  28. cp -r "${template_path}"/xios_config .
  29. cp -r "${template_path}"/namelists .
  30. echo "${template_name}" > .coral_template
  31. echo -e "Done. Modify them to your needs and then run 'coral build' to build and"
  32. echo -e "review the submission script and 'coral submit' to submit the job."
  33. }
  34. build() {
  35. doc "$@" <<EOT
  36. build - Build submission script from the configuraiton files in the current directory.
  37. EOT
  38. [[ -f experiment.cfg ]] || die 1 "Error: configuration files not found. Run 'coral init' first"
  39. echo -e "Generating job submission script '${output_script}'..."
  40. echo "#!/bin/bash" >${output_script} ; (
  41. set -e
  42. source experiment.cfg
  43. export exp_name
  44. export nem_config_name
  45. source programs.cfg
  46. export nem_numproc
  47. export xio_numproc
  48. export total_nb_cpu=$((nem_numproc + xio_numproc))
  49. export email=$USER_EMAIL
  50. submit_command=$(grep -q SBATCH job.cfg && echo sbatch || echo qsub) ; export submit_command
  51. echo -e "\n#\n# Job options\n#"
  52. <job.cfg egrep -v "(^\#|^$)" | sed "s/SBATCH/#SBATCH/" | sed "s/PBS -/#PBS -/" | envsubst \$exp_name,\$email,\$total_nb_cpu
  53. echo -e "\n#\n# Experiment options\n#"
  54. <experiment.cfg egrep -v "(^\#|^$)"
  55. echo -e "\n#\n# Data configuration\n#"
  56. <data.cfg cat
  57. echo -e "\n#\n# Program configuration\n#"
  58. <programs.cfg egrep -v "(^\#|^$)"
  59. echo -e "\n#\n# Script logic\n#"
  60. <"${CORAL_HOME}"/templates/scripts/skeleton.sh egrep -v '^[[:space:]]*\#' | envsubst \$submit_command
  61. ) >>${output_script} 2>/dev/null \
  62. || die 3 "Error: Syntax error in configuration files. Please review them."
  63. echo -e "Done. Run 'coral submit' to submit the job, or submit it manually."
  64. }
  65. submit() {
  66. doc "$@" <<EOT
  67. submit [<options>] - Submit the job based on the submission script present in the current directory.
  68. options can include:
  69. * local: run the script locally rather than submitting it to the job scheduler
  70. * preponly: run only the part of the script that prepares the run (copies files, links data, etc.)
  71. * noresubmit: run only one job and prevent submission of continuation jobs.
  72. * fromscratch: restart computations from scratch ignoring checkpoint files. Use with care.
  73. * verbose: show everything during submission
  74. EOT
  75. [[ -f ${output_script} ]] || die 1 "Error: submission script not found. Run 'coral build' first"
  76. echo Making script executable and submitting it...
  77. chmod +x ${output_script}
  78. (
  79. set -e
  80. source experiment.cfg
  81. eval "$(grep run_dir job.cfg)"
  82. [[ "$@" == *fromscratch* ]] && rm -rf "${run_dir:?}"
  83. [[ "$@" == *local* ]] && { ./"${output_script}" "$@" ; exit 0; }
  84. which sbatch &> /dev/null && {
  85. jobid="$(sbatch "${output_script}" "$@")"
  86. sleep 2
  87. jobid=${jobid##* }
  88. mkdir -p "${run_dir:?}"
  89. echo "${jobid}" >> "${run_dir}"/.coral_jobs ;
  90. #squeue --start -j "${jobid}" # FIXME
  91. squeue -j "${jobid}"
  92. exit 0;
  93. }
  94. which qsub &> /dev/null && {
  95. if [[ $USER == "vsc"* ]]; then
  96. credits=$@
  97. if [ -z "$credits" ]; then
  98. credits="laerocloud"
  99. fi
  100. echo ${output_script} > script_name
  101. echo ${PWD} > address
  102. qsub -A $credits "${output_script}" | tee -a coral_jobs;
  103. sleep 2
  104. jobid=`cat coral_jobs`
  105. rm -f coral_jobs
  106. jobid=${jobid%%.*}
  107. mkdir -p "${run_dir:?}"
  108. echo "${jobid}" >> "${run_dir}"/.coral_jobs ;
  109. qstat
  110. else
  111. qsub -v PBS_OPTIONS="$@" "${output_script}" | tee -a coral_jobs;
  112. sleep 2
  113. jobid=`cat coral_jobs`
  114. rm -f coral_jobs
  115. jobid=${jobid%.*}
  116. mkdir -p "${run_dir:?}"
  117. echo "${jobid}" >> "${run_dir}"/.coral_jobs ;
  118. qstat -J "${jobid}"
  119. fi
  120. exit 0;
  121. }
  122. )
  123. echo "Done."
  124. }
  125. save() {
  126. doc "$@" <<EOT
  127. save [<name>] - Save configuration files in the current template or create a new template if <name> is given.
  128. The new template can consequently be used with 'coral init <name>' in another directory'
  129. EOT
  130. local target
  131. local current_template
  132. local template
  133. local target
  134. current_template=$(<.coral_template)
  135. template=${1-$current_template}
  136. target=$CORAL_HOME/templates/$template
  137. [[ -z ${template} ]] && die 1 "Error: No template name defined."
  138. [[ -f experiment.cfg ]] || die 1 "Error: No configuration files found."
  139. [[ -d namelists ]] || die 1 "Error: namelist directory not found."
  140. [[ -d xios_config ]] || die 1 "Error: xios_config directory not found."
  141. echo "Saving template ${1-$current_template} to ${target}..."
  142. mkdir -p "${target}"
  143. rsync -q -va -- *.cfg namelists xios_config "${target}"
  144. echo "${template}" > .coral_template
  145. echo Done.
  146. }
  147. share() {
  148. doc "$@" <<EOT
  149. share [<name>] - Make template files for template '<name>' world readable and display
  150. instructions on how to import the template.'
  151. EOT
  152. local curr
  153. local target
  154. local current_template
  155. [[ -f .coral_template ]] || die 1 "No coral template found in current directory."
  156. current_template=$(<.coral_template)
  157. template=${1-$current_template}
  158. target=$CORAL_HOME/templates/${template}
  159. [[ -d ${target} ]] || save "${template}"
  160. echo "Making sure the template is world readable..."
  161. chmod o+rx "${target}"
  162. export curr=${target}
  163. while curr="$(dirname "${curr}")" ; do
  164. chmod o+x "${curr}"
  165. [[ "${curr}" == "$HOME" || "${curr}" == / ]] && break ;
  166. done
  167. echo -e "Done. Other users can now use template '${template}' by issuing \n coral import $target"
  168. }
  169. import() {
  170. doc "$@" <<EOT
  171. import <path> - Import template located at <path> into the local coral installation.
  172. EOT
  173. [[ -n $1 ]] || die 1 "Usage: $(basename "$0") import <path>"
  174. local template_name
  175. template_name=$(basename "$1")
  176. echo "Importing template in $1..."
  177. rsync -q -va "$1" "$CORAL_HOME"/templates
  178. echo "Done. You can test it with 'coral init ${template_name}'"
  179. }
  180. publish() {
  181. doc "$@" <<EOT
  182. publish [<template>] - Submits template to the central repository manager for inclusion
  183. in the main coral repository.
  184. EOT
  185. [[ -f .coral_template ]] || die 1 "Error: No template name defined."
  186. local current_template
  187. current_template=$(<.coral_template)
  188. export template=${1-$current_template}
  189. export target=$CORAL_HOME/templates/${template}
  190. [[ -d $target ]] || save "${template}"
  191. echo "Commiting template to local Hg repository and contacting repository master..."
  192. cd "$CORAL_HOME" && (
  193. hg status | grep -q "^?.*$template" && hg -q add "${target}"
  194. hg status | grep -q "$template" && hg -q commit "${target}" -m"Commit template ${template}" )
  195. <"${CORAL_HOME}"/templates/scripts/repomaster.txt envsubst \$HOSTNAME,\$FQDN,\$USER,\$CORAL_HOME |\
  196. mail -s "Coral template publish request" $MAINTAINER_EMAIL
  197. echo "Email to $MAINTAINER_EMAIL sent."
  198. }
  199. status() {
  200. doc "$@" <<EOT
  201. status - Displays the current status of the simulation.
  202. EOT
  203. [[ -f .coral_jobs ]] || die 2 "Error: no job information found in current directory. Make sure you run 'coral status' in the run directory."
  204. local jobid
  205. jobid=$(tail -1 .coral_jobs)
  206. [[ -f nemo.info ]] && {
  207. tail -4 nemo.info
  208. }
  209. which squeue &> /dev/null && squeue -j "${jobid}" &>/dev/null && {
  210. echo "Current job:"
  211. squeue -j "${jobid}"
  212. }
  213. [[ -f time.step ]] && {
  214. echo "Current timestep: (hit CTRL-C to stop)"
  215. (while [ -f time.step ] ;do echo -en "$(cat time.step)" ; sleep 1 ; echo -en "\e[0K\r" ; done)
  216. }
  217. }
  218. update() {
  219. doc "$@" <<EOT
  220. update - undocumented. Do not use.
  221. EOT
  222. ( cd "${CORAL_HOME}" && hg pull --update ; )
  223. }
  224. list() {
  225. doc "$@" <<EOT
  226. list - List all templates available in local coral install
  227. EOT
  228. ls -l "${CORAL_HOME}"/templates | grep -v files.txt | grep -v scripts
  229. }
  230. help() {
  231. cat <<RTFM
  232. Usage: $0 <command> [<arguments>], where command is one of:
  233. init - Initialize template in current directory. Default template is the machine name.
  234. build - Build submission script from template in current directory.
  235. submit - Submit job from submission script in current directory.
  236. status - Displays the current status of the simulation.
  237. save - Save modifications to template in current directory
  238. share - Share template with other users on the same machine
  239. import - Import template from other user on the same machine
  240. publish - Submit template for inclusing in central coral repository
  241. list - List all available templates
  242. Use 'coral <command> -h' for more information about a specific command.
  243. RTFM
  244. }
  245. completion() { #FIXME
  246. cat <<'EOTCOMPLETION'
  247. _coral () # By convention, the function name
  248. { #+ starts with an underscore.
  249. _get_comp_words_by_ref cur prev words cword
  250. _split_long_opt
  251. COMPREPLY=( $(compgen -W "init commit submit build edit share save import publish update list status" -- $cur) )
  252. }
  253. complete -F _coral coral
  254. EOTCOMPLETION
  255. }
  256. if [[ ! "$1" =~ ^(init|commit|submit|build|edit|share|save|import|publish|update|list|status|completion)$ ]]; then
  257. help >&2
  258. exit 1
  259. else
  260. "$@"
  261. fi