coral 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. #! /bin/bash
  2. #
  3. # Coral - collaborative job submission script manager for NEMO
  4. #
  # ---------------------------------------------------------------------------
  # Global configuration shared by every coral command.
  # ---------------------------------------------------------------------------

  # Name of the job submission script that 'coral build' writes and
  # 'coral submit' runs, relative to the current directory.
  output_script="run.sh"

  # Absolute path of the directory holding this script; templates and helper
  # files are looked up relative to it.
  CORAL_HOME="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  export CORAL_HOME

  # Short and full host names. The short name is the default template name.
  HOSTNAME="$(hostname -s)"
  export HOSTNAME
  FQDN="$(hostname)"
  export FQDN

  # _coral_extract_email
  #   Reads text on stdin and prints the first token that looks like an e-mail
  #   address (nothing if there is none). The '-' is kept last inside each
  #   bracket expression so that it is a literal and not a range operator.
  _coral_extract_email() {
    grep -Eo '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]+' | head -n 1
  }

  # E-mail address of the current user, taken from the passwd entry when it
  # contains one; empty otherwise.
  USER_EMAIL="$(getent passwd "${USER}" | _coral_extract_email)"
  export USER_EMAIL

  # Recipient of template publication requests (see 'coral publish').
  MAINTAINER_EMAIL="damien.francois@uclouvain.be"
  export MAINTAINER_EMAIL

  # Helper functions used by the commands below (at least 'doc' and 'die').
  source "${CORAL_HOME}"/utils/tools.inc
  # init [<template name>]
  #   Copies the *cfg files, the xios_config directory and the namelists
  #   directory of a template from ${CORAL_HOME}/templates into the current
  #   directory and records the template name in .coral_template.
  #   The template name defaults to ${HOSTNAME}.
  #   Stops through 'die' (code 1) when the template directory is missing and
  #   (code 2) when the current directory already holds coral files.
  #   'doc' and 'die' come from utils/tools.inc.
  init() {
    doc "$@" <<EOT
init [<template name>] - Initialize current directory with files from template <template name>
Templates contain configuration files, namelists, and xios configurations.
By default, the template name is the machine short name (as given by hostname -s)
EOT
    local tpl_name=${1:-${HOSTNAME}}
    local tpl_dir=${CORAL_HOME}/templates/${tpl_name}
    local leftover_dir

    if [[ ! -d ${tpl_dir} ]]; then
      die 1 "Error: Template '${tpl_name}' not found."
    fi

    # Refuse to overwrite anything a previous 'coral init' may have created.
    if [[ -f experiment.cfg ]]; then
      die 2 "Error: Directory not empty. Remove all *.cfg files."
    fi
    for leftover_dir in namelists xios_config; do
      if [[ -d ${leftover_dir} ]]; then
        die 2 "Error: Directory not empty. Remove directory ${leftover_dir}."
      fi
    done
    if [[ -f ${output_script} ]]; then
      die 2 "Error: Directory not empty. Remove ${output_script}"
    fi

    echo -e "Importing '${tpl_name}' template in the current directory..."
    # files.txt sits next to the template directories; show it as a table.
    column -t -s '|' <"${tpl_dir}"/../files.txt

    cp "${tpl_dir}"/*cfg .
    cp -r "${tpl_dir}"/xios_config "${tpl_dir}"/namelists .
    echo "${tpl_name}" >.coral_template

    echo -e "Done. Modify them to your needs and then run 'coral build' to build and"
    echo -e "review the submission script and 'coral submit' to submit the job."
  }
  # build
  #   Assembles the job submission script ${output_script} from job.cfg,
  #   experiment.cfg, data.cfg, programs.cfg and the skeleton script found in
  #   ${CORAL_HOME}/templates/scripts/skeleton.sh.
  #   Stops through 'die' with code 1 when experiment.cfg is missing and with
  #   code 3 when the generation step fails.
  #   'doc' and 'die' come from utils/tools.inc.
  build() {
    doc "$@" <<EOT
build - Build submission script from the configuration files in the current directory.
EOT
    [[ -f experiment.cfg ]] || die 1 "Error: configuration files not found. Run 'coral init' first"
    echo -e "Generating job submission script '${output_script}'..."
    echo "#!/bin/bash" >"${output_script}"

    # The generator runs in a subshell so that sourcing the configuration
    # files does not pollute the caller. It must NOT be the left-hand side of
    # '||': bash ignores 'set -e' inside such a context, which would let a
    # broken configuration file go unnoticed. The status is checked afterwards.
    local gen_status
    (
      set -e
      source experiment.cfg
      export exp_name
      export nem_config_name
      source programs.cfg
      export nem_numproc
      export xio_numproc
      export total_nb_cpu=$((nem_numproc + xio_numproc))
      export email=${USER_EMAIL}
      # The scheduler is guessed from the directives used in job.cfg.
      if grep -q SBATCH job.cfg; then
        submit_command=sbatch
      else
        submit_command=qsub
      fi
      export submit_command

      echo -e "\n#\n# Job options\n#"
      # Drop comments and blank lines, turn the directives back into
      # scheduler comments and fill in the three placeholders.
      grep -Ev '^(#|$)' <job.cfg \
        | sed "s/SBATCH/#SBATCH/" \
        | sed "s/PBS -/#PBS -/" \
        | envsubst '$exp_name,$email,$total_nb_cpu'

      echo -e "\n#\n# Experiment options\n#"
      grep -Ev '^(#|$)' <experiment.cfg

      echo -e "\n#\n# Data configuration\n#"
      cat <data.cfg

      echo -e "\n#\n# Program configuration\n#"
      grep -Ev '^(#|$)' <programs.cfg

      echo -e "\n#\n# Script logic\n#"
      grep -Ev '^[[:space:]]*#' <"${CORAL_HOME}"/templates/scripts/skeleton.sh \
        | envsubst '$submit_command'
    ) >>"${output_script}" 2>/dev/null
    gen_status=$?

    if ((gen_status != 0)); then
      die 3 "Error: Syntax error in configuration files. Please review them."
    fi
    echo -e "Done. Run 'coral submit' to submit the job, or submit it manually."
  }
  # _coral_record_job <jobid> <run_dir>
  #   Appends <jobid> to <run_dir>/.coral_jobs, creating <run_dir> if needed.
  _coral_record_job() {
    mkdir -p "${2:?}"
    echo "$1" >>"$2"/.coral_jobs
  }

  # submit [<options>]
  #   Makes ${output_script} executable and either runs it locally (option
  #   'local') or hands it to sbatch or qsub, whichever is found first. The job
  #   id is appended to <run_dir>/.coral_jobs, where run_dir is read from
  #   job.cfg.
  #   Returns non-zero, without printing "Done.", when the submission fails or
  #   when no scheduler is available.
  #   'doc' and 'die' come from utils/tools.inc.
  submit() {
    doc "$@" <<EOT
submit [<options>] - Submit the job based on the submission script present in the current directory.
options can include:
* local: run the script locally rather than submitting it to the job scheduler
* preponly: run only the part of the script that prepares the run (copies files, links data, etc.)
* noresubmit: run only one job and prevent submission of continuation jobs.
* fromscratch: restart computations from scratch ignoring checkpoint files. Use with care.
* verbose: show everything during submission
EOT
    [[ -f ${output_script} ]] || die 1 "Error: submission script not found. Run 'coral build' first"
    echo Making script executable and submitting it...
    chmod +x "${output_script}"

    # Everything below runs in a subshell so that 'set -e', the sourced
    # configuration and the early 'exit's stay contained. The subshell is a
    # plain command (not part of an '||' list) so that 'set -e' is honoured.
    local submit_status
    (
      set -e
      source experiment.cfg
      # NOTE(review): job.cfg lines mentioning run_dir are evaluated as shell
      # code; job.cfg is expected to be a trusted, user-owned file.
      eval "$(grep run_dir job.cfg)"

      # All options joined into one string, for substring tests and for
      # passing them as a single value.
      opts="$*"

      if [[ ${opts} == *fromscratch* ]]; then
        rm -rf "${run_dir:?}"
      fi

      if [[ ${opts} == *local* ]]; then
        ./"${output_script}" "$@"
        exit 0
      fi

      if command -v sbatch &>/dev/null; then
        jobid="$(sbatch "${output_script}" "$@")"
        sleep 2
        # Keep the last word of the sbatch output as the job id.
        jobid=${jobid##* }
        _coral_record_job "${jobid}" "${run_dir:?}"
        #squeue --start -j "${jobid}" # FIXME
        squeue -j "${jobid}"
        exit 0
      fi

      if command -v qsub &>/dev/null; then
        if [[ ${USER} == "vsc"* ]]; then
          # For these accounts the options are used as the name of the credit
          # account passed to 'qsub -A'.
          credits=${opts}
          if [[ -z ${credits} ]]; then
            credits="laerocloud"
          fi
          mkdir -p tmp
          echo "${output_script}" >tmp/script_name
          echo "${PWD}" >tmp/address
          jobid="$(qsub -A "${credits}" "${output_script}")"
          echo "${jobid}"
          sleep 2
          # Keep what precedes the first dot.
          jobid=${jobid%%.*}
          _coral_record_job "${jobid}" "${run_dir:?}"
          qstat
        else
          # "$*" keeps all options inside the single PBS_OPTIONS value.
          jobid="$(qsub -v PBS_OPTIONS="${opts}" "${output_script}")"
          echo "${jobid}"
          sleep 2
          # Strip the last dot-separated component.
          jobid=${jobid%.*}
          _coral_record_job "${jobid}" "${run_dir:?}"
          qstat -J "${jobid}"
        fi
        exit 0
      fi

      echo "Error: neither sbatch nor qsub was found in PATH." >&2
      exit 1
    )
    submit_status=$?

    if ((submit_status != 0)); then
      echo "Error: job submission failed." >&2
      return "${submit_status}"
    fi
    echo "Done."
  }
  # save [<name>]
  #   Copies the *.cfg files, the namelists directory and the xios_config
  #   directory of the current directory into
  #   ${CORAL_HOME}/templates/<name> and records <name> in .coral_template.
  #   <name> defaults to the content of .coral_template when that file exists.
  #   Stops through 'die' (code 1) when no name is known, when the expected
  #   files are missing, or when the copy fails.
  #   'doc' and 'die' come from utils/tools.inc.
  save() {
    doc "$@" <<EOT
save [<name>] - Save configuration files in the current template or create a new template if <name> is given.
The new template can consequently be used with 'coral init <name>' in another directory'
EOT
    local current_template=""
    local template
    local target

    # .coral_template is optional here: a brand new template can be created
    # from a directory that was never initialised with 'coral init'.
    if [[ -f .coral_template ]]; then
      current_template=$(<.coral_template)
    fi
    template=${1-$current_template}
    target=${CORAL_HOME}/templates/${template}

    [[ -z ${template} ]] && die 1 "Error: No template name defined."
    [[ -f experiment.cfg ]] || die 1 "Error: No configuration files found."
    [[ -d namelists ]] || die 1 "Error: namelist directory not found."
    [[ -d xios_config ]] || die 1 "Error: xios_config directory not found."

    echo "Saving template ${template} to ${target}..."
    mkdir -p "${target}"
    rsync -q -va -- *.cfg namelists xios_config "${target}" \
      || die 1 "Error: could not copy files to ${target}."
    # Only remember the template name once the files are safely stored.
    echo "${template}" >.coral_template
    echo Done.
  }
  # share [<name>]
  #   Makes the template ${CORAL_HOME}/templates/<name> readable by other
  #   users and grants them traversal rights on every parent directory up to
  #   ${HOME} or /. The template is first created with 'save' when it does not
  #   exist yet. <name> defaults to the content of .coral_template.
  #   Stops through 'die' (code 1) when .coral_template is missing.
  #   'doc' and 'die' come from utils/tools.inc.
  share() {
    doc "$@" <<EOT
share [<name>] - Make template files for template '<name>' world readable and display
instructions on how to import the template.'
EOT
    local ancestor
    local target
    local current_template
    local template

    [[ -f .coral_template ]] || die 1 "No coral template found in current directory."
    current_template=$(<.coral_template)
    template=${1-$current_template}
    target=${CORAL_HOME}/templates/${template}
    [[ -d ${target} ]] || save "${template}"

    echo "Making sure the template is world readable..."
    # Recursive, so that the files inside the template are readable too;
    # 'X' adds the search bit on directories (and already-executable files).
    chmod -R o+rX "${target}"

    # Walk up the directory tree and let others traverse each parent.
    ancestor=${target}
    while ancestor=$(dirname "${ancestor}"); do
      chmod o+x "${ancestor}"
      case ${ancestor} in
        # '.' is what dirname settles on for a relative path; stopping there
        # prevents an endless loop.
        "${HOME}" | / | .) break ;;
      esac
    done
    echo -e "Done. Other users can now use template '${template}' by issuing \n coral import $target"
  }
  # import <path>
  #   Copies the template directory <path> into ${CORAL_HOME}/templates, under
  #   its own base name.
  #   Stops through 'die' (code 1) when <path> is missing or the copy fails.
  #   'doc' and 'die' come from utils/tools.inc.
  import() {
    doc "$@" <<EOT
import <path> - Import template located at <path> into the local coral installation.
EOT
    [[ -n $1 ]] || die 1 "Usage: $(basename "$0") import <path>"
    local source_dir=$1
    local template_name

    # With a trailing slash rsync copies the *content* of the directory, which
    # would scatter the template files directly into templates/. Strip it so
    # that the directory itself is copied.
    while [[ ${source_dir} == */ && ${source_dir} != / ]]; do
      source_dir=${source_dir%/}
    done
    template_name=$(basename "${source_dir}")

    echo "Importing template in $1..."
    rsync -q -va -- "${source_dir}" "${CORAL_HOME}"/templates \
      || die 1 "Error: could not import template from $1."
    echo "Done. You can test it with 'coral init ${template_name}'"
  }
  # publish [<template>]
  #   Commits the template to the Mercurial repository located in
  #   ${CORAL_HOME} and mails the text of templates/scripts/repomaster.txt to
  #   ${MAINTAINER_EMAIL}. The template is first created with 'save' when it
  #   does not exist yet. <template> defaults to the content of
  #   .coral_template.
  #   Stops through 'die' (code 1) when .coral_template is missing; returns 1
  #   when the mail could not be sent.
  #   'doc' and 'die' come from utils/tools.inc.
  publish() {
    doc "$@" <<EOT
publish [<template>] - Submits template to the central repository manager for inclusion
in the main coral repository.
EOT
    [[ -f .coral_template ]] || die 1 "Error: No template name defined."
    local current_template
    local template
    local target
    current_template=$(<.coral_template)
    template=${1-$current_template}
    target=${CORAL_HOME}/templates/${template}
    [[ -d ${target} ]] || save "${template}"

    echo "Committing template to local Hg repository and contacting repository master..."
    # The directory change is confined to the subshell so that the caller's
    # working directory is left alone.
    (
      cd "${CORAL_HOME}" || exit 1
      # Add the template when hg reports it as untracked ('?'), then commit it
      # when hg reports any change mentioning it.
      hg status | grep -q "^?.*${template}" && hg -q add "${target}"
      hg status | grep -q "${template}" && hg -q commit "${target}" -m"Commit template ${template}"
    )

    # Only the four listed variables are substituted in the mail text.
    if envsubst '$HOSTNAME,$FQDN,$USER,$CORAL_HOME' \
      <"${CORAL_HOME}"/templates/scripts/repomaster.txt \
      | mail -s "Coral template publish request" "${MAINTAINER_EMAIL}"; then
      echo "Email to $MAINTAINER_EMAIL sent."
    else
      echo "Error: could not send email to ${MAINTAINER_EMAIL}." >&2
      return 1
    fi
  }
  # status
  #   Shows the state of the simulation whose run directory is the current
  #   directory: the last four lines of nemo.info, the squeue entry of the
  #   last job listed in .coral_jobs, and a live view of time.step (until
  #   that file disappears or the user hits CTRL-C).
  #   Stops through 'die' (code 2) when .coral_jobs is missing; otherwise
  #   returns 0, whichever of the optional files are present.
  #   'doc' and 'die' come from utils/tools.inc.
  status() {
    doc "$@" <<EOT
status - Displays the current status of the simulation.
EOT
    [[ -f .coral_jobs ]] || die 2 "Error: no job information found in current directory. Make sure you run 'coral status' in the run directory."
    local jobid
    jobid=$(tail -n 1 .coral_jobs)

    if [[ -f nemo.info ]]; then
      tail -n 4 nemo.info
    fi

    # The first squeue call only checks that the scheduler knows the job.
    if command -v squeue &>/dev/null && squeue -j "${jobid}" &>/dev/null; then
      echo "Current job:"
      squeue -j "${jobid}"
    fi

    if [[ -f time.step ]]; then
      echo "Current timestep: (hit CTRL-C to stop)"
      while [[ -f time.step ]]; do
        echo -en "$(cat time.step)"
        sleep 1
        # Erase to end of line and return to column 0 for the next value.
        echo -en "\e[0K\r"
      done
    fi
    # Missing optional files are not an error.
    return 0
  }
  # update
  #   Runs 'hg pull --update' from within ${CORAL_HOME}. The directory change
  #   happens in a subshell, so the caller's working directory is untouched.
  #   'doc' comes from utils/tools.inc.
  update() {
    doc "$@" <<EOT
update - undocumented. Do not use.
EOT
    (
      cd "${CORAL_HOME}" \
        && hg pull --update
    )
  }
  # list
  #   Prints the long listing of ${CORAL_HOME}/templates, leaving out every
  #   line that mentions files.txt or scripts.
  #   'doc' comes from utils/tools.inc.
  list() {
    doc "$@" <<EOT
list - List all templates available in local coral install
EOT
    ls -l "${CORAL_HOME}"/templates \
      | grep -v -e files.txt -e scripts
  }
  # help
  #   Prints the general usage message on stdout. $0 is expanded in the text,
  #   so the message shows the name the script was invoked with.
  #   Note: this function shadows the bash builtin of the same name.
  help() {
    cat <<RTFM
Usage: $0 <command> [<arguments>], where command is one of:
init - Initialize template in current directory. Default template is the machine name.
build - Build submission script from template in current directory.
submit - Submit job from submission script in current directory.
status - Displays the current status of the simulation.
save - Save modifications to template in current directory
share - Share template with other users on the same machine
import - Import template from other user on the same machine
publish - Submit template for inclusion in central coral repository
list - List all available templates
Use 'coral <command> -h' for more information about a specific command.
RTFM
  }
  # completion
  #   Prints a bash completion snippet for the 'coral' command on stdout; it
  #   is meant to be evaluated by the user's shell, e.g.
  #   eval "$(coral completion)".
  #   The emitted function relies only on bash's own COMP_WORDS / COMP_CWORD
  #   variables, so it also works when the bash-completion package is not
  #   loaded. Command names are offered for the first argument only; other
  #   positions fall back to bash's default completion ('-o default').
  completion() {
    cat <<'EOTCOMPLETION'
_coral () # By convention, the function name
{ #+ starts with an underscore.
local cur=${COMP_WORDS[COMP_CWORD]}
COMPREPLY=()
# Only the word right after 'coral' is a command name.
(( COMP_CWORD == 1 )) || return 0
COMPREPLY=( $(compgen -W "init commit submit build edit share save import publish update list status" -- "$cur") )
}
complete -o default -F _coral coral
EOTCOMPLETION
  }
  # Entry point: the first argument selects the command, the remaining ones
  # are handed to it unchanged. Anything that is not a known command name
  # prints the usage message on stderr and ends the script with status 1.
  case "$1" in
    init | commit | submit | build | edit | share | save | import | publish | update | list | status | completion)
      "$@"
      ;;
    *)
      help >&2
      exit 1
      ;;
  esac