#!/bin/bash
#
# Coral - collaborative job submission script manager for NEMO
#

# Name of the job submission script generated by 'build' and used by 'submit'.
output_script="run.sh"

# Absolute path of the directory containing this script.
CORAL_HOME="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
export CORAL_HOME

# Short host name; 'init' uses it as the default template name.
HOSTNAME="$(hostname -s)"
export HOSTNAME
FQDN="$(hostname)"
export FQDN

#######################################
# Print the first e-mail address found on stdin (nothing if there is none).
# Inside the bracket expressions '-' is placed last so that it is a literal;
# a range such as '_-.' is rejected by grep ("Invalid range end").
#######################################
_coral_extract_email() {
  grep -Eo '[[:alnum:]._-]+@[[:alnum:].-]+\.[[:alpha:]]+' | head -n 1
}

# E-mail address of the current user as found in the passwd database entry;
# empty when the entry contains no address.
USER_EMAIL="$(getent passwd "${USER:-$(id -un)}" | _coral_extract_email)"
export USER_EMAIL
MAINTAINER_EMAIL="damien.francois@uclouvain.be"
export MAINTAINER_EMAIL

# Helper functions used below ('doc' and 'die') are not defined in this file;
# presumably they come from tools.inc -- confirm.
source "${CORAL_HOME}"/utils/tools.inc
  12. init() {
  13. doc "$@" <<EOT
  14. init [<template name>] - Initialize current directory with files from template <template name>
  15. Templates contain configuration files, namelists, and xios configurations.
  16. By default, the template name is the machine short name (as given by hostname -s)
  17. EOT
  18. local template_name=${1:-${HOSTNAME}}
  19. local template_path=${CORAL_HOME}/templates/${template_name}
  20. [[ ! -d ${template_path} ]] && die 1 "Error: Template '${template_name}' not found."
  21. [[ -f experiment.cfg ]] && die 2 "Error: Directory not empty. Remove all *.cfg files."
  22. [[ -d namelists ]] && die 2 "Error: Directory not empty. Remove directory namelists."
  23. [[ -d xios_config ]] && die 2 "Error: Directory not empty. Remove directory xios_config."
  24. [[ -f ${output_script} ]] && die 2 "Error: Directory not empty. Remove ${output_script}"
  25. echo -e "Importing '${template_name}' template in the current directory..."
  26. < "${template_path}"/../files.txt column -t -s\|
  27. cp "${template_path}"/*cfg .
  28. cp -r "${template_path}"/xios_config .
  29. cp -r "${template_path}"/namelists .
  30. echo "${template_name}" > .coral_template
  31. echo -e "Done. Modify them to your needs and then run 'coral build' to build and"
  32. echo -e "review the submission script and 'coral submit' to submit the job."
  33. }
  34. build() {
  35. doc "$@" <<EOT
  36. build - Build submission script from the configuraiton files in the current directory.
  37. EOT
  38. [[ -f experiment.cfg ]] || die 1 "Error: configuration files not found. Run 'coral init' first"
  39. echo -e "Generating job submission script '${output_script}'..."
  40. echo "#!/bin/bash" >${output_script} ; (
  41. set -e
  42. source experiment.cfg
  43. export exp_name
  44. export nem_config_name
  45. source programs.cfg
  46. export nem_numproc
  47. export xio_numproc
  48. export total_nb_cpu=$((nem_numproc + xio_numproc))
  49. export email=$USER_EMAIL
  50. submit_command=$(grep -q SBATCH job.cfg && echo sbatch || echo qsub) ; export submit_command
  51. echo -e "\n#\n# Job options\n#"
  52. <job.cfg egrep -v "(^\#|^$)" | sed "s/SBATCH/#SBATCH/" | sed "s/PBS -/#PBS -/" | envsubst \$exp_name,\$email,\$total_nb_cpu
  53. echo -e "\n#\n# Experiment options\n#"
  54. <experiment.cfg egrep -v "(^\#|^$)"
  55. echo -e "\n#\n# Data configuration\n#"
  56. <data.cfg cat
  57. echo -e "\n#\n# Program configuration\n#"
  58. <programs.cfg egrep -v "(^\#|^$)"
  59. echo -e "\n#\n# Script logic\n#"
  60. <"${CORAL_HOME}"/templates/scripts/skeleton.sh egrep -v '^[[:space:]]*\#' | envsubst \$submit_command
  61. ) >>${output_script} 2>/dev/null \
  62. || die 3 "Error: Syntax error in configuration files. Please review them."
  63. echo -e "Done. Run 'coral submit' to submit the job, or submit it manually."
  64. }
  65. build_isfcpl() {
  66. doc "$@" <<EOT
  67. build - Build submission script from the configuraiton files in the current directory.
  68. EOT
  69. [[ -f experiment.cfg ]] || die 1 "Error: configuration files not found. Run 'coral init' first"
  70. echo -e "Generating job submission script '${output_script}'..."
  71. echo "#!/bin/bash" >${output_script} ; (
  72. set -e
  73. source experiment.cfg
  74. export exp_name
  75. export nem_config_name
  76. source programs.cfg
  77. export nem_numproc
  78. export xio_numproc
  79. export total_nb_cpu=$((nem_numproc + xio_numproc))
  80. export email=$USER_EMAIL
  81. submit_command=$(grep -q SBATCH job.cfg && echo sbatch || echo qsub) ; export submit_command
  82. echo -e "\n#\n# Job options\n#"
  83. <job.cfg egrep -v "(^\#|^$)" | sed "s/SBATCH/#SBATCH/" | sed "s/PBS -/#PBS -/" | envsubst \$exp_name,\$email,\$total_nb_cpu
  84. echo -e "\n#\n# Experiment options\n#"
  85. <experiment.cfg egrep -v "(^\#|^$)"
  86. echo -e "\n#\n# Data configuration\n#"
  87. <data.cfg cat
  88. echo -e "\n#\n# Program configuration\n#"
  89. <programs.cfg egrep -v "(^\#|^$)"
  90. echo -e "\n#\n# Script logic\n#"
  91. <"${CORAL_HOME}"/templates/scripts/skeleton_isfcpl.sh egrep -v '^[[:space:]]*\#' | envsubst \$submit_command
  92. ) >>${output_script} 2>/dev/null \
  93. || die 3 "Error: Syntax error in configuration files. Please review them."
  94. echo -e "Done. Run 'coral submit' to submit the job, or submit it manually."
  95. }
  96. submit() {
  97. doc "$@" <<EOT
  98. submit [<options>] - Submit the job based on the submission script present in the current directory.
  99. options can include:
  100. * local: run the script locally rather than submitting it to the job scheduler
  101. * preponly: run only the part of the script that prepares the run (copies files, links data, etc.)
  102. * noresubmit: run only one job and prevent submission of continuation jobs.
  103. * fromscratch: restart computations from scratch ignoring checkpoint files. Use with care.
  104. * verbose: show everything during submission
  105. EOT
  106. [[ -f ${output_script} ]] || die 1 "Error: submission script not found. Run 'coral build' first"
  107. echo Making script executable and submitting it...
  108. chmod +x ${output_script}
  109. (
  110. set -e
  111. source experiment.cfg
  112. eval "$(grep out_dir job.cfg)"
  113. [[ "$@" == *fromscratch* ]] && rm -rf "${out_dir:?}"
  114. [[ "$@" == *local* ]] && { ./"${output_script}" "$@" ; exit 0; }
  115. which sbatch &> /dev/null && {
  116. jobid="$(sbatch "${output_script}" "$@")"
  117. sleep 2
  118. jobid=${jobid##* }
  119. mkdir -p "${out_dir:?}"
  120. echo "${jobid}" >> "${out_dir}"/.coral_jobs ;
  121. #squeue --start -j "${jobid}" # FIXME
  122. squeue -j "${jobid}"
  123. exit 0;
  124. }
  125. which qsub &> /dev/null && {
  126. if [[ $USER == "vsc"* ]]; then
  127. credits=$@
  128. if [ -z "$credits" ]; then
  129. credits="laerocloud"
  130. fi
  131. mkdir -p tmp
  132. echo ${output_script} > tmp/script_name
  133. echo ${PWD} > tmp/address
  134. qsub -A $credits "${output_script}" | tee -a coral_jobs;
  135. sleep 2
  136. jobid=`cat coral_jobs`
  137. rm -f coral_jobs
  138. jobid=${jobid%%.*}
  139. mkdir -p "${out_dir:?}"
  140. echo "${jobid}" >> "${out_dir}"/.coral_jobs ;
  141. qstat
  142. else
  143. qsub -v PBS_OPTIONS="$@" "${output_script}" | tee -a coral_jobs;
  144. sleep 2
  145. jobid=`cat coral_jobs`
  146. rm -f coral_jobs
  147. jobid=${jobid%.*}
  148. mkdir -p "${out_dir:?}"
  149. echo "${jobid}" >> "${out_dir}"/.coral_jobs ;
  150. qstat -J "${jobid}"
  151. fi
  152. exit 0;
  153. }
  154. )
  155. echo "Done."
  156. }
  157. save() {
  158. doc "$@" <<EOT
  159. save [<name>] - Save configuration files in the current template or create a new template if <name> is given.
  160. The new template can consequently be used with 'coral init <name>' in another directory'
  161. EOT
  162. local target
  163. local current_template
  164. local template
  165. local target
  166. current_template=$(<.coral_template)
  167. template=${1-$current_template}
  168. target=$CORAL_HOME/templates/$template
  169. [[ -z ${template} ]] && die 1 "Error: No template name defined."
  170. [[ -f experiment.cfg ]] || die 1 "Error: No configuration files found."
  171. [[ -d namelists ]] || die 1 "Error: namelist directory not found."
  172. [[ -d xios_config ]] || die 1 "Error: xios_config directory not found."
  173. echo "Saving template ${1-$current_template} to ${target}..."
  174. mkdir -p "${target}"
  175. rsync -q -va -- *.cfg namelists xios_config "${target}"
  176. echo "${template}" > .coral_template
  177. echo Done.
  178. }
  179. share() {
  180. doc "$@" <<EOT
  181. share [<name>] - Make template files for template '<name>' world readable and display
  182. instructions on how to import the template.'
  183. EOT
  184. local curr
  185. local target
  186. local current_template
  187. [[ -f .coral_template ]] || die 1 "No coral template found in current directory."
  188. current_template=$(<.coral_template)
  189. template=${1-$current_template}
  190. target=$CORAL_HOME/templates/${template}
  191. [[ -d ${target} ]] || save "${template}"
  192. echo "Making sure the template is world readable..."
  193. chmod o+rx "${target}"
  194. export curr=${target}
  195. while curr="$(dirname "${curr}")" ; do
  196. chmod o+x "${curr}"
  197. [[ "${curr}" == "$HOME" || "${curr}" == / ]] && break ;
  198. done
  199. echo -e "Done. Other users can now use template '${template}' by issuing \n coral import $target"
  200. }
  201. import() {
  202. doc "$@" <<EOT
  203. import <path> - Import template located at <path> into the local coral installation.
  204. EOT
  205. [[ -n $1 ]] || die 1 "Usage: $(basename "$0") import <path>"
  206. local template_name
  207. template_name=$(basename "$1")
  208. echo "Importing template in $1..."
  209. rsync -q -va "$1" "$CORAL_HOME"/templates
  210. echo "Done. You can test it with 'coral init ${template_name}'"
  211. }
  212. publish() {
  213. doc "$@" <<EOT
  214. publish [<template>] - Submits template to the central repository manager for inclusion
  215. in the main coral repository.
  216. EOT
  217. [[ -f .coral_template ]] || die 1 "Error: No template name defined."
  218. local current_template
  219. current_template=$(<.coral_template)
  220. export template=${1-$current_template}
  221. export target=$CORAL_HOME/templates/${template}
  222. [[ -d $target ]] || save "${template}"
  223. echo "Commiting template to local Hg repository and contacting repository master..."
  224. cd "$CORAL_HOME" && (
  225. hg status | grep -q "^?.*$template" && hg -q add "${target}"
  226. hg status | grep -q "$template" && hg -q commit "${target}" -m"Commit template ${template}" )
  227. <"${CORAL_HOME}"/templates/scripts/repomaster.txt envsubst \$HOSTNAME,\$FQDN,\$USER,\$CORAL_HOME |\
  228. mail -s "Coral template publish request" $MAINTAINER_EMAIL
  229. echo "Email to $MAINTAINER_EMAIL sent."
  230. }
  231. status() {
  232. doc "$@" <<EOT
  233. status - Displays the current status of the simulation.
  234. EOT
  235. [[ -f .coral_jobs ]] || die 2 "Error: no job information found in current directory. Make sure you run 'coral status' in the run directory."
  236. local jobid
  237. jobid=$(tail -1 .coral_jobs)
  238. [[ -f nemo.info ]] && {
  239. tail -4 nemo.info
  240. }
  241. which squeue &> /dev/null && squeue -j "${jobid}" &>/dev/null && {
  242. echo "Current job:"
  243. squeue -j "${jobid}"
  244. }
  245. [[ -f time.step ]] && {
  246. echo "Current timestep: (hit CTRL-C to stop)"
  247. (while [ -f time.step ] ;do echo -en "$(cat time.step)" ; sleep 1 ; echo -en "\e[0K\r" ; done)
  248. }
  249. }
  250. update() {
  251. doc "$@" <<EOT
  252. update - undocumented. Do not use.
  253. EOT
  254. ( cd "${CORAL_HOME}" && hg pull --update ; )
  255. }
  256. list() {
  257. doc "$@" <<EOT
  258. list - List all templates available in local coral install
  259. EOT
  260. ls -l "${CORAL_HOME}"/templates | grep -v files.txt | grep -v scripts
  261. }
  262. help() {
  263. cat <<RTFM
  264. Usage: $0 <command> [<arguments>], where command is one of:
  265. init - Initialize template in current directory. Default template is the machine name.
  266. build - Build submission script from template in current directory.
  267. build_isfcpl - Build submission script from template in current directory, incl. isfcpl coupling.
  268. submit - Submit job from submission script in current directory.
  269. status - Displays the current status of the simulation.
  270. save - Save modifications to template in current directory
  271. share - Share template with other users on the same machine
  272. import - Import template from other user on the same machine
  273. publish - Submit template for inclusing in central coral repository
  274. list - List all available templates
  275. Use 'coral <command> -h' for more information about a specific command.
  276. RTFM
  277. }
  278. completion() { #FIXME
  279. cat <<'EOTCOMPLETION'
  280. _coral () # By convention, the function name
  281. { #+ starts with an underscore.
  282. _get_comp_words_by_ref cur prev words cword
  283. _split_long_opt
  284. COMPREPLY=( $(compgen -W "init commit submit build build_isfcpl edit share save import publish update list status" -- $cur) )
  285. }
  286. complete -F _coral coral
  287. EOTCOMPLETION
  288. }
  289. if [[ ! "$1" =~ ^(init|commit|submit|build|build_isfcpl|edit|share|save|import|publish|update|list|status|completion)$ ]]; then
  290. help >&2
  291. exit 1
  292. else
  293. "$@"
  294. fi