  1. #!/bin/bash
  2. #
  3. # submit.sh
  4. #
  5. # Portable bash script to run LPJ-GUESS version:
  6. # BINARY
  7. # as a parallel job using SLURM on Aurora.
  8. #
  9. # Created automatically on DATE
  10. #
  11. # Usage:
  12. #
  13. # 1. Copy script to the directory where you want output written.
  14. # This will be called the RUN DIRECTORY.
  15. # 2. In an editor, set appropriate values for the variables NPROCESS,
  16. # INSFILE, GRIDLIST and OUTFILES (NB: no space after the = sign):
  17. NPROCESS=20 # NB: Should be multiple of 20 on Aurora!
  18. WALLTIME=150:00:00
  19. INSFILE=guess.ins
  20. INPUT_MODULE=cru_ncep
  21. GRIDLIST=gridlist.txt
  22. OUTFILES='*.out'
  23. # Where:
  24. # NPROCESS = number of processes in parallel job
  25. # WALLTIME = maximum wall (real) time for job hh:mm:ss
  26. # INSFILE = path to ins file from run directory
  27. # INPUT_MODULE = input module to use
  28. # GRIDLIST = path to gridlist file from run directory
  29. # OUTFILES = list of LPJ-GUESS output files in single quotes,
  30. # and separated by spaces (filenames only, including
  31. # extension, no directory.) Shell wildcards are allowed.
  32. #
  33. # 3. Run the script using the command:
  34. # ./submit.sh
  35. # or:
  36. # ./submit.sh [-n <name>] [-s <file>] [-i <ins-file>]
  37. #
  38. # All arguments are optional and interpreted as:
  39. # name = the name of the job (shown in PBS queue)
  40. # file = filename of a file which can override the variables
  41. # above
  42. # ins-file = instruction file to use, overrides the INSFILE
  43. # variable above
  44. #
  45. # Nothing to change past here
  46. ########################################################################
  47. # Exit if any command fails
  48. set -e
  49. # Handle the command line arguments
  50. while getopts ":n:s:i:" opt; do
  51. case $opt in
  52. n ) name=$OPTARG ;;
  53. s ) submit_vars_file=$OPTARG ;;
  54. i ) ins=$OPTARG ;;
  55. esac
  56. done
  57. # Override the submit variables with the contents of a file, if given
  58. if [ -n "$submit_vars_file" ]; then
  59. source $submit_vars_file
  60. fi
  61. # Override INSFILE with the ins-file parameter, if given
  62. if [ -n "$ins" ]; then
  63. INSFILE=$ins
  64. fi
  65. # On Aurora, the recommendation is to submit jobs with the --exclusive
  66. # option, so we get exclusive nodes. Since each node has 20 cores, we
  67. # should set NPROCESS to a multiple of 20 to avoid waste.
  68. # If you really want to, you could remove this check and the --exclusive
  69. # option below, but your jobs might then be disturbed by other jobs
  70. # sharing your nodes.
  71. CORES_PER_NODE=20
  72. if [[ $((NPROCESS%CORES_PER_NODE)) != 0 ]]; then
  73. echo "Please set NPROCESS to a multiple of 20 on Aurora!" >&2
  74. exit 1
  75. fi
  76. # Convert INSFILE to an absolute path since we will be starting the
  77. # guess instances from different directories.
  78. # Please note when porting this script: readlink may not be available
  79. # on non-Linux systems. Also, using absolute path names means the
  80. # instruction file needs to be in a place accessible from the nodes.
  81. INSFILE=$(readlink -f "$INSFILE")
  82. GRIDLIST_FILENAME=$(basename $GRIDLIST)
  83. # This function creates the gridlist files for each run by splitting
  84. # the original gridlist file into approximately equal parts.
  85. function split_gridlist {
  86. # Create empty gridlists first to make sure each run gets one
  87. for ((a=1; a <= NPROCESS ; a++))
  88. do
  89. echo > run$a/$GRIDLIST_FILENAME
  90. done
  91. # Figure out suitable number of lines per gridlist, get the number of
  92. # lines in original gridlist file, divide by NPROCESS and round up.
  93. local lines_per_run=$(wc -l $GRIDLIST | \
  94. awk '{ x = $1/'$NPROCESS'; d = (x == int(x)) ? x : int(x)+1; print d}')
  95. # Use the split command to split the files into temporary files
  96. split --suffix-length=4 --lines $lines_per_run $GRIDLIST tmpSPLITGRID_
  97. # Move the temporary files into the runX-directories
  98. local files=$(ls tmpSPLITGRID_*)
  99. local i=1
  100. for file in $files
  101. do
  102. mv $file run$i/$GRIDLIST_FILENAME
  103. i=$((i+1))
  104. done
  105. }
  106. # Create header of progress.sh script
  107. echo "##############################################################" > progress.sh
  108. echo "# PROGRESS.SH" >> progress.sh
  109. echo "# Upload current guess.log files from local nodes and check" >> progress.sh
  110. echo "# Usage: sh progress.sh" >> progress.sh
  111. echo >> progress.sh
  112. # Create a run subdirectory for each process and clean up
  113. for ((a=1; a <= NPROCESS ; a++))
  114. do
  115. mkdir -p run$a
  116. cd run$a ; rm -f guess.log ; rm -f $GRIDLIST_FILENAME ; cd ..
  117. echo "echo '********** Last few lines of ./run${a}/guess.log: **********'" >> progress.sh
  118. echo "tail ./run${a}/guess.log" >> progress.sh
  119. done
  120. split_gridlist
  121. # Create SLURM script to request place in queue
  122. cat <<EOF > guess.cmd
  123. #!/bin/bash
  124. #SBATCH -n $NPROCESS
  125. #SBATCH --time=$WALLTIME
  126. #SBATCH --exclusive
  127. set -e
  128. if ! type -P mpirun &> /dev/null; then
  129. echo "Didn't find mpirun! Make sure an MPI module is loaded in your" >&2
  130. echo "login script (~/.bashrc) and recompile LPJ-GUESS with MPI support!" >&2
  131. exit 1
  132. fi
  133. # If there's a script for setting up files on local disk, run it
  134. if [ -f setup_local.sh ]; then
  135. srun -n \$SLURM_NNODES -N \$SLURM_NNODES setup_local.sh
  136. fi
  137. # In each run directory, create a symbolic link to the node local storage
  138. for ((a=1; a <= $NPROCESS ; a++))
  139. do
  140. cd run\$a
  141. if [ -h local ]; then
  142. rm local
  143. fi
  144. ln -s \$SNIC_TMP local
  145. cd ..
  146. done
  147. mpirun -bind-to core BINARY -parallel -input $INPUT_MODULE $INSFILE
  148. EOF
  149. cat <<EOF > append.cmd
  150. #!/bin/bash
  151. #SBATCH -n 1
  152. #SBATCH --time=$WALLTIME
  153. set -e
  154. function append_files {
  155. local number_of_jobs=\$1
  156. local file=\$2
  157. cp run1/\$file \$file
  158. local i=""
  159. for ((i=2; i <= number_of_jobs; i++))
  160. do
  161. if [ -f run\$i/\$file ]; then
  162. cat run\$i/\$file | awk 'NR!=1 || NF==0 || \$1 == \$1+0 { print \$0 }' >> \$file
  163. fi
  164. done
  165. }
  166. pushd run1 &> /dev/null
  167. outfiles_unexpanded='$OUTFILES'
  168. outfiles_expanded=\$(echo \$outfiles_unexpanded)
  169. popd &> /dev/null
  170. for file in \$outfiles_expanded
  171. do
  172. append_files $NPROCESS \$file
  173. done
  174. cat run*/guess.log > guess.log
  175. EOF
  176. # Submit guess job
  177. append_dependency=$(sbatch -J ${name:-"guess"} guess.cmd | awk '{print $NF}')
  178. # Submit append job
  179. sbatch --dependency=afterok:$append_dependency -J ${name:-"guess"}"_append" append.cmd | awk '{print $NF}'