visitor 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. #!/bin/sh
  2. #
  3. # UFla, May 2012
  4. #
  5. # This script visits (via ssh) all nodes associated with a certain batch job
  6. # and collects information about EC-EARTH 3 component processes. The
  7. # information is post-processed and printed.
  8. #
  9. # The script needs the nodes to be accessible with ssh and it is, in its
  10. # current form, tailored to a special batch system. Hence, it is made for
  11. # ekman.pdc.kth.se. Modifications for other systems should be doable.
  12. #
  13. # Known issue: The script can be hard to kill when it's in its ssh phase. Try
  14. # to kill it right after it has printed a line.
  # Layout shared by both header rows: date, time and job id, followed by
  # min/max/sum columns for each of the three components (IFS, NEMO, OASIS).
  # The leading '#' marks the row as a header/comment line.
  strfmt="#%10s%10s%13s%10s%10s%10s%10s%10s%10s%10s%10s%10s\n"

  # Horizontal rule: '#' followed by 123 dashes, i.e. as wide as one header row.
  line=$(printf "#%123s" " " | tr " " "-")

  # Table header. The format string is deliberately held in a variable so both
  # header rows use the same column widths.
  printf '%s\n' "$line"
  printf "$strfmt" "" "" "" "" "IFS" "" "" "NEMO" "" "" "OASIS" ""
  printf "$strfmt" "Date" "Time" "Job ID" "min" "max" "sum" "min" "max" "sum" "min" "max" "sum"
  printf '%s\n' "$line"

  # Scratch file that collects the per-node ps output. mktemp gives it an
  # unpredictable name outside the current directory; the EXIT trap removes it
  # on every exit path, and the signal trap turns HUP/INT/TERM into a normal
  # exit so that the cleanup runs in those cases as well.
  tmpfile=$(mktemp "${TMPDIR:-/tmp}/visitor.XXXXXX") || exit 1
  trap 'rm -f "$tmpfile"' EXIT
  trap 'exit 1' HUP INT TERM

  # Command executed on every node: command name and rss of the three
  # component executables, one process per line, without a header line.
  visit_cmd="ps --no-headers -o comm,rss -C ifsmaster-ecconf,nemo.exe,oasis3.MPI1.x"
  # usage
  #
  # Print a one-line synopsis of the accepted options to stdout. Takes no
  # arguments.
  usage()
  {
    # ${0##*/} strips the directory part of the script path inside the shell,
    # so a path containing blanks cannot be split into several words the way
    # an unquoted `basename $0` would be.
    printf '%s\n' "Usage: ${0##*/} -j JOBID -t SLEEP_TIME_SEC"
  }
  # meminfo FILE PATTERN
  #
  # Summarise the second column of FILE over all lines whose first column
  # matches PATTERN.
  #   $1  file with one "<command> <value>" pair per line (for example the
  #       output of `ps -o comm,rss`)
  #   $2  awk regular expression that is matched against the first column
  # Prints the minimum, maximum and sum of the matching values as three
  # right-aligned 10-character columns, without a trailing newline. When no
  # line matches, all three columns are 0, so the table stays aligned.
  meminfo()
  {
    awk -v prog="$2" '
      BEGIN {
        n = 0
        sum = 0
        min = 0
        max = 0
      }
      $1 ~ prog {
        value = $2 + 0
        # The first match seeds the minimum; no sentinel value is needed.
        if (n == 0 || value < min) min = value
        if (value > max) max = value
        sum += value
        n++
      }
      END {
        printf("%10.0f%10.0f%10.0f", min, max, sum)
      }
    ' "$1"
  }
  # Command-line parsing: the argument of -j is stored in job_id and the
  # argument of -t in sleep_time. Anything else reported by getopts prints the
  # usage line and terminates the script with status 1.
  while getopts "j:t:" opt; do
    if [ "$opt" = "j" ]; then
      job_id=$OPTARG
    elif [ "$opt" = "t" ]; then
      sleep_time=$OPTARG
    else
      usage
      exit 1
    fi
  done
  # Sampling loop. Each pass
  #   1. determines the job id (only if none is known yet),
  #   2. runs $visit_cmd via ssh on every node that spusage lists for the job
  #      and appends the output to $tmpfile,
  #   3. prints one table row: date, time, job id and the meminfo summary for
  #      each of the three components.
  # The loop ends after one pass when no sleep time was given with -t;
  # otherwise it pauses for $sleep_time and repeats. Diagnostics go to stderr
  # so they do not end up inside the table on stdout.
  while true
  do
    if [ -z "$job_id" ]
    then
      # No -j given: take the first job of $USER whose state starts with "run".
      # NOTE(review): presumably field 2 of the spq output is the job id and
      # field 4 its state -- confirm against the batch system.
      for j in $(spq -q -u "${USER}" | awk '{print $2","$4}')
      do
        # Substring test and prefix strip with POSIX constructs only; the
        # script runs under /bin/sh.
        case $j in
          *,run*)
            job_id=${j%%,*}
            break
            ;;
        esac
      done
    fi
    if [ -z "$job_id" ]
    then
      echo "No running job found and no job id given with -j." >&2
      exit 1
    fi

    # Start every pass with a clean scratch file.
    rm -f "$tmpfile"

    # Nodes of the job: lines of the spusage output whose fifth field contains
    # the job id; the first field is taken as the node name.
    for n in $(spusage | awk -v id="$job_id" 'index($5, id) {print $1}')
    do
      # NOTE(review): ssh hands back the exit status of the remote command, so
      # a node on which ps finds none of the processes presumably ends up in
      # this branch too -- confirm whether that should abort the run.
      if ! ssh "$n" "$visit_cmd" 2>/dev/null 1>> "$tmpfile"
      then
        echo "ssh to node $n failed." >&2
        rm -f "$tmpfile"
        exit 1
      fi
    done

    # The scratch file only exists if at least one node was visited.
    if [ ! -r "$tmpfile" ]
    then
      echo "No nodes found for job $job_id." >&2
      exit 1
    fi

    # One date call, split at the blank into date and time columns.
    now=$(date +'%Y-%m-%d %T')
    printf " %10s%10s%13s" "${now% *}" "${now#* }" "$job_id"
    meminfo "$tmpfile" "ifsmaster"
    meminfo "$tmpfile" "nemo"
    meminfo "$tmpfile" "oasis"
    echo
    rm -f "$tmpfile"

    # Single shot unless -t was given.
    test -z "$sleep_time" && break
    # Pause for the value of -t, independent of the order of the options.
    sleep "$sleep_time"
  done