123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- #!/bin/sh
- #
- # UFla, May 2012
- #
- # This script visits (via ssh) all nodes associated with a certain batch job
- # and collects information about EC-EARTH 3 component processes. The
- # information is post-processed and printed.
- #
- # The script needs the nodes to be accessible with ssh and it is, in its
- # current form, tailored to a special batch system. Hence, it is made for
- # ekman.pdc.kth.se. Modifications for other systems should be doable.
- #
- # Known issue: The script can be hard to kill when it is in its ssh phase. Try
- # to kill it right after it has printed a line.
# Row layout shared by both header rows: a leading '#', then twelve
# right-aligned fields (date, time, job id and min/max/sum for each of
# IFS, NEMO and OASIS). Total width is 124 characters.
strfmt='#%10s%10s%13s%10s%10s%10s%10s%10s%10s%10s%10s%10s\n'

# Horizontal rule: '#' followed by 123 dashes, i.e. as wide as one row.
line=$(printf '#%123s' ' ' | tr ' ' '-')

# Print the table header once, before any sampling starts.
# strfmt is deliberately used as the printf format; it is a fixed string
# defined above, never external data.
printf '%s\n' "$line"
printf "$strfmt" "" "" "" "" "IFS" "" "" "NEMO" "" "" "OASIS" ""
printf "$strfmt" "Date" "Time" "Job ID" "min" "max" "sum" "min" "max" "sum" "min" "max" "sum"
printf '%s\n' "$line"

# Scratch file that collects the ps output of all nodes for one sample.
# Created with mktemp so the name is unpredictable and does not depend on
# the current directory being writable; removed again on every exit path.
tmpfile=$(mktemp "${TMPDIR:-/tmp}/ece3-meminfo.XXXXXX") || exit 1
trap 'rm -f "$tmpfile"' EXIT
# Turn INT/TERM into a normal exit so the EXIT trap above still runs.
trap 'exit 130' INT
trap 'exit 143' TERM

# Command executed on every node: command name and RSS (as reported by ps,
# without a header line) of the IFS, NEMO and OASIS executables.
visit_cmd="ps --no-headers -o comm,rss -C ifsmaster-ecconf,nemo.exe,oasis3.MPI1.x"
# usage
#
# Print a one-line usage summary to standard output. The program name is
# taken from $0 with any leading directory part stripped; parameter
# expansion is used instead of basename so that paths containing spaces or
# a leading dash are handled.
usage()
{
  printf 'Usage: %s -j JOBID -t SLEEP_TIME_SEC\n' "${0##*/}"
}
# meminfo FILE PROG
#
# Summarise the second column of FILE for one program. Each line of FILE is
# expected to look like "<command> <value>" (the output of the per-node ps
# command); a line is counted when its first field matches the regular
# expression PROG.
#
# Output: minimum, maximum and sum of the matching values, printed as three
# right-aligned 10-character columns with no trailing newline. When no line
# matches, all three columns are 0 (instead of leaking an internal sentinel
# into the table).
meminfo()
{
  awk -v prog="$2" '
    $1 ~ prog {
      rss = $2 + 0                      # force numeric comparison
      if (n == 0 || rss < min) min = rss
      if (n == 0 || rss > max) max = rss
      sum += rss
      n++
    }
    END {
      # Unset awk variables evaluate to 0 in numeric context, so an empty
      # match set prints 0 0 0.
      printf("%10.0f%10.0f%10.0f", min, max, sum)
    }
  ' "$1"
}
# Parse command-line options:
#   -j JOBID  batch job to inspect (when omitted, a job is looked up later)
#   -t SEC    seconds to sleep between samples (when omitted, sample once)
# Anything else prints the usage text and exits with status 1.
while getopts "j:t:" opt
do
  case "$opt" in
    j)
      job_id=$OPTARG
      ;;
    t)
      sleep_time=$OPTARG
      ;;
    *)
      # getopts reports unknown options and missing arguments as "?"; an
      # explicit catch-all handles that without using "?" as a glob.
      # The usage text goes to stderr so it never mixes with the table.
      usage >&2
      exit 1
      ;;
  esac
done
# Sampling loop. Each pass resolves the job (if none was given), collects
# the ps output of every node of that job into $tmpfile and prints one table
# row. Runs once when no sleep time was given with -t, otherwise repeats
# until an error occurs or the script is killed.
while true
do
  # No job given with -j: take the first entry from spq whose 4th column
  # starts with "run" and use its 2nd column as the job id.
  # NOTE(review): presumably column 2 is the job id and column 4 the job
  # state -- confirm against the spq output format.
  if [ -z "$job_id" ]
  then
    for entry in $(spq -q -u "${USER}" | awk '{print $2","$4}')
    do
      # Match the substring ",run" (expr index would match any single one
      # of the characters r, u, n) and strip it with POSIX expansion.
      case "$entry" in
        *,run*)
          job_id=${entry%%,run*}
          break
          ;;
      esac
    done
  fi

  if [ -z "$job_id" ]
  then
    echo "No job ID given and no running job found." >&2
    exit 1
  fi

  # Start every sample from an empty scratch file.
  : > "$tmpfile" || exit 1

  # Visit each node whose spusage line (columns 1 and 5) mentions the job.
  for node in $(spusage | awk '{print $1,$5}' | grep -- "$job_id" | awk '{print $1}')
  do
    if ! ssh "$node" "$visit_cmd" 2>/dev/null 1>> "$tmpfile"
    then
      echo "ssh to node $node failed." >&2
      rm -f "$tmpfile"
      exit 1
    fi
  done

  # Nothing collected means no node was found for the job.
  if [ ! -s "$tmpfile" ]
  then
    echo "No nodes found for job $job_id." >&2
    rm -f "$tmpfile"
    exit 1
  fi

  # One table row: date, time, job id, then min/max/sum per component.
  # The timestamp is taken once and split at the space, so date and time
  # always belong to the same instant.
  now=$(date +'%Y-%m-%d %T')
  printf " %10s%10s%13s" "${now% *}" "${now#* }" "$job_id"
  meminfo "$tmpfile" "ifsmaster"
  meminfo "$tmpfile" "nemo"
  meminfo "$tmpfile" "oasis"
  echo
  rm -f "$tmpfile"

  # Without -t this is a one-shot report.
  [ -n "$sleep_time" ] || break

  # Sleep for the value given with -t (not a positional parameter, which
  # would be the job id when the script is called as "-j ID -t SEC").
  sleep "$sleep_time"
done
|