tm5.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. #! /usr/bin/env python
  2. import rc
  3. import os
  4. import sys
  5. import subprocess
  6. import glob
  7. from optparse import OptionParser
  8. from datetime import timedelta
  9. import time
  10. import re
  11. import pdb
  12. """
  13. Module with a class dedicated to TM5 rcfiles
  14. rctm5 : derived RcFile class, with specifics for TM5.
  15. In a module you can do:
  16. import tm5
  17. otm = tm5.rctm5('chem-test-cbm4.rc')
  18. otm.display()
  19. otm.run(queue=True)
  20. if otm.basic_checkrun():
  21. elapsed = otm.get_runtime(verbose=True)
  22. otm.cleanup()
  23. Note that it can be initialized with a 'raw' rcfile, which lets you modify keys
  24. before expansion:
  25. otm = tm5.rctm5('chem-test-cbm4.rc', raw=True)
  26. otm.replace( 'my.project.dir',' ${my.scratch}/another-proj-dir' )
  27. otm.WriteFile('my-new-rcfile') # needed for run to account for the change!
  28. ntm = tm5.rctm5('my-new-rcfile')
  29. ntm.run()
  30. """
  31. class rctm5(rc.RcFile):
  32. """
  33. Derived RcFile class, which:
  34. Adds methods:
  35. 'run'
  36. 'basic_checkrun'
  37. 'cleanup'
  38. Adds the following attributes (derived or existing in the rc dico but used a lot):
  39. self.rundone : T/F
  40. self.status : 'unknown', 'crashed', 'running', 'done'
  41. self.runtype : 'foreground', 'queue'
  42. self.region1 : name of first region (eg: 'glb600x400')
  43. self.restart : restart file
  44. self.timestamp : 'startdate_enddate' string used in mmix, budget,... filenames
  45. self.logcomp : name of log of setup_tm5 script
  46. self.rundir : run directory
  47. self.ok : tm5.ok fully qualified filename
  48. Also expands path of filename at init.
  49. """
  def __init__(self, filename, raw=False):
      """
      Read the rcfile and initialise the run bookkeeping attributes.

      filename : rcfile path; a leading '~' is expanded before reading.
      raw      : forwarded to rc.RcFile.__init__. When true, 'submit.to'
                 is not read and self.runtype is set to 'undefined'.
      """
      expanded = os.path.expanduser(filename)
      rc.RcFile.__init__(self, expanded, raw=raw)

      # Derived file names must exist before basic_checkrun() looks at them.
      self.setfilenames()

      self.rundone = False
      self.status = 'unknown'
      self.runtype = 'undefined' if raw else self.get('submit.to')

      # May flip self.rundone to True if the run already completed.
      self.basic_checkrun(verbose=False)
      self.runtime = {}
  def setfilenames(self):
      """
      Derive the file names and labels used by the other methods.

      From the rc keys 'restart.write.dir', 'timerange.start',
      'timerange.end', 'my.region1' and 'my.run.dir', sets:
        self.region1   : value of 'my.region1'
        self.restart   : <restart.write.dir>/TM5_restart_<YYYYMMDD>_<HHMM>_<region1>.nc,
                         stamped with the END of the time range
        self.timestamp : '<start YYYYMMDDHH>_<end YYYYMMDDHH>'
        self.rundir    : value of 'my.run.dir'
        self.ok        : <rundir>/tm5.ok
      If any of these keys cannot be read or parsed, all five attributes are
      left as empty strings, so they always exist.

      self.logcomp is always set: the rcfile name with its extension replaced
      by '.out', joined to os.curdir.
      """
      def split_stamp(value):
          # 'YYYY-MM-DD HH:MM:SS' -> ('YYYYMMDD', 'HHMM', 'HH')
          fields = value.split()
          day, clock = fields[0], fields[1]
          return (day.replace('-', ''),
                  clock[:5].replace(':', ''),
                  clock[:3].replace(':', ''))

      # Defaults first: every attribute is defined even on failure.
      self.region1 = ''
      self.restart = ''
      self.timestamp = ''
      self.rundir = ''
      self.ok = ''

      try:
          restart_dir = self.get('restart.write.dir')
          region = self.get('my.region1')
          rundir = self.get('my.run.dir')
          end_ymd, end_hhmm, end_hh = split_stamp(self.get('timerange.end'))
          # The start hour must come from the START time (it used to be
          # taken from the end time, duplicating the end hour).
          start_ymd, _, start_hh = split_stamp(self.get('timerange.start'))

          restart_name = "TM5_restart_%s_%s_%s.nc" % (end_ymd, end_hhmm, region)
          restart = os.path.join(restart_dir, restart_name)
          timestamp = "%s%s_%s%s" % (start_ymd, start_hh, end_ymd, end_hh)
          ok_file = os.path.join(rundir, 'tm5.ok')
      except Exception:
          # Best effort, as before: the exception type raised by the
          # inherited get() for a missing key is not visible from here.
          pass
      else:
          self.region1 = region
          self.restart = restart
          self.timestamp = timestamp
          self.rundir = rundir
          self.ok = ok_file

      # Setup_tm5 script log (compilation if any)
      self.logcomp = os.path.join(
          os.curdir, os.path.splitext(self.filename)[0] + '.out')
  def display(self):
      """
      Print basic info about this tm5-rc object to stdout: rcfile name,
      run directory, final restart file and whether it exists, status,
      run type and the setup_tm5 log name.

      Each line is formatted into a single string so the output is the
      same under the Python 2 print statement and the print function.
      """
      print("RC file : %s" % (self.filename,))
      print("run dir : %s" % (self.get('my.run.dir'),))
      print("end restart : %s" % (self.restart,))
      print(" ...exists : %s" % (os.path.isfile(self.restart),))
      print("status : %s" % (self.status,))
      print("fg/bg/queue : %s" % (self.runtype,))
      print("log compil. : %s" % (self.logcomp,))
      print("--------------------------")
  def run(self, force=False, clean=False, queue=True):
      """
      Call ./setup_tm5 on the rcfile, unless the final restart already exists.

      force : run even if the final restart file exists.
      clean : pass '-n' to setup_tm5 (re-compile everything, 'build' dir
              is removed).
      queue : pass '-q' to setup_tm5 (use the queue manager) and set
              self.runtype to 'queue'; otherwise run with the setting in
              the rc file.

      The output of setup_tm5 (stdout and stderr) goes to self.logcomp.
      Updates self.status ('running', 'crashed' or 'done') and self.rundone.

      Raises Exception if setup_tm5 returns a non-zero exit code.
      """
      if queue:
          self.runtype = "queue"

      # Nothing to do when the final restart is already there.
      if not force and os.path.isfile(self.restart):
          print('skipping run for ' + self.filename + ' (final restart already exists)')
          self.status = 'done'
          self.rundone = True
          return

      command = [os.path.join(os.curdir, 'setup_tm5'), self.filename, '-s']
      if queue:
          command.append("-q")
      if clean:
          command.append("-n")

      print("submitting run for " + self.filename)
      # 'with' closes the log even if the subprocess cannot be started.
      with open(self.logcomp, 'w') as log:
          retcode = subprocess.call(command, stdout=log, stderr=subprocess.STDOUT)

      if retcode != 0:
          print("compilation failed. See:  %s" % (self.logcomp,))
          self.status = 'crashed'
          self.rundone = True
          raise Exception("setup_tm5 failed for %s (exit code %s), see %s"
                          % (self.filename, retcode, self.logcomp))

      print("submit ok")
      self.status = 'running'
      self.rundone = False
  def get_runtime(self, verbose=False, total=False):
      """
      Collect the wall-clock runtime of every available leg.

      Scans the '<my.basename>_NNN_run.out' logs in 'my.run.dir' for the
      'submit_tm5_step_run - wall time after run ... H:M:S (hh:mm:ss)' line
      and stores the result in self.runtime as a dictionary of
         key,val = log filename [string], runtime [timedelta]
      with one entry per leg that reports a wall time.

      verbose : print the runtime of each leg and the total.
      total   : if True, return the summed runtime as a timedelta
                (a zero timedelta when no runtime is available);
                otherwise return the per-leg dictionary.
      """
      # Non-greedy '.*?' so that the hours keep all their digits
      # (a greedy '.*' leaves only the last digit for the first group).
      wall_time = re.compile(
          r'submit_tm5_step_run - wall time after run.*?(\d+):(\d+):(\d+) \(hh:mm:ss\)')

      rundir = self.get('my.run.dir')
      runid = self.get('my.basename')
      logs = glob.glob(os.path.join(rundir, runid + '_[0-9][0-9][0-9]_run.out'))

      stat = {}
      # Sorted, so that the verbose listing follows the leg numbers.
      for fname in sorted(logs):
          with open(fname, 'r') as f:
              for line in f:
                  found = wall_time.match(line)
                  if not found:
                      continue
                  hours, minutes, seconds = [int(g) for g in found.groups()]
                  stat[fname] = timedelta(hours=hours, minutes=minutes,
                                          seconds=seconds)
                  if verbose:
                      print("%s: %s" % (os.path.basename(fname), stat[fname]))

      self.runtime = stat

      # Always defined, even when no log reports a runtime.
      total_rt = sum(stat.values(), timedelta())

      if verbose:
          if stat:
              print("total runtime: %s" % (total_rt,))
          else:
              print('no runtime available')

      if total:
          return total_rt
      return stat
  def basic_checkrun(self, restart=True, verbose=False):
      """
      Check if a run is successfully terminated by checking the
      existence of the tm5.ok file and optionally final restart file.

      restart : also require the final restart file (self.restart).
      verbose : print the outcome.

      Returns True (and sets self.rundone to True) on success, False
      otherwise; self.rundone is left untouched on failure.

      Note this is not bullet proof: if between legs and final restart
      remains from a previous run.
      """
      finished = os.path.exists(self.ok) and (
          not restart or os.path.exists(self.restart))

      if verbose:
          # One print per line: same text as a Python 2 trailing-comma
          # print followed by the verdict.
          verdict = ": sucessfully terminated" if finished else ": not finished or crashed"
          print("basic run check for " + self.filename + " " + verdict)
          print("--------------------------")

      if finished:
          self.rundone = True
      return finished
  def cleanup(self, full=False, verbose=False):
      """
      Remove the files produced by a run and reset self.rundone.

      Minimal cleanup: removes the final restart, the setup_tm5 log and
      every regular file directly inside 'my.run.dir' (sub-directories
      are left alone).

      full    : additionally remove everything get_output_list() reports
                (output and profiling directories).
      verbose : print the name of the rcfile being cleaned.
      """
      if verbose:
          print("cleaning up :  %s" % (self.filename,))

      # -- Minimal
      for path in (self.restart, self.logcomp):
          if os.path.isfile(path):
              os.remove(path)

      # Empty rundir. Output and profiling are left in their
      # own directory (if different from the rundir)
      rundir = self.get('my.run.dir')
      if os.path.isdir(rundir):
          for entry in os.listdir(rundir):
              path = os.path.join(rundir, entry)
              if os.path.isfile(path):
                  os.remove(path)

      self.rundone = False

      # -- Full
      if full:
          # Must be called on self (a bare get_output_list() is a NameError).
          # The isfile guard tolerates paths listed more than once.
          for path in self.get_output_list():
              if os.path.isfile(path):
                  os.remove(path)
  def get_output_list(self):
      """
      Return the list of existing files produced by the run.

      In order: the final restart and the setup_tm5 log (when they exist),
      then the regular files found directly inside 'my.run.dir',
      'output.dir' and '<output.dir>/<timing.output.subdir>'.
      Directories that do not exist contribute nothing, and a path is
      listed only once even when several of these directories coincide.
      """
      def plain_files(directory):
          # Regular files directly inside `directory` (no recursion).
          if not os.path.isdir(directory):
              return []
          candidates = [os.path.join(directory, name)
                        for name in os.listdir(directory)]
          return [path for path in candidates if os.path.isfile(path)]

      # restart and log
      found = [f for f in (self.restart, self.logcomp) if os.path.isfile(f)]

      # run dir
      found += plain_files(self.get('my.run.dir'))

      # output dir
      output_dir = self.get('output.dir')
      found += plain_files(output_dir)

      # profile dir
      profile_dir = os.path.join(output_dir, self.get('timing.output.subdir'))
      found += plain_files(profile_dir)

      # Drop repeated paths, keeping first-seen order.
      seen = set()
      out = []
      for path in found:
          if path not in seen:
              seen.add(path)
              out.append(path)
      return out
  def timers_dict(self, *timers):
      """
      Returns root timers of the first leg in a dictionary, if available.

      Reads '<output.dir>/<timing.output.subdir>/<my.basename>_001_0000.prf'
      and collects the lines between the '# index, total time, name' header
      and the '# for each timer, total times spent on child processes' line
      as {timer name: total time (float)}.

      If string arguments are passed, only their entries are returned
      (names not present in the file are silently skipped).

      Returns None, after printing a message, when the rc key
      'timing.output' is not 'T'. A missing profile file raises the error
      from open().
      """
      if self.get('timing.output') != 'T':
          print(self.filename + " doesn't have timers for first leg")
          return None

      # prf filename for root
      rundir = self.get('output.dir')
      subdir = self.get('timing.output.subdir')
      runid = self.get('my.basename')
      rootlog = os.path.join(rundir, subdir, runid + '_001_0000.prf')

      table_start = "# index, total time, name"
      table_end = "# for each timer, total times spent on child processes"

      alltimers = {}
      in_table = False
      with open(rootlog, 'r') as f:
          for line in f:
              if line.startswith(table_start):
                  in_table = True
                  continue
              if line.startswith(table_end):
                  in_table = False
                  continue
              if not in_table:
                  continue
              fields = line.split()
              # Skip blank or comment lines inside the table rather than
              # failing on the index / float conversion.
              if len(fields) < 2 or fields[0].startswith('#'):
                  continue
              alltimers[' '.join(fields[2:])] = float(fields[1])

      if timers:
          return dict((t, alltimers[t]) for t in timers if t in alltimers)
      return alltimers