
#
# TM5 run tools
#

# ***

def Command_Line( rcf, exe, args, in_debugger ) :

    """
    Return command line that runs executable.

    ARGUMENTS
        rcf
            Rcfile with settings.
        exe
            Name of executable.
        args
            Arguments to be passed to executable.
        in_debugger
            Set to 'True' if the job should be run in a debugger.

    RETURN VALUES
        cmndline
            Command line to be executed.
    """

    # external:
    import socket
    import logging

    # mpi run ?
    if rcf.get('par.mpi','bool') :

        # number of mpi tasks:
        ntask = rcf.get('par.ntask','int')

        # get command line:
        cmnd_exec = rcf.get('mpirun.command')
        cmnd_args = rcf.get('mpirun.args')

        # write command file ?
        cmdfile = rcf.get('mpirun.cmdfile',default='')
        if len(cmdfile) > 0 :
            # write command line for each task:
            f = open(cmdfile,'w')
            for i in range(ntask) : f.write( '%s %s\n' % (exe,args) )
            f.close()
        else :
            # otherwise, add the executable and its arguments:
            cmnd_args = '%s %s %s' % (cmnd_args,exe,args)
        #endif

        # write host file ?  PLS: This is done too early, and should be done
        # inside the *_run.jb script : that will let you specify nodes
        # scattered over different hosts (e.g. a linux cluster), and also different
        # from the current host (which probably is a login node)!  See
        # WriteJob below for the LoadLeveler and Slurm cases.
        #
        # Leave it here for other cases not handled in WriteJob yet.
        hostfile = rcf.get('mpirun.hostfile',default='')
        if len(hostfile) > 0 :
            # get hostname:
            hname = socket.gethostname()
            # write hostname for each task:
            f = open(hostfile,'w')
            for i in range(ntask) : f.write( '%s\n' % hname )
            f.close()
        #endif

    else :

        # standard run:
        cmnd_exec = exe
        cmnd_args = args

    #endif

    # run in debugger ?
    if in_debugger :
        # debugger type:
        debugger = rcf.get( 'debugger' )
        # get debugger command:
        debugger_call = rcf.get( 'debugger.command' )
        # large differences ...
        if debugger == 'totalview' :
            # syntax: totalview <executable> [-a <arguments>]
            # pass executable:
            cmndline = '%s %s' % (debugger_call,cmnd_exec)
            # add arguments ?
            if len(cmnd_args) > 0 :
                cmndline = '%s -a %s' % (cmndline,cmnd_args)
            #endif
        elif debugger == 'idb' :
            # syntax: idb [-args <executable> <arguments>]
            # fill executable and arguments:
            cmndline = '%s -args %s %s' % (debugger_call,cmnd_exec,cmnd_args)
        else :
            logging.error( 'unsupported debugger : %s' % debugger )
            raise Exception
        #endif
    else :
        # standard line:
        cmndline = '%s %s' % (cmnd_exec,cmnd_args)
    #endif

    # ok
    return cmndline
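
# Example (a minimal sketch; the rc keys and values below are hypothetical,
# only the assembly logic above is taken from the code):
#
#   with settings like
#       par.mpi        :  True
#       par.ntask      :  4
#       mpirun.command :  mpirun
#       mpirun.args    :  -np 4
#   a call such as
#       Command_Line( rcf, 'tm5.x', 'tm5.rc', False )
#   would return roughly:
#       'mpirun -np 4 tm5.x tm5.rc'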

# ***

def WriteAndSubmitNewJob( rcfile, bindir ) :

    """
    Write first or next rcfile and job file(s) in the job chain;
    if the chain is not finished yet, submit a new job.

    The argument could be the name of an rcfile or an rcfile object itself,
    since the submit script might have changed some values given
    the provided command line arguments.

    The following function is used:

        submit_tm5_setup_rcfile.WriteRcfile   # writes the new rcfile

    This is placed in a separate file since users might need to
    change these routines for their specific projects.
    """

    # external:
    import sys
    import logging
    import rc

    # import setup module:
    import submit_tm5_setup_rcfile

    # name provided ?
    if type(rcfile) == str :
        # load:
        rcf = rc.RcFile( rcfile )
    else :
        # just copy ..
        rcf = rcfile
    #endif

    # write next rcfile, return name:
    try :
        rcfile_next = submit_tm5_setup_rcfile.WriteRcfile( rcf )
    except :
        logging.error( sys.exc_info()[1] )
        logging.error( 'exception from WriteRcfile' )
        raise Exception

    # finished ?
    if rcfile_next == None :
        logging.info( '  end of job chain !' )
    else :
        # write job file(s) for this period and return the (first) name;
        # last command in a file should submit the next job if necessary:
        logging.info( '  write jobfile for %s ...' % rcfile_next )
        try :
            jobfile_next = WriteJob( rcfile_next, bindir )
        except :
            logging.error( sys.exc_info()[1] )
            logging.error( 'exception from WriteJob' )
            raise Exception
        logging.info( '  submit next job : %s' % jobfile_next )
        try :
            jobid = SubmitJob( jobfile_next, rcfile_next )
        except :
            logging.error( sys.exc_info()[1] )
            logging.error( 'exception from SubmitJob' )
            raise Exception
    #endif

    # ok
    return
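
# Chain mechanics in short (illustration only; file names are hypothetical):
#
#   WriteAndSubmitNewJob( 'tm5_001.rc', bindir )
#     -> submit_tm5_setup_rcfile.WriteRcfile() writes e.g. 'tm5_002.rc'
#        (or returns None when the chain is finished)
#     -> WriteJob( 'tm5_002.rc', bindir ) writes the job file(s)
#     -> SubmitJob(...) submits the first of them
#
# The generated job calls this routine again as its last command,
# which is how the chain keeps itself running.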

# ***

def WriteJob( rcfile, bindir ) :

    """
    jobfile = WriteJob( rcfile, bindir )

    Write job file given the settings in rcfile.
    The name of the jobfile is based on the name of the rcfile.
    The last command in the job should submit the next job,
    and the script is therefore written in python.
    """

    # external:
    import os
    import rc

    # load settings:
    rcf = rc.RcFile( rcfile )

    # basename for scripts etc is name of rcfile minus extension:
    bname,ext = os.path.splitext(rcfile)

    # loadleveler supports master job with multiple steps:
    with_master_job = (rcf.get('submit.to') == 'queue') and (rcf.get('queue') == 'loadleveler')

    # which shell ?
    job_shell = '/usr/bin/env python'

    # start master job ?
    if with_master_job :

        ntasks = rcf.get('par.ntask')

        # name of jobfile:
        jobfile = '%s.jb' % bname

        # set header:
        header = []
        header.append( '#! %s\n' % job_shell )
        header.append( '\n' )

        # init queue options:
        qopt = QueueOptions( bname, rcf, 'default' )

        # init job file:
        job = []
        job.append( '# external:\n' )
        job.append( 'import os\n' )
        job.append( 'import sys\n' )
        job.append( 'import socket\n' )
        job.append( 'import subprocess\n' )
        job.append( 'import logging\n' )
        job.append( '\n' )
        job.append( '# setup messages:\n' )
        job.append( "logging.basicConfig( format='%(lineno)-4s:%(filename)-30s [%(levelname)-8s] %(message)s', level=logging.INFO, stream=sys.stdout )\n" )
        job.append( '\n' )
        job.append( '# prepend locations of python modules to search path:\n' )
        job.append( "sys.path.insert( 0, '%s' )\n" % bindir )
        job.append( '\n' )
        job.append( '# tools:\n' )
        job.append( 'import submit_tm5_tools\n' )
        job.append( '\n' )
        job.append( '# current loadleveler step:\n' )
        job.append( 'step_name = os.getenv("LOADL_STEP_NAME")\n' )
        job.append( '\n' )

        # HOSTFILE - Moved here from Command_Line for loadleveler.
        hostfile = rcf.get('mpirun.hostfile',default='')   # calling script will create hostfile
        if (len(hostfile) > 0) and rcf.get('par.mpi','bool') :
            job.append( 'f = open("%s",\'w\')\n' % hostfile )
            job.append( 'hname = socket.gethostname()\n' )
            job.append( 'for i in range( %s ) :\n' % ntasks )
            job.append( "\tf.write( '%s\\n' % hname )\n" )
            job.append( 'f.close()\n' )
            job.append( '\n' )
        #endif

    #endif

    # job step names:
    steps = rcf.get('job.steps').split(' ')

    # number of steps:
    nstep = len(steps)

    # loop over job steps:
    for istep in range(nstep) :

        # current:
        step = steps[istep]
        # next:
        if istep < nstep-1 : step_next = steps[istep+1]

        # list with queue option lines for this step:
        qopt_step = QueueOptions( bname, rcf, step )

        # call to actual script:
        if step == 'run' :
            # get command line:
            exe  = os.path.join( os.curdir, rcf.get('job.step.%s.exe' % step) )
            args = rcfile
            indb = rcf.get('submit.debugger','bool')
            cmndline = Command_Line( rcf, exe, args, indb )
            # <script> <commandline>
            step_command = '["%s/submit_tm5_step_%s","%s"]' % (bindir,step,cmndline)
        else :
            # <script> <rcfile>
            step_command = '["%s/submit_tm5_step_%s","%s","--bindir=%s"]' % (bindir,step,rcfile,bindir)
        #endif

        # add queue options to destination:
        if with_master_job :

            # add to queue options for master job:
            qopt = qopt + qopt_step

            # add lines to run the step:
            job.append( '# which step ?\n' )
            job.append( 'if step_name == "%s" :\n' % step )
            job.append( '    \n' )
            job.append( '    # run:\n' )
            job.append( '    retcode = subprocess.call( %s )\n' % step_command )
            job.append( '    if retcode != 0 :\n' )
            job.append( '        logging.error( sys.exc_info()[1] )\n' )
            job.append( '        logging.error( \'exception from subprocess call to : %s\' )\n' % step_command )
            job.append( '        sys.exit(1)\n' )
            job.append( '    #endif\n' )
            job.append( '    \n' )

            # last step ? then add lines to submit next job:
            if istep == nstep-1 :
                job.append( '    # write and submit next job if necessary:\n' )
                job.append( '    submit_tm5_tools.WriteAndSubmitNewJob( "%s", "%s" )\n' % (rcfile,bindir) )
                job.append( '    \n' )
            #endif

            # close step:
            job.append( '#endif\n' )
            job.append( '\n' )

        else :   # no master job, but separate files

            # name of step job to be written:
            step_job_template = bname+'_%s.jb'
            # actual name:
            step_job = step_job_template % step
            # open:
            f = open( step_job, 'w' )

            # write header:
            f.write( '#! %s\n' % job_shell )
            f.write( '\n' )

            # add queue options:
            for line in qopt_step : f.write(line)

            # add lines to call the actual script:
            f.write( '# external:\n' )
            f.write( 'import sys\n' )
            f.write( 'import os\n' )
            f.write( 'import logging\n' )
            f.write( 'import socket\n' )
            f.write( 'import subprocess\n' )
            f.write( '\n' )
            f.write( '# go to run directory:\n' )
            f.write( 'os.chdir("%s")\n' % os.getcwd() )
            f.write( '\n' )
            f.write( '# prepend locations of python modules to search path:\n' )
            f.write( "sys.path.insert( 0, '%s' )\n" % bindir )
            f.write( '\n' )
            f.write( '# tools:\n' )
            f.write( 'import rc\n' )
            f.write( 'import submit_tm5_tools\n' )
            f.write( '\n' )
            f.write( '# setup messages:\n' )
            f.write( "logging.basicConfig( format='%(lineno)-4s:%(filename)-30s [%(levelname)-8s] %(message)s', level=logging.INFO, stream=sys.stdout )\n" )
            f.write( '\n' )
            f.write( '# info ...\n' )
            f.write( 'logging.info( "start" )\n' )
            f.write( '\n' )

            # module command if needed:
            module_cmd = rcf.get('module.cmd',default='')
            if (module_cmd != '') and (step == 'run') :
                f.write( 'mod_cmd = "%s".split()\n' % module_cmd )
                f.write( 'retcode = subprocess.call( mod_cmd )\n' )
                f.write( 'if retcode != 0 :\n' )
                f.write( '    logging.error( sys.exc_info()[1] )\n' )
                f.write( '    logging.error( \'exception from subprocess call to : %s\' )\n' % module_cmd )
                f.write( '    sys.exit(1)\n' )
                f.write( '\n' )
            #endif

            # OpenMP settings:
            if ( (rcf.get('queue') == 'slurm') or (rcf.get('queue') == 'pbs') ) and \
               rcf.get('par.openmp','bool') and (step == 'run') :
                nthread = rcf.get( 'par.nthread', 'int' )
                f.write( "os.putenv( 'OMP_NUM_THREADS', '%s' )\n" % nthread )
            #endif

            # HOSTFILE - Moved here (and adapted) from Command_Line for case of
            # "SLURM with a host file" (case of NEURON@KNMI, and not CARTESIUS@SARA).
            # Needed only if MPI, step run.
            hostfile = rcf.get('mpirun.hostfile',default='')
            if (len(hostfile) > 0) and (rcf.get('queue') == 'slurm') and rcf.get('par.mpi','bool') and (step == 'run') :
                f.write( 'f = open("%s",\'w\')\n' % hostfile )
                # PREVIOUS:
                #f.write( 'hname = socket.gethostname()\n' )
                #f.write( "for i in range(ntask) : f.write( '%s' % hname )\n" )
                # NOW: for SLURM on NEURON, need the nodelist and the number of
                # tasks-per-node effectively allocated.
                # -- Idea #1
                # -- Use "output environment variables" to gather effective values:
                #
                #f.write( "for node in os.getenv('SLURM_NODELIST') :\n" )
                #f.write( "    for i in range(os.getenv('SLURM_NTASKS_PER_NODE')) : f.write( '%s\n' % node )\n" )
                #
                # -- BUT that would not work in all imaginable cases... so we
                # -- should use SLURM_TASKS_PER_NODE instead, but its format, e.g.:
                # --     2(x3),1,5,2(x5)
                # -- is a bit annoying to process.  Then idea #2... good old unix command line:
                # -- Idea #2 - Use srun | sort | awk to get an ordered list:
                hcommand = '[ "srun -l /bin/hostname | sort -n | awk \'{print $2}\' > %s" ]' % hostfile
                f.write( 'retcode = subprocess.call( %s, shell=True )\n' % hcommand )
                f.write( 'if retcode != 0 :\n' )
                f.write( '    logging.error( sys.exc_info()[1] )\n' )
                f.write( '    logging.error( \'exception from creating host file\' )\n' )
                f.write( '    sys.exit(1)\n' )
                f.write( 'f.close()\n' )
                f.write( '\n' )
            #endif

            f.write( '# call user script:\n' )
            f.write( 'retcode = subprocess.call( %s )\n' % step_command )
            f.write( 'if retcode != 0 :\n' )
            f.write( '    logging.error( sys.exc_info()[1] )\n' )
            f.write( '    logging.error( \'exception from subprocess call to : %s\' )\n' % step_command )
            f.write( '    sys.exit(1)\n' )
            f.write( '#endif\n' )
            f.write( '\n' )

            # add submission of next step ?
            if istep < nstep-1 :
                # job script of next step:
                step_job_next = step_job_template % step_next
                # add submission command:
                f.write( '# submit next step:\n' )
                f.write( 'try :\n' )
                f.write( '    submit_tm5_tools.SubmitJob( "%s", "%s" )\n' % (step_job_next,rcfile) )
                f.write( 'except :\n' )
                f.write( '    logging.error( sys.exc_info()[1] )\n' )
                f.write( '    logging.error( \'exception from SubmitJob( "%s", "%s" )\' )\n' % (step_job_next,rcfile) )
                f.write( '    sys.exit(1)\n' )
                f.write( '#endtry\n' )
                f.write( '\n' )
            else :
                # last step; might be necessary to submit a new job:
                f.write( '# write and submit next job if necessary:\n' )
                f.write( 'submit_tm5_tools.WriteAndSubmitNewJob( "%s", "%s" )\n' % (rcfile,bindir) )
                f.write( '\n' )
            #endif

            f.write( '# info ...\n' )
            f.write( 'logging.info( "end" )\n' )
            f.write( '\n' )

            # close:
            f.close()

            # make it executable and readable for all, writable for user only (mode 0o755):
            #   u+r u+w u+x  g+r g-w g+x  o+r o-w o+x
            os.chmod( step_job, 2**8 + 2**7 + 2**6 + 2**5 + 0 + 2**3 + 2**2 + 0 + 2**0 )

            # fill return value:
            if istep == 0 : jobfile = step_job

        #endif  # master job or separate step files

    #endfor  # job steps

    # write master job:
    if with_master_job :
        # combine:
        job = header + qopt + job
        # write:
        f = open(jobfile,'w')
        f.writelines(job)
        f.close()
        # make it executable and readable for all, writable for user only (mode 0o755):
        #   u+r u+w u+x  g+r g-w g+x  o+r o-w o+x
        os.chmod( jobfile, 2**8 + 2**7 + 2**6 + 2**5 + 0 + 2**3 + 2**2 + 0 + 2**0 )
    #endif

    # ok
    return jobfile
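
# Sketch of the result (hypothetical names, no master job):
#   for 'job.steps : init run done' and rcfile 'tm5_001.rc', the files
#   'tm5_001_init.jb', 'tm5_001_run.jb' and 'tm5_001_done.jb' are written;
#   each step script submits the next one, and the last step calls
#   WriteAndSubmitNewJob to extend the chain.  With LoadLeveler a single
#   'tm5_001.jb' master job containing all steps is written instead.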

# ***

def WriteAndSubmitBuildJob( rcf, command ) :

    """
    Write bash script that runs 'command' after being submitted to the
    queue.  Written to compile TM5 in the queue.
    """

    import os

    # temporarily force submission to the queue:
    orig_val = rcf.get('submit.to')
    dummy = rcf.replace('submit.to','queue')

    # assume that it is called from the source directory:
    source_dir = os.getcwd()

    jobfile = os.path.join(source_dir,'build.jb')
    f = open( jobfile, 'w' )
    f.write( '#! /bin/bash\n' )
    f.write( '\n' )
    qopt = QueueOptions( 'buildTM', rcf, 'build' )
    for line in qopt : f.write(line)
    f.write( 'cd %s \n' % source_dir )
    f.write( '%s' % ' '.join(command) )
    f.write( '\n' )
    f.close()

    id = SubmitJob( jobfile, rcf )

    # restore original setting:
    dummy = rcf.replace('submit.to', orig_val)

    return

# ***

def QueueOptions( bname, rcf, step ) :

    """
    Return list with queue option lines.
    """

    # modules:
    import logging

    # submit to queue ?
    if rcf.get('submit.to') == 'queue' :

        # queue type:
        queue = rcf.get('queue')

        # different options and commands:
        if queue == 'loadleveler' :
            qopt = QueueOptions_LoadLeveler( bname, rcf, step )
        elif queue == 'bsub' :
            qopt = QueueOptions_BSub( bname, rcf, step )
        elif queue == 'qsub' :
            qopt = QueueOptions_QSub( bname, rcf, step )
        elif queue == 'pbs' :
            qopt = QueueOptions_PBS( bname, rcf, 'all' )   # init result with the "all" step
            qopt = qopt + QueueOptions_PBS( bname, rcf, step )
        elif queue == 'slurm' :
            qopt = QueueOptions_Slurm( bname, rcf, step )
        else :
            # not supported ...
            logging.error( 'unsupported queue : %s' % queue )
            raise Exception
        #endif

    else :

        qopt = []

    #endif

    return qopt
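
# Directive style produced per queue type (see the helpers below):
#   loadleveler -> '#@ ...'       bsub  -> '#BSUB ...'
#   qsub / pbs  -> '#PBS ...'     slurm -> '#SBATCH ...'
# When 'submit.to' is not 'queue', an empty list is returned and the
# job scripts carry no scheduler directives.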

# ***

def SubmitJob( job_script, rcfile ) :

    """
    Submit jobscript.  Where to submit to (foreground, background, queue) is
    read from rcfile settings.  Returns a jobid if submitting to a queue
    (for now a dummy value for all cases except PBS).
    """

    # external:
    import sys
    import logging
    import rc

    # default:
    jobid = 'not-a-real-jobid-yet'

    # settings:
    if type(rcfile) == str :
        rcf = rc.RcFile( rcfile )
    else :
        rcf = rcfile
    #endif

    # where to ?
    submit_to = rcf.get('submit.to')

    # info ...
    logging.info( 'submit %s to %s ...' % (job_script,submit_to) )

    # call specific submit routines:
    if submit_to == 'foreground' :

        # call run script, catch errors:
        try :
            Run_Job_In_Foreground( job_script )
        except :
            logging.error( sys.exc_info()[1] )
            logging.error( 'from Run_Job_In_Foreground for %s' % job_script )
            raise Exception

    elif submit_to == 'background' :

        # call run script, catch errors:
        try :
            Submit_Job_To_Background( job_script, rcf )
        except :
            logging.error( 'from Submit_Job_To_Background for %s' % job_script )
            raise Exception

    elif submit_to == 'queue' :

        # queue type:
        queue = rcf.get('queue')

        # different options and commands:
        if queue == 'loadleveler' :
            Submit_Job_To_LoadLeveler( job_script, rcf )
        elif queue == 'bsub' :
            Submit_Job_To_BSub( job_script, rcf )
        elif queue == 'qsub' :
            Submit_Job_To_QSub( job_script, rcf )
        elif queue == 'pbs' :
            jobid = Submit_Job_To_PBS( job_script, rcf )
        elif queue == 'slurm' :
            Submit_Job_To_Slurm( job_script, rcf )
        else :
            # not supported ...
            logging.error( 'unsupported queue : %s' % queue )
            raise Exception
        #endif

    else :

        # not supported ...
        logging.error( 'unsupported run environment : %s' % submit_to )
        sys.exit(1)

    #endif

    return jobid
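
# Typical use (sketch; 'myjob.jb' and 'myjob.rc' are hypothetical names):
#
#   import submit_tm5_tools
#   jobid = submit_tm5_tools.SubmitJob( 'myjob.jb', 'myjob.rc' )
#
# The rcfile decides what happens: 'submit.to : foreground' runs the script
# directly, 'background' starts it with a shell redirect, and 'queue' hands
# it to the scheduler selected by the 'queue' key.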

# ======================================================================
# ===
# === foreground
# ===
# ======================================================================


def Run_Job_In_Foreground( job_script ) :

    """
    Run job script in foreground.
    """

    # external:
    import sys
    import os
    import logging
    import subprocess

    # setup command line, e.g. './myscript.jb' :
    command = os.path.join(os.curdir,job_script)

    # execute:
    retcode = subprocess.call( command )
    if retcode != 0 :
        logging.error( sys.exc_info()[1] )
        logging.error( 'from subprocess call to : %s' % command )
        raise Exception

    return

# ======================================================================
# ===
# === background
# ===
# ======================================================================


def Submit_Job_To_Background( job_script, rcf ) :

    """
    Submit job to background.
    """

    # external:
    import sys
    import os
    import logging
    import subprocess

    # basename for scripts etc is name of job script minus extension:
    bname,ext = os.path.splitext(job_script)

    # output files:
    job_stdout = bname+'.out'
    job_stderr = bname+'.err'
    job_info   = bname+'.info'

    # setup command line, e.g. './myscript.jb' :
    command = os.path.join(os.curdir,job_script)
    # re-direct standard output:
    command = command+' > %s' % job_stdout
    # write error messages to separate file:
    #command = command+' 2> %s' % job_stderr
    command = command+' 2>&1'

    # run in background, return process id:
    logging.info( 'run shell command : "%s" ...' % command )
    p = subprocess.Popen( command, shell=True )

    # info ...
    infotext = []
    infotext.append( '\n' )
    infotext.append( 'Summary:\n' )
    infotext.append( '\n' )
    infotext.append( '  job script      : %s\n' % job_script )
    infotext.append( '  standard output : %s\n' % job_stdout )
    infotext.append( '  standard error  : %s\n' % job_stderr )
    infotext.append( '\n' )
    infotext.append( 'Process snapshot:\n' )
    infotext.append( '\n' )
    p2 = subprocess.Popen( '/bin/ps -f -p %i' % p.pid, shell=True,
                           stdout=subprocess.PIPE, stderr=subprocess.STDOUT )
    for line in p2.stdout.readlines() : infotext.append( line )
    infotext.append( '\n' )
    infotext.append( 'To manage this process:\n' )
    infotext.append( '\n' )
    infotext.append( '  # show process snapshot:\n' )
    infotext.append( '  ps -f -p %i\n' % p.pid )
    infotext.append( '  \n' )
    infotext.append( '  # kill process:\n' )
    infotext.append( '  kill %i\n' % p.pid )
    infotext.append( '  \n' )
    infotext.append( '  # follow standard output:\n' )
    infotext.append( '  tail -f %s\n' % job_stdout )
    infotext.append( '\n' )

    # write to log:
    for line in infotext : logging.info( line.strip() )

    # write to file:
    f = open( job_info, 'w' )
    f.writelines(infotext)
    f.close()

    return
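
# Example of the shell command constructed above (hypothetical file name):
#
#   ./myjob_run.jb > myjob_run.out 2>&1
#
# The 'ps' and 'kill' hints written to the .info file use the pid returned
# by Popen for this shell command.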

# ======================================================================
# ===
# === LoadLeveler queue
# ===
# ======================================================================


def QueueOptions_LoadLeveler( bname, rcf, step ) :

    """
    Return list with queue options.
    """

    # external:
    import math

    # init result:
    qopt = []

    # which step ?
    if step == 'default' :

        # list with options:
        opts = rcf.get( 'queue.ll.options.%s' % step ).split()
        # default options:
        for opt in opts :
            # get value:
            val = rcf.get( 'queue.ll.option.%s.%s' % (step,opt) )
            # write:
            qopt.append( '#@ %-20s = %s\n' % (opt,val) )
        #endfor
        # layout ...
        qopt.append( '\n' )

    else :

        # list with options:
        opts = rcf.get( 'queue.ll.options.%s' % step ).split()
        # default options:
        for opt in opts :
            # get value:
            val = rcf.get( 'queue.ll.option.%s.%s' % (step,opt) )
            # to be set ?
            if val == '<auto>' :
                # differs per option ...
                if opt == 'output' :
                    val = '%s_%s.out' % (bname,step)
                elif opt == 'error' :
                    val = '%s_%s.err' % (bname,step)
                elif opt == 'tasks_per_node' :
                    t_mpi = rcf.get( 'par.ntask'  , 'int' )
                    t_omp = rcf.get( 'par.nthread', 'int' )
                    val = str(t_omp*t_mpi)
                #endif
            #endif
            # none, empty, or normal value ?
            if val == '<none>' :
                # skip this keyword:
                continue
            elif val == '' :
                # just the keyword:
                qopt.append( '#@ %s\n' % opt )
            else :
                # keyword and value:
                qopt.append( '#@ %-20s = %s\n' % (opt,val) )
            #endif
        #endfor
        # layout ...
        qopt.append( '\n' )

    #endif

    return qopt
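
# Example of generated directives for a 'run' step (option names and values
# are hypothetical; the '<auto>' handling is as coded above):
#
#   #@ job_type             = parallel
#   #@ output               = tm5_001_run.out
#   #@ error                = tm5_001_run.err
#   #@ tasks_per_node       = 8
#   #@ queue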

# ***

def Submit_Job_To_LoadLeveler( job_script, rcf ) :

    """
    Submit job to LoadLeveler queue.
    """

    # external:
    import sys
    import os
    import logging
    import subprocess

    # basename for scripts etc is name of job script minus extension:
    bname,ext = os.path.splitext(job_script)

    # output files:
    job_info = bname+'.info'

    # options passed directly to submission command:
    qopts = rcf.get( 'queue.ll.submit.options' )
    # add options passed to submit script:
    qopts = qopts+' '+rcf.get('submit.options')

    # info ...
    logging.info( '  launch ...' )

    # setup command line:
    command = 'llsubmit '+qopts
    # last argument is script:
    command = command+' '+job_script

    # info ...
    logging.info( '    command: %s' % command )

    # init submission info file:
    infotext = []
    infotext.append( '\n' )

    # call submit command, trap errors:
    try :
        # submit; redirect errors to standard output:
        p = subprocess.Popen( command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT )
    except :
        logging.error( sys.exc_info()[1] )
        logging.error( 'from subprocess.Popen( %s )' % command.split() )
        raise Exception
    #endtry

    # extract:
    outlines = p.stdout.readlines()
    # add to help info message:
    infotext = infotext + outlines

    # extract job id from last line:
    #   llsubmit: The job "c1a0303.4290133" with 3 job steps has been submitted.
    firstwords = 'llsubmit: The job'
    lastline = outlines[-1]
    if lastline.startswith(firstwords) :
        job_id = lastline.lstrip(firstwords).split()[0].replace('"','')
    else :
        job_id = '<job-id>'
    #endif

    # add help text to submission info:
    infotext.append( '\n' )
    infotext.append( 'To manage LoadLeveler jobs:\n' )
    infotext.append( '\n' )
    infotext.append( '  llq [-u ${USER}]    # list [your] current jobs\n' )
    infotext.append( '  llq %s              # list this job\n' % job_id )
    infotext.append( '  llcancel %s         # kill this job\n' % job_id )
    infotext.append( '\n' )

    # write to log:
    for line in infotext : logging.info( line.rstrip() )

    # write to file:
    f = open( job_info, 'w' )
    f.writelines(infotext)
    f.close()

    # ok
    return

#enddef

# ======================================================================
# ===
# === BSUB queue
# ===
# ======================================================================


def QueueOptions_BSub( bname, rcf, step ) :

    """
    Return list with queue options.
    """

    # init result:
    qopt = []

    # list of options for specified step:
    opts = rcf.get( 'queue.bsub.options.%s' % step ).split()
    # loop over options:
    for opt in opts :
        # get value:
        val = rcf.get( 'queue.bsub.option.%s.%s' % (step,opt) )
        # default options:
        if val == '<auto>' :
            if opt in ['o','oo'] :
                val = '%s_%s.out' % (bname,step)
            elif opt in ['e','eo'] :
                val = '%s_%s.err' % (bname,step)
            #endif
        #endif  # <auto> value
        # define option key:
        #   queue.bsub.options.R  :  20      ->  -R 20
        #   queue.bsub.options.Rx :  -R 20   ->  -R 20
        if val.startswith('-') :
            qopt.append( '#BSUB %s\n' % (val) )
        else :
            qopt.append( '#BSUB -%s %s\n' % (opt,val) )
        #endif
    #endfor  # opts

    # layout ...
    qopt.append( '\n' )

    # ok
    return qopt

#enddef QueueOptions_BSub
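
# Example of generated directives (hypothetical values; a value that already
# starts with '-' is passed through unchanged, as coded above):
#
#   #BSUB -q normal
#   #BSUB -n 16
#   #BSUB -o tm5_001_run.out
#   #BSUB -e tm5_001_run.err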

# ***

def Submit_Job_To_BSub( job_script, rcf ) :

    """
    Submit job to BSUB queue.
    """

    # external:
    import os
    import logging
    import subprocess

    # basename for scripts etc is name of job script minus extension:
    bname,ext = os.path.splitext(job_script)

    # output files:
    job_info = bname+'.info'

    # options passed directly to submission command:
    qopts = rcf.get( 'queue.bsub.submit.options' )
    # add options passed to submit script:
    qopts = qopts+' '+rcf.get('submit.options')

    # info ...
    logging.info( '  launch ...' )

    # setup command line:
    command = 'bsub '+qopts
    # pass job script to std.input:
    command = command+' < '+job_script

    # info ...
    logging.info( '    command: %s' % command )

    # prepare for OS errors (file does not exist etc.):
    try :
        # submit; redirect errors to standard output:
        p = subprocess.Popen( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT )
    except OSError as err :
        logging.error( 'OSError: '+err.strerror )
        logging.error( 'from call : %s' % command )
        logging.error( 'directory : %s' % os.getcwd() )
        raise Exception
    #endtry

    # extract:
    outlines = p.stdout.readlines()
    # display:
    for line in outlines : logging.info( '    %s' % line.rstrip() )

    # standard output is:
    #   <jobname> <jobnr>
    # extract job nr:
    job_nr = outlines[0].split()[1].strip('<>')

    # info ...
    infotext = []
    infotext.append( '\n' )
    infotext.append( 'Summary:\n' )
    infotext.append( '\n' )
    infotext.append( '  current dir : %s\n' % os.getcwd() )
    infotext.append( '  job script  : %s\n' % job_script )
    infotext.append( '\n' )
    infotext.append( 'To manage this job:\n' )
    infotext.append( '  \n' )
    infotext.append( '  # kill job:\n' )
    infotext.append( '  bkill %s\n' % job_nr )
    infotext.append( '  \n' )
    infotext.append( 'To show all your running jobs:\n' )
    infotext.append( '  \n' )
    infotext.append( '  bjobs\n' )
    infotext.append( '  \n' )

    # write to log:
    for line in infotext : logging.info( line.rstrip() )

    # write to file:
    f = open( job_info, 'w' )
    f.writelines(infotext)
    f.close()

    # ok
    return

#enddef

# ======================================================================
# ===
# === QSUB queue
# ===
# ======================================================================


def QueueOptions_QSub( bname, rcf, step ) :

    """
    Return list with queue options.
    """

    # init result:
    qopt = []

    # list with options:
    opts = rcf.get( 'queue.qsub.options' ).split()
    append_jobstep = rcf.get('queue.qsub.name.append.jobstep',default=False)
    # default options:
    for opt in opts :
        # look first for a jobstep-specific qsub option:
        val = rcf.get( 'queue.qsub.option.%s.%s' % (opt,step), default='STEP_SPECIFIC_KEY_MISSING' )
        # if the jobstep-specific option is missing, look for one without a jobstep:
        if val == 'STEP_SPECIFIC_KEY_MISSING' :
            val = rcf.get( 'queue.qsub.option.%s' % opt )
        # optionally append the job step number to the job name:
        if (opt == 'N') and append_jobstep :
            jobstep = rcf.get('jobstep')
            val = val + '_%03d' % (int(jobstep))
        # fill option line:
        qopt.append( '#PBS -%s %s\n' % (opt,val) )
    #endfor

    # layout ...
    qopt.append( '\n' )

    # ok
    return qopt

#enddef
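
# Example of generated directives (hypothetical values):
#
#   #PBS -N tm5_012           <- job name, '_012' appended when
#                                queue.qsub.name.append.jobstep is set
#   #PBS -q serial
#   #PBS -l walltime=06:00:00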

# ***

def Submit_Job_To_QSub( job_script, rcf ) :

    """
    Submit job to QSUB queue.
    """

    # external:
    import os
    import logging
    import subprocess

    # basename for scripts etc is name of job script minus extension:
    bname,ext = os.path.splitext(job_script)

    # output files:
    job_info = bname+'.info'

    # options passed directly to submission command:
    qopts = rcf.get( 'queue.qsub.submit.options' )
    # add options passed to submit script:
    qopts = qopts+' '+rcf.get('submit.options')

    # info ...
    logging.info( '  launch ...' )

    # setup command line:
    command = 'qsub '+qopts
    # last argument is script:
    command = command+' '+job_script

    # info ...
    logging.info( '    command: %s' % command )

    # prepare for OS errors (file does not exist etc.):
    try :
        # submit; redirect errors to standard output:
        p = subprocess.Popen( command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT )
    except OSError as err :
        logging.error( 'OSError: '+err.strerror )
        logging.error( 'from call : %s' % command )
        logging.error( 'directory : %s' % os.getcwd() )
        raise Exception
    #endtry

    # extract:
    outlines = p.stdout.readlines()
    # display:
    for line in outlines : logging.info( '    %s' % line.rstrip() )

    # standard output is:
    #   <jobnr>
    # extract job nr:
    job_nr = outlines[0].split()[0]

    # info ...
    infotext = []
    infotext.append( '\n' )
    infotext.append( 'Summary:\n' )
    infotext.append( '\n' )
    infotext.append( '  current dir : %s\n' % os.getcwd() )
    infotext.append( '  job script  : %s\n' % job_script )
    infotext.append( '\n' )
    infotext.append( 'To manage this job:\n' )
    infotext.append( '  \n' )
    infotext.append( '  # kill job:\n' )
    infotext.append( '  qdel %s\n' % job_nr )
    infotext.append( '  \n' )
    infotext.append( 'To show all your running jobs:\n' )
    infotext.append( '  \n' )
    infotext.append( '  qstat [-u ${USER}]\n' )
    infotext.append( '  \n' )

    # write to log:
    for line in infotext : logging.info( line.rstrip() )

    # write to file:
    f = open( job_info, 'w' )
    f.writelines(infotext)
    f.close()

    # ok
    return

# ======================================================================
# ===
# === PBS queue (PBSpro)
# ===
# === Note that there are several implementations of PBS, and the "pbs queue"
# === here, although it also uses the qsub command, differs from the "qsub
# === queue" defined above.  The latter uses #$ directives, while the PBSpro
# === directives start with #PBS.
# ===
# ======================================================================


def QueueOptions_PBS( bname, rcf, step ) :

    """
    Return list with queue options.  Called twice, first for the pseudo-step
    'all'.  See QueueOptions above.
    """

    import os

    # init result:
    qopt = []

    # list with options:
    opts = rcf.get( 'queue.pbs.options.%s' % step ).split()

    # use fully qualified filename for log files:
    if step != 'build' :
        fqpath = rcf.get('rundir')
    else :
        fqpath = rcf.get('build.sourcedir')
    fqname = os.path.join( fqpath, bname )

    # build list of job directives:
    for opt in opts :
        val = rcf.get( 'queue.pbs.option.%s.%s' % (step,opt) )
        # deal with multiple options behind -l :
        if opt == 'l' :
            l_vals = val.split()
            for lval in l_vals : qopt.append( '#PBS -l %s\n' % lval )
            continue
        # options still to be set:
        if val == '<auto>' :
            if opt == 'o' :
                val = '%s_%s.out' % (fqname,step)
            elif opt == 'e' :
                val = '%s_%s.err' % (fqname,step)
        # none, empty or plain normal value ?
        if val == '<none>' :
            continue                                       # skip this option
        elif val == '' :
            qopt.append( '#PBS -%s\n' % opt )              # just the keyword
        else :
            qopt.append( '#PBS -%s %s\n' % (opt,val) )     # keyword and value
    #endfor

    # layout ...
    qopt.append( '\n' )

    return qopt
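
# Example of generated directives (hypothetical values):
#
#   an rc setting 'queue.pbs.option.run.l : walltime=06:00:00 mem=16gb' becomes
#       #PBS -l walltime=06:00:00
#       #PBS -l mem=16gb
#   and with 'o' / 'e' left as '<auto>' the log files are placed in 'rundir':
#       #PBS -o /path/to/rundir/tm5_001_run.out
#       #PBS -e /path/to/rundir/tm5_001_run.err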
def Submit_Job_To_PBS( job_script, rcf ) :

    """
    Submit job to PBS queue.
    """

    import os
    import logging
    import subprocess

    # basename for scripts etc is name of job script minus extension:
    bname,ext = os.path.splitext(job_script)

    # output files:
    job_info = bname+'.info'

    # Two sets of options to pass directly, unmodified, to the submission command:
    qopts = rcf.get( 'queue.pbs.submit.options' )   # (1) from machine or queue (if separated from machine) rcfile
    qopts = qopts+' '+rcf.get('submit.options')     # (2) from expert rcfile

    # info ...
    logging.info( '  launch ...' )

    # setup command line:
    command = 'qsub '+qopts
    # last argument is script:
    command = command+' '+job_script

    # info ...
    logging.info( '    command: %s' % command )

    # prepare for OS errors (file does not exist etc.):
    try :
        # submit; redirect errors to standard output:
        p = subprocess.Popen( command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT )
    except OSError as err :
        logging.error( 'OSError: '+err.strerror )
        logging.error( 'from call : %s' % command )
        logging.error( 'directory : %s' % os.getcwd() )
        raise Exception

    # log output:
    outlines = p.stdout.readlines()
    for line in outlines : logging.info( '    %s' % line.rstrip() )

    # standard output is:
    #   <jobnr>
    # extract job nr:
    job_nr = outlines[0].split()[0]

    # info ...
    infotext = []
    infotext.append( '\n' )
    infotext.append( 'Summary:\n' )
    infotext.append( '\n' )
    infotext.append( '  current dir : %s\n' % os.getcwd() )
    infotext.append( '  job script  : %s\n' % job_script )
    infotext.append( '\n' )
    infotext.append( 'To manage this job:\n' )
    infotext.append( '  \n' )
    infotext.append( '  # kill job:\n' )
    infotext.append( '  qdel %s\n' % job_nr )
    infotext.append( '  \n' )
    infotext.append( 'To show all your running jobs:\n' )
    infotext.append( '  \n' )
    infotext.append( '  qstat [-u ${USER}]\n' )
    infotext.append( '  \n' )
    infotext.append( 'To monitor this running job (ECMWF only!):\n' )
    infotext.append( '  \n' )
    infotext.append( '  qcat %s\n' % job_nr )
    infotext.append( '  \n' )

    # write to log:
    for line in infotext : logging.info( line.rstrip() )

    # write to file:
    f = open( job_info, 'w' )
    f.writelines(infotext)
    f.close()

    return job_nr

# ======================================================================
# ===
# === SLURM queue
# ===
# ======================================================================


def QueueOptions_Slurm( bname, rcf, step ) :

    """
    Return list with queue option directives, that will be at the top of
    the script.

    The script for the run step (NEURON@KNMI) should contain:

        #SBATCH -n ${par.nthread}*${ntask}
        export OMP_NUM_THREADS=${par.nthread}
        mpiexec.hydra -machinefile ./mpd.hosts -np ${ntask} ./$bin
    """

    this_jobname = ""
    ntasks = 0

    # init result:
    qopt = []

    # list of options for specified step:
    if rcf.has_key( 'queue.slurm.options.%s' % step ) :
        opts = rcf.get( 'queue.slurm.options.%s' % step ).split()
    else :
        opts = rcf.get( 'queue.slurm.options' ).split()

    # NOAA-specific option:
    append_jobstep = rcf.get('queue.slurm.name.append.jobstep',default=False)

    for opt in opts :
        # look first for a step-specific slurm option:
        val = rcf.get( 'queue.slurm.option.%s.%s' % (step,opt), default='STEP_SPECIFIC_KEY_MISSING' )
        # if the step-specific option is missing, look for one without a step:
        if val == 'STEP_SPECIFIC_KEY_MISSING' :
            # get value:
            val = rcf.get( 'queue.slurm.option.%s' % opt, default='NOT_PRESENT' )
        # default options:
        if val == '<auto>' :
            if opt in ['o','output'] :
                val = '%s_%s.out' % (bname,step)
            elif opt in ['e','error'] :
                val = '%s_%s.err' % (bname,step)
        # keep track of the job name, optionally extended with the job step number:
        if opt in ['J','job-name'] :
            if append_jobstep :
                jobstep = rcf.get('jobstep')
                val = val + '_' + ( '%03d' % int(jobstep) )
            this_jobname = val
        # keep track of the number of tasks:
        if opt == 'ntasks' :
            ntasks = int(val)
        # skip unset (empty) partition / nodelist directives:
        if opt in ('p','w') :
            if len(val) == 0 : continue
        # Some options are represented by only a single letter
        # (e.g. J), and these require a single dash
        # (e.g. -J <jobname>), but others have only a multi-letter
        # invocation, and need two dashes (e.g. --qos=batch).
        if len(opt) > 1 :
            dashes = '--'
            sep = '='
        else :
            dashes = '-'
            sep = ' '
        #endif
        qopt.append( '#SBATCH %s%s%s%s\n' % (dashes,opt,sep,val) )

    # layout ...
    qopt.append( '\n' )

    # ok
    return qopt
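
# Example of generated directives (hypothetical values): single-letter
# options get one dash and a space, multi-letter options two dashes and '=':
#
#   #SBATCH -J tm5_012
#   #SBATCH -n 24
#   #SBATCH --qos=batch
#   #SBATCH --output=tm5_001_run.out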
def Submit_Job_To_Slurm( job_script, rcf ) :

    """
    Submit job to SLURM queue.
    """

    # external:
    import os
    import logging
    import subprocess

    # basename for scripts etc is name of job script minus extension:
    bname,ext = os.path.splitext(job_script)

    # output files:
    job_info = bname+'.info'

    # Two sets of options to pass directly, unmodified, to the submission command:
    qopts = rcf.get( 'queue.slurm.submit.options' )   # (1) from pycasso-queue-slurm.rc
    qopts = qopts+' '+rcf.get('submit.options')       # (2) from pycasso-tm5-expert.rc

    # info ...
    logging.info( '  launch ...' )

    # setup command line:
    command = 'sbatch '+qopts
    # last argument is script:
    command = command+' '+job_script

    # info ...
    logging.info( '    command: %s' % command )

    # prepare for OS errors (file does not exist etc.):
    try :
        # submit; redirect errors to standard output:
        p = subprocess.Popen( command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT )
    except OSError as err :
        logging.error( 'OSError: '+err.strerror )
        logging.error( 'from call : %s' % command )
        logging.error( 'directory : %s' % os.getcwd() )
        raise Exception

    # extract:
    outlines = p.stdout.readlines()
    # display:
    for line in outlines : logging.info( '    %s' % line.rstrip() )

    # standard output is:
    #   Submitted batch job <jobnr>
    # extract job nr:
    job_nr = outlines[0].split()[3]

    # info ...
    infotext = []
    infotext.append( '\n' )
    infotext.append( 'Summary:\n' )
    infotext.append( '\n' )
    infotext.append( '  current dir : %s\n' % os.getcwd() )
    infotext.append( '  job script  : %s\n' % job_script )
    infotext.append( '\n' )
    infotext.append( 'To manage this job:\n' )
    infotext.append( '  \n' )
    infotext.append( '  # kill job:\n' )
    infotext.append( '  scancel %s\n' % job_nr )
    infotext.append( '  \n' )
    infotext.append( 'To show all your running jobs:\n' )
    infotext.append( '  \n' )
    infotext.append( '  squeue [-u ${USER}]\n' )
    infotext.append( '  \n' )

    # write to log:
    for line in infotext : logging.info( line.rstrip() )

    # write to file:
    f = open( job_info, 'w' )
    f.writelines(infotext)
    f.close()

    return

# ======================================================================
# ===
# === end
# ===
# ======================================================================