# restart_ensemble_experiment.py
  1. import os
  2. import sys
  3. import glob
  4. import subprocess
  5. # List of variable to save for output
  6. PLASIM_VAR = ['tas','hfls','hfss']
  7. OCEAN_VAR = ['heata','fldoa','sst']
  8. LSG_VAR = ['fluxhea', 'tbound']
  9. PLASIM_VAR = ['time','lat','lon'] + PLASIM_VAR
  10. OCEAN_VAR = ['time','lat','lon', 'ls'] + OCEAN_VAR
  11. LSG_VAR = ['time','lat','lon','lev', 'wet'] + LSG_VAR
  12. # check the python version
  13. if float(sys.version[:3]) <= 3.6:
  14. print("This script require Python >= 3.6 !")
  15. print("Try:")
  16. print("\n\tmodule load python3\n\n")
  17. print("and restart this script again.")
  18. print("Aborting...")
  19. sys.exit(1)
  20. #check if ecfs utils are loaded
  21. user = os.getenv('USER')
  22. try:
  23. dummy = subprocess.run(['els', 'ectmp:/'+user+'/'], check=True, stdout=subprocess.PIPE)
  24. except:
  25. print("This script require the ECFS toolchain to be loaded !")
  26. print("Try:")
  27. print("\n\tmodule load ecfs\n\n")
  28. print("and restart this script again.")
  29. print("Aborting...")
  30. sys.exit(1)
  31. #check if netcdf utils are loaded
  32. try:
  33. dummy = subprocess.run(['ncdump'], check=True, stderr=subprocess.PIPE)
  34. except:
  35. print("This script require NetCDF4 toolchain to be loaded !")
  36. print("Try:")
  37. print("\n\tmodule load netcdf4\n\n")
  38. print("and restart this script again.")
  39. print("Aborting...")
  40. sys.exit(1)
  41. home_dir = os.getenv("HOME")
  42. scratch_dir = os.getenv("SCRATCH")
  43. perm_dir = os.getenv("PERM")
  44. plasim_dir = home_dir + "/PLASIM/"
  45. if len(sys.argv) < 5:
  46. print('Bad arguments:', sys.argv)
  47. print('Usage:')
  48. print('\n\t python3 restart_ensemble_experiment.py where experiment ensemble_size number_of_years where_to_save\n')
  49. print('Arguments:\n')
  50. print('\twhere :\t\t\tWhere the experiment ensemble folders are located.')
  51. print('\texperiment :\t\tName of the experiment.')
  52. print('\tensemble_size :\t\tSize of the ensemble.')
  53. print('\tnumber_of_years :\tNumber of years simulated by one run of the experiment.')
  54. print('\twhere_to_save :\t\tOptional. Where to backup the previous run. If not provide, uses $PERM.')
  55. sys.exit(0)
  56. where = sys.argv[1]
  57. basedir = where.split('/')[-1]
  58. experiment = sys.argv[2]
  59. ensemble_size = int(sys.argv[3])
  60. restart_year_to_save = sys.argv[4]
  61. try:
  62. save = sys.argv[5]
  63. except:
  64. save = perm_dir
  65. save_light = save + "/"+experiment+"_light/"
  66. save += "/"+experiment+"/"
  67. # Check if the experiment folder exists
  68. experiment_folder = where+"/"+experiment+"/"
  69. if not os.path.isdir(experiment_folder):
  70. print("Experiment folder not found!")
  71. print("Create and start the experiment "+experiment+" first.")
  72. print("Aborting...")
  73. sys.exit(1)
  74. # check if the experiment is still running
  75. queue = subprocess.run(['ssh', 'cca', '/opt/pbs/13.0.403.161593/bin/qstat', '-u', user], stdout=subprocess.PIPE, timeout=360)
  76. #if 'plasim_'+experiment[:3] in str(queue.stdout):
  77. # print("Experiment still running on cca, no need to restart!")
  78. # print("Aborting...")
  79. # sys.exit(1)
  80. queue = subprocess.run(['ssh', 'ccb', '/opt/pbs/13.0.403.161593/bin/qstat', '-u', user], stdout=subprocess.PIPE, timeout=360)
  81. #if 'plasim_'+experiment[:3] in str(queue.stdout):
  82. # print("Experiment still running on ccb, no need to restart!")
  83. # print("Aborting...")
  84. # sys.exit(1)
  85. #print('Saving the results of the previous run...')
  86. #os.system('mkdir -p '+save)
  87. #os.system('mkdir -p '+save_light)
  88. #os.system('mkdir -p '+scratch_dir+'/tmp/'+experiment)
  89. #for i in range(1, ensemble_size+1):
  90. # member_number = str(i).rjust(2, '0')
  91. # save_experiment_folder = save + 'run_'+experiment+'_'+member_number
  92. # save_experiment_folder_light = save_light + 'run_'+experiment+'_'+member_number
  93. # os.system('mkdir -p ' + save_experiment_folder)
  94. # # list all the past runs saved
  95. # past_run_list = os.listdir(save_experiment_folder)
  96. # if len(past_run_list) == 0:
  97. # past_run_list = [0]
  98. # else:
  99. # past_run_list = list(map(int, past_run_list))
  100. # past_run_list.sort()
  101. # last_experiment_index = past_run_list[-1]
  102. # new_experiment_index = last_experiment_index + 1
  103. # # temporarily move previous experiment run to scratch
  104. # if last_experiment_index > 0:
  105. # to_ecfs = save_experiment_folder+'/'+str(last_experiment_index)
  106. # to_light = save_experiment_folder_light+'/'+str(last_experiment_index)
  107. # os.system('mkdir -p '+scratch_dir+'/tmp/'+experiment + '/run_'+experiment+'_'+member_number)
  108. # os.system('rsync -a '+to_ecfs+' '+scratch_dir+'/tmp/'+experiment + '/run_'+experiment+'_'+member_number)
  109. # os.system('rm -rf '+to_ecfs)
  110. # os.system('rm -rf '+to_light)
  111. # # make the directory to save the result of the last experiment
  112. # dest = save_experiment_folder+'/'+str(new_experiment_index)
  113. # os.system('mkdir -p '+dest)
  114. # os.system('mkdir -p '+dest+'/restart/')
  115. # os.system('mkdir -p '+dest+'/output/')
  116. # # save the result of the last experiment
  117. # ensemble_member_folder = experiment_folder+'run_'+experiment+'_'+member_number
  118. # os.system('rsync -a '+ensemble_member_folder+'/output/* '+dest+'/output/')
  119. # os.system('rsync -a '+ensemble_member_folder+'/restart/kleiswi '+dest+'/restart/')
  120. # #os.system('rsync -a '+ensemble_member_folder+'/restart/*'+restart_year_to_save+' '+dest+'/restart/')
  121. # os.system('rsync -a '+ensemble_member_folder+'/restart/* '+dest+'/restart/')
  122. # # generating partial output files
  123. # dest_light = save_experiment_folder_light+'/'+str(new_experiment_index)
  124. # os.system('mkdir -p '+dest_light)
  125. # os.system('mkdir -p '+dest_light+'/restart/')
  126. # os.system('mkdir -p '+dest_light+'/output/')
  127. # os.system('rsync -a '+ensemble_member_folder+'/output/*.txt '+dest_light+'/output/')
  128. # os.system('rsync -a '+ensemble_member_folder+'/restart/kleiswi '+dest_light+'/restart/')
  129. # #os.system('rsync -a '+ensemble_member_folder+'/restart/*'+restart_year_to_save+' '+dest_light+'/restart/')
  130. # os.system('rsync -a '+ensemble_member_folder+'/restart/* '+dest_light+'/restart/')
  131. # nc_list = glob.glob(ensemble_member_folder+'/output/*PLA*.nc')
  132. # for infile in nc_list:
  133. # filename = infile.split('/')[-1]
  134. # os.system('nccopy -V '+','.join(PLASIM_VAR)+' '+infile+' '+dest_light+'/output/'+filename)
  135. # nc_list = glob.glob(ensemble_member_folder+'/output/*OCE*.nc')
  136. # for infile in nc_list:
  137. # filename = infile.split('/')[-1]
  138. # os.system('nccopy -V '+','.join(OCEAN_VAR)+' '+infile+' '+dest_light+'/output/'+filename)
  139. # nc_list = glob.glob(ensemble_member_folder+'/output/*LSG*.nc')
  140. # for infile in nc_list:
  141. # filename = infile.split('/')[-1]
  142. # os.system('nccopy -V '+','.join(LSG_VAR)+' '+infile+' '+dest_light+'/output/'+filename)
  143. #
  144. #
  145. #print('Backup of the previous run done !')
  146. #
  147. #if last_experiment_index > 0:
  148. # print('Creating the tar archive of the experiment run number '+str(last_experiment_index))
  149. # print('and saving it in ECFS temporary storage...')
  150. # j = last_experiment_index
  151. # yts = int(restart_year_to_save)
  152. #
  153. # ##
  154. # os.system('cd '+scratch_dir+'/tmp/ && tar -c -f '+scratch_dir+'/tmp/plasim_'+experiment+'_years_'+str(yts*(j-1)+1)+'to'+str(yts*j)+'.tar '+experiment)
  155. # ##
  156. #
  157. # #os.system('tar -c -f '+scratch_dir+'/tmp/plasim_'+experiment+'_years_'+str(yts*(j-1)+1)+'to'+str(yts*j)+'.tar '+scratch_dir+'/tmp/'+experiment)
  158. # os.system('rm -rf '+scratch_dir+'/tmp/'+experiment+'/*')
  159. # queue = subprocess.run(['emkdir', '-p', 'ectmp:'+'/'+user+'/'+basedir+'/'+experiment], check=True)
  160. # os.system('ecp -t '+scratch_dir+'/tmp/plasim_'+experiment+'_years_'+str(yts*(j-1)+1)+'to'+str(yts*j)+'.tar '+ 'ectmp:'+'/'+user+'/'+basedir+'/'+experiment+'/')
  161. # print('Backup and move to ecfs done !')
  162. #
  163. print('Starting the ensemble runs...')
  164. for i in range(1, ensemble_size+1):
  165. member_number = str(i).rjust(2, '0')
  166. job_name = 'plasim_'+experiment+'_'+member_number
  167. ensemble_member_folder = experiment_folder+'run_'+experiment+'_'+member_number
  168. os.system('qsub '+ensemble_member_folder+'/PBS_'+job_name)
  169. print("Experiment '"+experiment+"' ensemble restarted.")
  170. print('Check the status with: qstat -u '+user)