diff --git a/binarycpython/utils/HPC.py b/binarycpython/utils/HPC.py
index 83c41af8c65917cd48c2392a7708d58c443d5690..f91807df133c41928b91f34a6e66622cab8679f9 100644
--- a/binarycpython/utils/HPC.py
+++ b/binarycpython/utils/HPC.py
@@ -8,10 +8,11 @@ other modules can use a single API rather than have to choose to use the
 Slurm or Condor API.
 """
-
+import datetime
 import glob
 import os
 import pathlib
+import time
 from binarycpython.utils.slurm import slurm
 from binarycpython.utils.condor import condor
@@ -22,6 +23,69 @@ class HPC(condor,slurm):
         # don't do anything: we just inherit from this class
         return
 
+    def HPC_njobs(self):
+        """
+        Function to return the number of jobs this HPC run will use, as an int.
+        """
+        if self.grid_options['slurm'] > 0:
+            n = self.grid_options['slurm_njobs']
+        elif self.grid_options['condor'] > 0:
+            n = self.grid_options['condor_njobs']
+        else:
+            n = None
+        return int(n)
+
+    def HPC_make_joiningfile(self,
+                             id=None,
+                             dir=None,
+                             n=None,
+                             overwrite=False,
+                             error_on_overwrite=False):
+        """
+        Function to make the joiningfile that contains the filenames of results from each job. When all these exist, we can join.
+
+        Note: you normally don't need to set any of the option arguments.
+
+        Args:
+            dir : the HPC directory, or self.HPC_dir() if None (default=None).
+            n : the number of jobs, or self.HPC_njobs() if None (default=None).
+            id : the job ID number, or self.HPC_jobID_tuple()[0] if None (default=None).
+            overwrite : if True, overwrite an existing joiningfile (default=False)
+            error_on_overwrite : if True, and we try to overwrite, issue an error and exit (default=False)
+
+        """
+
+        # defaults
+        if dir is None:
+            dir = self.HPC_dir()
+        if n is None:
+            n = self.HPC_njobs()
+        if id is None:
+            id = self.HPC_jobID_tuple()[0]
+
+        # make path and filename
+        prefix = os.path.join(dir,'results')
+        file = os.path.join(prefix,id + '.all')
+
+        # check the joiningfile doesn't exist
+        if not overwrite and os.path.isfile(file):
+            print("Cannot make joiningfile at {file} because it already exists.".format(file=file))
+            # perhaps cause an error if it does
+            if error_on_overwrite:
+                self.exit(code=1)
+            return
+
+        # write the joiningfile
+        print("Making joiningfile at {file} with range 0 to {n}".format(
+            file=file,
+            n=n
+        ))
+        with open(file,"w",encoding="utf-8") as f:
+            for i in range(0,n):
+                f.write(os.path.join(prefix,
+                                     "{i}.gz\n".format(i=i)))
+        return
+
     def HPC_joinfiles(self,joinlist=None):
         """
         Function to load in the joinlist to a list and return it.
@@ -45,8 +109,8 @@ class HPC(condor,slurm):
             if self.grid_options['HPC_prepend_dir_to_joinlist'] is True:
                 list = [os.path.join(prefix,file) for file in list]
-        except:
-            print("Failed to open joinlist at {list}".format(list=joinlist))
+        except Exception as e:
+            print("Failed to open joinlist at {list} : {e}".format(list=joinlist,e=e))
             self.exit(code=1)
         return list
 
@@ -65,10 +129,11 @@ class HPC(condor,slurm):
         Check the joinfiles to make sure they all exist
         and their .saved equivalents also exist
         """
+        print("HPC check if we can join at {}".format(self.now()))
         if self.grid_options['HPC_force_join'] == 0 and \
            os.path.exists(joiningfile):
             if vb:
-                print("cannot join: joiningfile exists at {}".format(joiningfile))
+                print("cannot join : joiningfile exists at {} (check 1)".format(joiningfile))
             return False
         elif vb:
             print("joiningfile (at {}) does not exist".format(joiningfile))
@@ -77,14 +142,14 @@ class HPC(condor,slurm):
                 print("check for {}".format(file))
             if os.path.exists(file) == False:
                 if vb:
-                    print("cannot join: {} does not exist".format(file))
+                    print("cannot join : file \"{}\" does not exist".format(file))
                 return False
             savedfile = file + '.saved'
             if vb:
                 print("check for {}".format(savedfile))
             if os.path.exists(savedfile) == False:
                 if vb:
-                    print("cannot join: {} does not exist".format(savedfile))
+                    print("cannot join : savedfile \"{}\" does not exist".format(savedfile))
                 return False
 
         # found both files
@@ -97,17 +162,17 @@ class HPC(condor,slurm):
             x = True
         elif os.path.exists(joiningfile):
             if vb:
-                print("cannot join: joiningfile exists at {}".format(joiningfile))
+                print("cannot join: joiningfile exists at {} (check 2)".format(joiningfile))
             x = False
         elif vb:
-            print("joiningfile does not exist : can join")
+            print("joiningfile at {} does not exist : can join".format(joiningfile))
            x = True
 
        if vb:
            print("returning {} from HPC_can_join()".format(x))
        return x
 
-    def HPCjob(self):
+    def HPC_job(self):
         """
         Function to return True if we're running an HPC (Slurm or Condor) job, False otherwise.
         """
@@ -117,8 +182,21 @@ class HPC(condor,slurm):
         else:
             x = False
         return x
-
-    def HPCjobtype(self):
+
+    def HPC_job_task(self):
+        """
+        Function to return the HPC task number, which is 1 when setting
+        up and launching the scripts, and 2 when running the grid and joining data.
+        """
+        if self.grid_options['slurm'] > 0:
+            x = self.grid_options['slurm']
+        elif self.grid_options['condor'] > 0:
+            x = self.grid_options['condor']
+        else:
+            x = 0
+        return x
+
+    def HPC_job_type(self):
         """
         Function to return a string telling us the type
         of an HPC job, i.e. "slurm", "condor" or "None".
@@ -131,7 +209,7 @@ class HPC(condor,slurm):
             type = "None"
         return type
 
-    def HPCjobID(self):
+    def HPC_jobID(self):
         """
         Function to return an HPC (Slurm or Condor) job id in the form x.y. Returns None if not an HPC job.
""" @@ -144,16 +222,22 @@ class HPC(condor,slurm): id = None return id - def HPCjobIDtuple(self): + def HPC_jobID_tuple(self): """ - Return the job ID as a tuple, (x,y), or (None,None) on failure + Return the job ID as a tuple of ints, (x,y), or (None,None) on failure """ - id = self.HPCjobID() - if id: - t = tuple(id.split('.')) - else: - t = (None,None) - return t + id = self.HPC_jobID() + + if id.startswith('None'): + t = [None,None] + elif self.HPC_job(): + print("JOBID",id) + t = id.split('.') + if not t[0]: + t[0] = None + if not t[1]: + t[1] = None + return tuple(t) def HPC_set_status(self,string): """ @@ -202,18 +286,33 @@ class HPC(condor,slurm): dirs = [] return dirs - def HPCgrid(self): + + def HPC_grid(self,makejoiningfile=True): """ Function to call the appropriate HPC grid function - (e.g. Slurm or Condor) and return what it returns + (e.g. Slurm or Condor) and return what it returns. + + Args: + makejoiningfile : if True, and we're the first job with self.HPC_task() == 2, we build the joiningfile. (default=True) This option exists in case you don't want to overwrite an existing joiningfile, or want to build it in another way (e.g. in the HPC scripts). """ + id = self.HPC_jobID_tuple()[0] + + + if makejoiningfile and \ + self.HPC_job_task() == 2 and \ + id is not None: + print("Making joiningfile from HPC_grid().") + self.HPC_make_joiningfile() + if self.grid_options['slurm'] > 0: - return self.slurm_grid() + x = self.slurm_grid() elif self.grid_options['condor'] > 0: - return self.condor_grid() + x = self.condor_grid() else: - return None # should not happen + x =None # should not happen + return x + def HPC_check_requirements(self): """ Function to check HPC option requirements have been met. Returns a tuple: (True,"") if all is ok, (False,<warning string>) otherwise. @@ -265,10 +364,10 @@ class HPC(condor,slurm): """ Set grid_options['restore_from_snapshot_file'] so that we restore data from existing an HPC run if self.grid_options[type+'_restart_dir'], where type is "slurm" or "condor", - is provided, otherwise do nothing. This only works if grid_options[type] == 2, which is + is provided, otherwise do nothing. This only works if grid_options[type] == self.HPC_job_task() == 2, which is the run-grid stage of the process. 
""" - type = self.HPCjobtype() + type = self.HPC_job_type() if type is None: return key = type + '_restart_dir' @@ -279,10 +378,10 @@ class HPC(condor,slurm): if dir is None: return # get HPC job index - index = self.HPCjobIDtuple()[1] + index = self.HPC_jobID_tuple()[1] if index is None: return - if self.grid_options[type] == 2: + if self.HPC_job_task() == 2: # (same as) self.grid_options[type] == 2: old_id = self.HPC_id_from_dir(dir) print("Restart from dir {dir} which was has (old) ID {old_id}, we are job index {index}".format( dir=dir, @@ -292,11 +391,6 @@ class HPC(condor,slurm): # check status: if "finished", we don't have to do anything status = self.HPC_get_status(dir=dir) - #file = os.path.join(dir, - # 'status', - # "{id}.{index}".format(id=old_id, - # index=index)) - #status = open(file,encoding='utf-8').read() if status == 'finished': print("Status is finished, cannot and do not need to restart.") @@ -334,15 +428,16 @@ class HPC(condor,slurm): else: # our job has finished joinfiles = self.HPC_joinfiles() - joiningfile = self.HPCpath('joining') - print("Joinfiles: ",joinfiles) - print("Joingingfiles: ",joiningfile) + joiningfile = self.HPC_path('joining') + print("Joinfiles : ",joinfiles) + print("Joingingfiles : ",joiningfile) if self.HPC_can_join(joinfiles,joiningfile,vb=True): # join object files print("We can join") try: # touch joiningfile if self.grid_options['HPC_force_join'] == 0: + print("Making joiningfile at {}",joiningfile) pathlib.Path(joiningfile).touch(exist_ok=False) try: print("Calling HPC_join_from_files()") @@ -353,15 +448,15 @@ class HPC(condor,slurm): # values we just loaded self.grid_options['do_analytics'] = False return - except: - print("pass") + except Exception as e: + print("pass {}",e) pass else: print("cannot join : other tasks are not yet finished\n") print("Finished this job : exiting") self.exit(code=1) - def HPCpath(self,path): + def HPC_path(self,path): """ Function to file the filename of this HPC job's file at path. """ @@ -377,10 +472,10 @@ class HPC(condor,slurm): """ Function to return an HPC job's snapshot filename. 
""" - if self.HPCjob(): + if self.HPC_job(): file = os.path.join(self.HPC_dir, 'snapshots', - self.HPCjobID() + '.gz') + self.HPC_jobID() + '.gz') else: file = None return file diff --git a/binarycpython/utils/condor.py b/binarycpython/utils/condor.py index 4f1d6eb5b8483a4c7f6e7582a555d5f73ab68fa5..85e621210f460cae0b0a43686fcfa9536eb0b2dd 100644 --- a/binarycpython/utils/condor.py +++ b/binarycpython/utils/condor.py @@ -283,7 +283,7 @@ export BINARY_C_PYTHON_ORIGINAL_SUBMISSION_TIME=`{date}` echo \"running\" > {condor_dir}/status/$ClusterID.$ProcessID # make list of files which is checked for joining -echo {condor_dir}/results/$ClusterID.$Process.gz >> {condor_dir}/results/$ClusterID.all +# echo {condor_dir}/results/$ClusterID.$Process.gz >> {condor_dir}/results/$ClusterID.all # run grid of stars and, if this returns 0, set status to finished {grid_command} condor=2 evolution_type=grid condor_ClusterID=$ClusterID condor_Process=$Process save_population_object={condor_dir}/results/$ClusterID.$Process.gz && echo -n \"finished\" > {condor_dir}/status/$ClusterID.$ProcessID && echo """.format( diff --git a/binarycpython/utils/dataIO.py b/binarycpython/utils/dataIO.py index 88396d84ff26e75b08f60f7da3e52f182ef4b01c..6bf7f703fe21d8839e0e2c359873b1620dd1a96b 100644 --- a/binarycpython/utils/dataIO.py +++ b/binarycpython/utils/dataIO.py @@ -92,7 +92,7 @@ class dataIO(): object.grid_ensemble_results["metadata"] = {} # add datestamp - object.grid_ensemble_results["metadata"]['save_population_time'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S") + object.grid_ensemble_results["metadata"]['save_population_time'] = self.now() # add extra metadata object.add_system_metadata() @@ -230,7 +230,7 @@ class dataIO(): """ Automatically choose the snapshot filename. """ - if self.HPCjob(): + if self.HPC_job(): return self.HPC_snapshot_filename() else: file = os.path.join(self.grid_options['tmp_dir'], @@ -485,5 +485,5 @@ class dataIO(): f.close() # custom logging functions for HPC jobs - if self.HPCjob(): + if self.HPC_job(): self.HPC_set_status(string) diff --git a/binarycpython/utils/grid.py b/binarycpython/utils/grid.py index 5b6b1a6bbfff541e8d2ab100e5023fe430e1a63e..bcd8861a76fec580071f7ed0e60e802fe11f1f7d 100644 --- a/binarycpython/utils/grid.py +++ b/binarycpython/utils/grid.py @@ -212,10 +212,10 @@ class Population(analytics, Function to return the job ID number of this process Normal processes return their process ID (PID) - HPC processes return whatever HPCjobID() gives. + HPC processes return whatever HPC_jobID() gives. 
""" - if self.HPCjob(): - jobID = self.HPCjobID() + if self.HPC_job(): + jobID = self.HPC_jobID() else: jobID = "{}".format(self.process_ID) return jobID @@ -395,12 +395,13 @@ class Population(analytics, try: value = type(old_value)(value) self.verbose_print("Success!", self.grid_options["verbosity"], 2) - except: - print("Failed to convert {param} value with type {type}: old_value is '{old}', new value is '{new}'".format( + except Exception as e: + print("Failed to convert {param} value with type {type}: old_value is '{old}', new value is '{new}', {e}".format( param=parameter, old=old_value, type=type(old_value), - new=split[1] + new=split[1], + e=e )) self.exit(code=1) @@ -581,9 +582,7 @@ class Population(analytics, if self.custom_options.get("data_dir", None): if not self.custom_options.get("base_filename", None): base_name = "simulation_{}".format( - datetime.datetime.strftime( - datetime.datetime.now(), "%Y%m%d_%H%M%S" - ) + self.now(style="nospace") ) else: base_name = os.path.splitext(self.custom_options["base_filename"])[ @@ -721,7 +720,7 @@ class Population(analytics, self.exit(code=1) # check any HPC requirements are met - if self.HPCjob() and not self.HPC_check_requirements()[0]: + if self.HPC_job() and not self.HPC_check_requirements()[0]: print(self.HPC_check_requirements()[1]) self.exit(code=1) @@ -790,10 +789,10 @@ class Population(analytics, # Just to make sure we don't have stuff from a previous run hanging around self._pre_run_setup() - if self.HPCjob(): + if self.HPC_job(): # run HPC grid: if this returns True, then exit immediately self.grid_options["symlink_latest_gridcode"] = False - if self.HPCgrid(): + if self.HPC_grid(): self.exit(code=0) if self.grid_options['evolution_type'] == 'join': @@ -824,7 +823,7 @@ class Population(analytics, # if we're running an HPC grid, exit here # unless we're joining - if self.HPCgrid() and \ + if self.HPC_grid() and \ self.grid_options['evolution_type'] != 'join': self.exit() @@ -2250,3 +2249,24 @@ class Population(analytics, self.grid_options["verbosity"], 3, ) + + def now(self,style=None): + """ + convenience function to return a string of the current time, + using the format "%m/%d/%Y %H:%M:%S" + + Args: + style : if "nospace" then return the date/time with the format + "%Y%m%d_%H%M%S" + """ + if style == 'nospace': + return datetime.datetime.strftime( + datetime.datetime.now(), + "%Y%m%d_%H%M%S" + ) + else: + return datetime.datetime.strftime( + datetime.datetime.now(), + "%m/%d/%Y %H:%M:%S" + ) + diff --git a/binarycpython/utils/grid_options_defaults.py b/binarycpython/utils/grid_options_defaults.py index e1cc9a492211073206768481c22fec886e6516e5..cdca850744d153e40e6319994a0fbf7763880af7 100644 --- a/binarycpython/utils/grid_options_defaults.py +++ b/binarycpython/utils/grid_options_defaults.py @@ -65,7 +65,7 @@ class grid_options_defaults(): "original_command_line": os.getenv('BINARY_C_PYTHON_ORIGINAL_CMD_LINE'), "working_diretory": os.getcwd(), "original_working_diretory": os.getenv('BINARY_C_PYTHON_ORIGINAL_WD'), - "start_time": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), + "start_time": self.now(), "original_submission_time" : os.getenv('BINARY_C_PYTHON_ORIGINAL_SUBMISSION_TIME'), ########################## diff --git a/binarycpython/utils/gridcode.py b/binarycpython/utils/gridcode.py index 1ec3fe9f895f9a9e54d9cf1a225c6d7e377130fc..03c9e9ff8510112d73b4e6709b39c92baf031ae8 100644 --- a/binarycpython/utils/gridcode.py +++ b/binarycpython/utils/gridcode.py @@ -26,7 +26,7 @@ class gridcode(): """ Returns a filename 
         for the gridcode.
         """
-        if self.HPCjob():
+        if self.HPC_job():
             filename = os.path.join(
                 self.grid_options["tmp_dir"],
                 "binary_c_grid_{population_id}.{jobid}.py".format(
@@ -614,7 +614,7 @@ class gridcode():
             file.write(self.code_string)
 
         # perhaps create symlink
-        if not self.HPCjob() and self.grid_options["symlink_latest_gridcode"]:
+        if not self.HPC_job() and self.grid_options["symlink_latest_gridcode"]:
             global _count
             symlink = os.path.join(
                 self.grid_options["tmp_dir"], "binary_c_grid-latest" + str(_count)
diff --git a/binarycpython/utils/metadata.py b/binarycpython/utils/metadata.py
index 45e6cbf7c56a3f6951953e245a199fe2c591a6df..e35bdb995f71d8a1aeef186bcc09e05cb33d698c 100644
--- a/binarycpython/utils/metadata.py
+++ b/binarycpython/utils/metadata.py
@@ -28,7 +28,7 @@ class metadata():
             self.grid_ensemble_results["metadata"] = {}
 
         # add date
-        self.grid_ensemble_results["metadata"]['date'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
+        self.grid_ensemble_results["metadata"]['date'] = self.now()
 
         # add platform and build information
         try:
diff --git a/binarycpython/utils/slurm.py b/binarycpython/utils/slurm.py
index cd03e4166f3b916484e150ee9f469cecc3cc8b8f..8f20683937aaa790bf3af8b725fa70ca018cebe8 100644
--- a/binarycpython/utils/slurm.py
+++ b/binarycpython/utils/slurm.py
@@ -299,7 +299,7 @@ export BINARY_C_PYTHON_ORIGINAL_SUBMISSION_TIME=`{date}`
 echo \"running\" > {slurm_dir}/status/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID
 
 # make list of files which is checked for joining
-echo {slurm_dir}/results/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID.gz >> {slurm_dir}/results/$SLURM_ARRAY_JOB_ID.all
+# echo {slurm_dir}/results/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID.gz >> {slurm_dir}/results/$SLURM_ARRAY_JOB_ID.all
 
 # run grid of stars and, if this returns 0, set status to finished
 {grid_command} slurm=2 evolution_type=grid slurm_jobid=$SLURM_ARRAY_JOB_ID slurm_jobarrayindex=$SLURM_ARRAY_TASK_ID save_population_object={slurm_dir}/results/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID.gz && echo -n \"finished\" > {slurm_dir}/status/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID && echo
 """.format(
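For reference, a minimal sketch of the launch side of the workflow this patch sets up, driven through the public Population API (Population.set() and Population.evolve()). The option values and the directory below are hypothetical examples for the Slurm case, not defaults taken from this patch.

# Illustrative sketch only: option values and the slurm_dir path are hypothetical.
from binarycpython.utils.grid import Population

pop = Population()
pop.set(
    slurm=1,                          # task 1: set up and submit the Slurm jobs
    slurm_njobs=10,                   # number of jobs, as reported by HPC_njobs()
    slurm_dir="/tmp/binary_c_slurm",  # working directory for scripts, status and results
)
pop.evolve()

# Each submitted job then re-runs with slurm=2 (HPC_job_task() == 2): HPC_grid()
# writes the joiningfile listing the expected per-job result files, each job runs
# its share of the grid, and the last job to finish joins the saved results.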