From 6c2e18a3bccde5241e0ad7e3252a9253aac01672 Mon Sep 17 00:00:00 2001 From: Robert Izzard <r.izzard@surrey.ac.uk> Date: Sat, 6 Nov 2021 23:18:15 +0000 Subject: [PATCH] still buggy - this is just to back up to the server --- binarycpython/utils/grid.py | 78 ++++++++------------ binarycpython/utils/grid_options_defaults.py | 2 +- 2 files changed, 33 insertions(+), 47 deletions(-) diff --git a/binarycpython/utils/grid.py b/binarycpython/utils/grid.py index 213a89aa7..3692971e4 100644 --- a/binarycpython/utils/grid.py +++ b/binarycpython/utils/grid.py @@ -36,6 +36,7 @@ import msgpack import multiprocessing import os import pathlib +import pprint # for debugging only import psutil import py_rinterpolate import re @@ -1069,21 +1070,24 @@ class Population: self._pre_run_cleanup() if self.grid_options["slurm"]>=1: - self.grid_options["symlink latest gridcode"] = False + self.grid_options["symlink_latest_gridcode"] = False if self.grid_options["condor"] >= 1: # Execute condor subroutines # self._condor_grid() raise ValueError("Condor evolution not available at this moment") + elif self.grid_options["slurm"] == 1: # Slurm setup grid self.slurm_grid() # and then exit + print("Slurm jobs launched : exiting") sys.exit() else: # Execute population evolution subroutines self._evolve_population() + print("do analytics") # Put all interesting stuff in a variable and output that afterwards, as analytics of the run. analytics_dict = { "population_name": self.grid_options["_population_id"], @@ -1115,7 +1119,8 @@ class Population: self.save_population_object() # if we're running a slurm grid, exit here - if self.grid_options["slurm"] >= 1: + # unless we're joining + if self.grid_options["slurm"] >= 1 and self.grid_options['evolution_type'] != 'join': sys.exit() ## @@ -1123,7 +1128,6 @@ class Population: # because that makes for easier control self._cleanup() - return analytics_dict def _evolve_population(self): @@ -1147,14 +1151,19 @@ class Population: # special cases if self.grid_options['evolution_type'] == 'join': joinfiles = self.joinfiles() - - if self.can_join(joinfiles): + joiningfile = self.slurmpath('joining') + if self.can_join(joinfiles,joiningfile): # join object files - print("can join : all tasks are finished") - self.join_from_files(joinfiles) + try: + pathlib.Path(joiningfile).touch(exist_ok=False) + print("can join : all tasks are finished") + self.join_from_files(joinfiles) + except: + pass else: print("cannot join : other tasks are not yet finished") - return + print("Finished this job : exiting") + sys.exit() ############################################################ # Evolve systems @@ -2820,7 +2829,7 @@ class Population: file.write(self.code_string) # perhaps create symlink - if self.grid_options["symlink latest gridcode"]: + if self.grid_options["symlink_latest_gridcode"] and self.grid_options["slurm"] == 0: global _count symlink = os.path.join( self.grid_options["tmp_dir"], "binary_c_grid-latest" + str(_count) @@ -5079,7 +5088,7 @@ eccentricity3=0 # make a list of directories, these contain the various slurm # output, status files, etc. dirs = [] - for dir in ['scripts','stdout','stderr','results','logs','status','joining']: + for dir in ['stdout','stderr','results','status']: dirs.append(self.slurmpath(dir)) # make the directories: we do not allow these to already exist @@ -5124,8 +5133,8 @@ eccentricity3=0 if self.grid_options['evolution_type'] == 'grid': # run a grid of stars only, leaving the results # in a file - print("Run grid") + # get number of cpu cores available to us ncpus = max(1,psutil.cpu_count(logical=True)) @@ -5138,39 +5147,7 @@ eccentricity3=0 return self.evolve() elif self.grid_options['evolution_type'] == 'join': - # join the data from multiple grid runs - - # TODO : fix this - joinfile = os.path.join(self.slurmpath(),'joining', self.grid_options['slurm_jobid']) - - if os.path.exists(joinfile): - print("Another process is already joining") - return - - for n in range(1,self.grid_options['slurm_njobs']+1): - results_dumpfile = os.path.join(slurmpath('status'),self.grid_options['slurm_jobid'] + '.' + n) - print("Check file {file} (jobid {jobid}, n {n}\n".format( - file=results_dumpfle, - jobid=self.grid_options['slurm_jobid'], - n=n)) - status = self.get_slurm_status(jobid=self.grid_options['slurm_jobid'], - jobarrayindex=n) - if status != "finished": - print("... is not finished") - return - else: - print("... is finished") - - # attempt to ~atomically create the joinfile - # https://stackoverflow.com/questions/33223564/atomically-creating-a-file-if-it-doesnt-exist-in-python - try: - pathlib.Path(joinfile).touch(exist_ok=False) - except: - # already joining - return - - self.grid_options['rungrid'] = 0 - + # should not happen! return else: # setup and launch slurm jobs @@ -5322,6 +5299,8 @@ eccentricity3=0 print("Save pickle to ",filename) print("pop is ",self.grid_options["_population_id"]) print("probtot ",object.grid_options['_probtot']) + #print("grid_ensemble_results",pprint.pprint(object.grid_ensemble_results, sort_dicts=False)) + # remove shared memory shared_memory = object.shared_memory object.shared_memory = None @@ -5350,7 +5329,6 @@ eccentricity3=0 return None else: obj = compress_pickle.load(filename) - print("loaded obj",obj) return obj def merge_grid_object_results(self,refpop,newpop): @@ -5370,8 +5348,13 @@ eccentricity3=0 print("merge dicts") print("left: ",refpop.grid_results) print("right:",newpop.grid_results) + + # combine data refpop.grid_results = merge_dicts(refpop.grid_results, newpop.grid_results) + refpop.grid_ensemble_results = merge_dicts(refpop.grid_ensemble_results, + newpop.grid_ensemble_results) + print("probs left ",refpop.grid_options["_probtot"],"right",newpop.grid_options["_probtot"]) for key in ["_probtot"]: refpop.grid_options[key] += newpop.grid_options[key] @@ -5409,9 +5392,12 @@ eccentricity3=0 file) print("done join from files") - def can_join(self,joinfiles): + def can_join(self,joinfiles,joiningfile): # check the joinfiles to make sure they all exist # and their .saved equivalents also exist + + if os.path.exists(joiningfile): + return False for file in joinfiles: print("check for ",file) if os.path.exists(file) == False: diff --git a/binarycpython/utils/grid_options_defaults.py b/binarycpython/utils/grid_options_defaults.py index ddc07e224..ed37dc8df 100644 --- a/binarycpython/utils/grid_options_defaults.py +++ b/binarycpython/utils/grid_options_defaults.py @@ -128,7 +128,7 @@ grid_options_defaults_dict = { ## Grid type evolution "_grid_variables": {}, # grid variables "gridcode_filename": None, # filename of gridcode - "symlink latest gridcode": True, # symlink to latest gridcode + "symlink_latest_gridcode": True, # symlink to latest gridcode "save_population_object" : None, # filename to which we should save a pickled grid object as the final thing we do 'joinlist' : None, ## Monte carlo type evolution -- GitLab