From 99a016109dfdf39846e07ed4e3b82118fc825b75 Mon Sep 17 00:00:00 2001 From: Robert Izzard <r.izzard@surrey.ac.uk> Date: Sat, 13 Nov 2021 12:22:37 +0000 Subject: [PATCH] fix issues with CPU_time calculation cleaned up the evolve function so the option to join is selected before it is called, and this now has its own function --- binarycpython/utils/analytics.py | 33 +++++++- binarycpython/utils/dataIO.py | 1 + binarycpython/utils/grid.py | 125 +++++++++++++++---------------- binarycpython/utils/metadata.py | 1 - binarycpython/utils/version.py | 26 +++---- 5 files changed, 106 insertions(+), 80 deletions(-) diff --git a/binarycpython/utils/analytics.py b/binarycpython/utils/analytics.py index 4e421d1a3..43c557fd9 100644 --- a/binarycpython/utils/analytics.py +++ b/binarycpython/utils/analytics.py @@ -50,22 +50,49 @@ class analytics(): return analytics_dict - def time_elapsed(self): + def set_time(self,when): + """ + Function to set the timestamp at when, where when is 'start' or 'end'. + + If when == end, we also calculate the time elapsed. + """ + self.grid_options['_' + when + '_time_evolution'] = time.time() + if when == 'end': + self.grid_options["_time_elapsed"] = self.time_elapsed(force=True) + + def time_elapsed(self,force=False): """ Function to return how long a population object has been running. + + We return the cached value if it's available, and calculate + the time elapsed if otherwise or if force is True """ for x in ["_start_time_evolution","_end_time_evolution"]: if not self.grid_options[x]: self.grid_options[x] = time.time() - return self.grid_options["_end_time_evolution"] - self.grid_options["_start_time_evolution"] + print("{} missing : {}",x,self.grid_options[x]) + + if force or "_time_elapsed" not in self.grid_options: + self.grid_options["_time_elapsed"] = self.grid_options["_end_time_evolution"] - self.grid_options["_start_time_evolution"] + print("set time elapsed = {} - {} = {}".format( + self.grid_options["_end_time_evolution"], + self.grid_options["_start_time_evolution"], + self.grid_options["_time_elapsed"], + )) + + return self.grid_options["_time_elapsed"] def CPU_time(self): """ Function to return how much CPU time we've used """ - dt = self.time_elapsed() + dt = self.grid_options["_time_elapsed"] try: ncpus = self.grid_options['num_processes'] except: ncpus = 1 + print("CPU time : dt={} n={} -> {}".format( + dt, + ncpus, + dt*ncpus)) return dt * ncpus diff --git a/binarycpython/utils/dataIO.py b/binarycpython/utils/dataIO.py index fe6cbf065..39ca476e5 100644 --- a/binarycpython/utils/dataIO.py +++ b/binarycpython/utils/dataIO.py @@ -4,6 +4,7 @@ import bz2 import compress_pickle +import copy import datetime import gzip import json diff --git a/binarycpython/utils/grid.py b/binarycpython/utils/grid.py index 1008893d5..c383e7dca 100644 --- a/binarycpython/utils/grid.py +++ b/binarycpython/utils/grid.py @@ -830,7 +830,7 @@ class Population(analytics, # Just to make sure we don't have stuff from a previous run hanging around self._pre_run_setup() - if self.grid_options["slurm"]>=1: + if self.grid_options["slurm"]>=1 or self.grid_options["condor"]>=1: self.grid_options["symlink_latest_gridcode"] = False if self.grid_options["condor"] >= 1: @@ -841,8 +841,13 @@ class Population(analytics, elif self.grid_options["slurm"] == 1: # Slurm setup grid then exit self.slurm_grid() - self.exit(code=0) + + elif self.grid_options['evolution_type'] == 'join': + # join previously calculated data and return immediately + self.join_previous() + return + else: # Execute population evolution subroutines self._evolve_population() @@ -889,50 +894,16 @@ class Population(analytics, - TODO: include options for different ways of generating a population here. (i.e. MC or source file) """ - ## + ############################################################ # Prepare code/initialise grid. # set custom logging, set up store_memaddr, build grid code. dry run grid code. self._setup() - # special cases - if self.grid_options['evolution_type'] == 'join': - - # check that our job has finished - status = self.get_slurm_status() - - if status != "finished": - # job did not finish : save a snapshot - print("This job did not finish (status is {status}) : cannot join".format(status=status)) - self.exit(code=1) - else: - # our job has finished - joinfiles = self.joinfiles() - joiningfile = self.slurmpath('joining') - if self.can_join(joinfiles,joiningfile): - # join object files - try: - pathlib.Path(joiningfile).touch(exist_ok=False) - print("can join : all tasks are finished") - try: - self.join_from_files(self,joinfiles) - except Exception as e: - print("Join gave exception",e) - # disable analytics calculations : use the - # values we just loaded - self.grid_options['do_analytics'] = False - return - except: - pass - else: - print("cannot join : other tasks are not yet finished\n") - print("Finished this job : exiting") - self.exit(code=1) - ############################################################ # Evolve systems - elif ( - self.grid_options["evolution_type"] - in self.grid_options["_evolution_type_options"] + self.set_time("start") + if ( + self.grid_options["evolution_type"] in self.grid_options["_evolution_type_options"] ): if self.grid_options["evolution_type"] in ["grid", "custom_generator"]: self._evolve_population_grid() @@ -946,32 +917,33 @@ class Population(analytics, self.grid_options["_evolution_type_options"] ) ) + self.set_time("end") - # finished! - self.grid_options["_end_time_evolution"] = time.time() - + ############################################################ # Log and print some information - dtsecs = self.time_elapsed() - string1 = "Population-{} finished!\nThe total probability is {:g}.".format( self.grid_options["_population_id"], self.grid_options["_probtot"] ) - string2 = "It took a total of {dtsecs} to run {starcount} systems on {ncores} cores\n = {totaldtsecs} of CPU time.\nMaximum memory use {memuse:.3f} MB".format( - dtsecs=timedelta(dtsecs), + string2 = "It took a total of {dtsecs} to run {starcount} systems on {ncores} cores\n = {CPUtime} of CPU time.\nMaximum memory use {memuse:.3f} MB".format( + dtsecs=timedelta(self.grid_options["_time_elapsed"]), starcount=self.grid_options["_count"], # not _total_count! we may have ended the run early... ncores=self.grid_options["num_processes"], - totaldtsecs=timedelta(dtsecs * self.grid_options["num_processes"]), + CPUtime=timedelta(self.CPU_time()), memuse=sum(self.shared_memory["max_memory_use_per_thread"]), ) - + ############################################################ # add warning about a grid that was killed + ############################################################ if self.was_killed(): string2 += "\n>>> Grid was killed <<<" self.set_status("killed") self.verbose_print(self._boxed(string1, string2), self.grid_options["verbosity"], 0) + ############################################################ + # handle errors + ############################################################ if self.grid_options["_errors_found"]: # Some information afterwards self.verbose_print( @@ -1008,6 +980,8 @@ class Population(analytics, 0, ) + return + def _system_queue_filler(self, job_queue, num_processes): """ Function that is responsible for keeping the queue filled. @@ -2013,9 +1987,6 @@ class Population(analytics, self.grid_options[ "_probtot" ] = 0 # To make sure that the values are reset. TODO: fix this in a cleaner way - self.grid_options[ - "_start_time_evolution" - ] = time.time() # Setting start time of grid # # Making sure the loaded grid code isn't lingering in the main PID # self._generate_grid_code(dry_run=False) @@ -2061,9 +2032,6 @@ class Population(analytics, self.grid_options[ "_probtot" ] = 0 # To make sure that the values are reset. TODO: fix this in a cleaner way - self.grid_options[ - "_start_time_evolution" - ] = time.time() # Setting start time of grid # # TODO: fix this function @@ -2074,9 +2042,6 @@ class Population(analytics, self.grid_options[ "_probtot" ] = 0 # To make sure that the values are reset. TODO: fix this in a cleaner way - self.grid_options[ - "_start_time_evolution" - ] = time.time() # Setting start time of grid def _cleanup(self): """ @@ -2253,7 +2218,7 @@ class Population(analytics, if binary_c_output: if (binary_c_output.splitlines()[0].startswith("SYSTEM_ERROR")) or ( - binary_c_output.splitlines()[-1].startswith("SYSTEM_ERROR") + binary_c_output.splitlines()[-1].startswith("SYSTEM_ERROR") ): self.verbose_print( "FAILING SYSTEM FOUND", @@ -2276,8 +2241,8 @@ class Population(analytics, ) if ( - not error_code - in self.grid_options["_failed_systems_error_codes"] + not error_code + in self.grid_options["_failed_systems_error_codes"] ): self.grid_options["_failed_systems_error_codes"].append( error_code @@ -2291,8 +2256,8 @@ class Population(analytics, # Check if we have exceeded the number of errors if ( - self.grid_options["_failed_count"] - > self.grid_options["failed_systems_threshold"] + self.grid_options["_failed_count"] + > self.grid_options["failed_systems_threshold"] ): if not self.grid_options["_errors_exceeded"]: self.verbose_print( @@ -2328,3 +2293,37 @@ class Population(analytics, self.grid_options["verbosity"], 3, ) + + def join_previous(self): + """ + Function to join previously generated datasets. + """ + # check that our job has finished + status = self.get_slurm_status() + + if status != "finished": + # job did not finish : save a snapshot + print("This job did not finish (status is {status}) : cannot join".format(status=status)) + else: + # our job has finished + joinfiles = self.joinfiles() + joiningfile = self.slurmpath('joining') + if self.can_join(joinfiles,joiningfile): + # join object files + try: + pathlib.Path(joiningfile).touch(exist_ok=False) + print("can join : all tasks are finished") + try: + self.join_from_files(self,joinfiles) + except Exception as e: + print("Join gave exception",e) + # disable analytics calculations : use the + # values we just loaded + self.grid_options['do_analytics'] = False + return + except: + pass + else: + print("cannot join : other tasks are not yet finished\n") + print("Finished this job : exiting") + self.exit(code=1) diff --git a/binarycpython/utils/metadata.py b/binarycpython/utils/metadata.py index afea81737..45e6cbf7c 100644 --- a/binarycpython/utils/metadata.py +++ b/binarycpython/utils/metadata.py @@ -101,7 +101,6 @@ class metadata(): self.grid_options[x] = combined_output_dict[x] self.grid_options["_failed_systems_error_codes"] = list(set(combined_output_dict["_failed_systems_error_codes"])) - def _metadata_keylist(self): return ["_failed_count", "_failed_prob", diff --git a/binarycpython/utils/version.py b/binarycpython/utils/version.py index 52f63979a..d7de3375e 100644 --- a/binarycpython/utils/version.py +++ b/binarycpython/utils/version.py @@ -120,7 +120,7 @@ class version(): # Isotopes: # Split off isotopes = {el for el in cleaned if el.startswith("Isotope ")} - cleaned = cleaned - isotopes + cleaned -= isotopes isotope_dict = {} for el in isotopes: @@ -157,7 +157,7 @@ class version(): # Arg pairs: # Split off argpairs = set([el for el in cleaned if el.startswith("ArgPair")]) - cleaned = cleaned - argpairs + cleaned -= argpairs argpair_dict = {} for el in sorted(argpairs): @@ -174,7 +174,7 @@ class version(): # ensembles: # Split off ensembles = {el for el in cleaned if el.startswith("Ensemble")} - cleaned = cleaned - ensembles + cleaned -= ensembles ensemble_dict = {} ensemble_filter_dict = {} @@ -197,7 +197,7 @@ class version(): # macros: # Split off macros = {el for el in cleaned if el.startswith("macroxyz")} - cleaned = cleaned - macros + cleaned -= macros param_type_dict = { "STRING": str, @@ -256,7 +256,7 @@ class version(): # Elements: # Split off: elements = {el for el in cleaned if el.startswith("Element")} - cleaned = cleaned - elements + cleaned -= elements # Fill dict: elements_dict = {} @@ -285,7 +285,7 @@ class version(): # dt_limits: # split off dt_limits = {el for el in cleaned if el.startswith("DTlimit")} - cleaned = cleaned - dt_limits + cleaned -= dt_limits # Fill dict dt_limits_dict = {} @@ -364,32 +364,32 @@ class version(): misc_dict["git_revision"] = ( git_revision[0].split("git revision ")[-1].replace('"', "") ) - cleaned = cleaned - set(git_revision) + cleaned -= set(git_revision) # filter out git url git_url = [el for el in cleaned if el.startswith("git URL")] misc_dict["git_url"] = git_url[0].split("git URL ")[-1].replace('"', "") - cleaned = cleaned - set(git_url) + cleaned -= set(git_url) # filter out version version = [el for el in cleaned if el.startswith("Version")] misc_dict["version"] = str(version[0].split("Version ")[-1]) - cleaned = cleaned - set(version) + cleaned -= set(version) git_branch = [el for el in cleaned if el.startswith("git branch")] misc_dict["git_branch"] = git_branch[0].split("git branch ")[-1].replace('"', "") - cleaned = cleaned - set(git_branch) + cleaned -= set(git_branch) build = [el for el in cleaned if el.startswith("Build")] misc_dict["build"] = build[0].split("Build: ")[-1].replace('"', "") - cleaned = cleaned - set(build) + cleaned -= set(build) email = [el for el in cleaned if el.startswith("Email")] misc_dict["email"] = email[0].split("Email ")[-1].split(",") - cleaned = cleaned - set(email) + cleaned -= set(email) other_items = set([el for el in cleaned if " is " in el]) - cleaned = cleaned - other_items + cleaned -= other_items for el in other_items: split = el.split(" is ") -- GitLab