diff --git a/binarycpython/utils/grid.py b/binarycpython/utils/grid.py index fcaa4bf66e9cb2d5af3f9f7a9ca64cc42c40c6b0..2618e57da5f27a1702bb40ac5b3d7ff434b40782 100644 --- a/binarycpython/utils/grid.py +++ b/binarycpython/utils/grid.py @@ -1092,19 +1092,12 @@ class Population( self.grid_options["verbosity"], 0, ) - # Some information afterwards - self.verbose_print( - "The full argline commands for {} these systems have been written to {}".format( - "ALL" - if not self.grid_options["_errors_exceeded"] - else "SOME (only the first ones, as there were too many to log all of them)", - os.path.join( - self.grid_options["tmp_dir"], - "failed_systems_{}_X.txt".format( - self.grid_options["_population_id"] - ), - ), - ), + # Some information on where we logged the systems + if self.grid_options['log_failed_systems'] == True and \ + self.grid_options['log_failed_systems_dir'] is not None: + + self.verbose_print( + f"The full failed arglines have been written to {self.grid_options['log_failed_systems_dir']}/process_{self.jobID()}.txt", self.grid_options["verbosity"], 0, ) @@ -1202,6 +1195,7 @@ class Population( 3, ) + self.grid_options["_queue_done"] = True # Send closing signal to workers. When they receive this they will terminate @@ -1379,7 +1373,7 @@ class Population( if self.grid_options["save_snapshots"] and self.grid_options["_killed"]: self.custom_options["save_snapshot"] = True - def _evolve_system_mp(self, full_system_dict): + def _evolve_system_mp(self, system_number, full_system_dict): """ Function that the multiprocessing evolution method calls to evolve a system @@ -1410,7 +1404,7 @@ class Population( ) # Check for errors - _ = self._check_binary_c_error(out, full_system_dict) + _ = self._check_binary_c_error(system_number, out, full_system_dict) # Have some user-defined function do stuff with the data. 
if self.grid_options["parse_function"]: @@ -1735,7 +1729,8 @@ class Population( if run_system: # Evolve the system - self._evolve_system_mp(full_system_dict) + self._evolve_system_mp(system_number, + full_system_dict) end_runtime_binary_c = time.time() @@ -2403,7 +2398,7 @@ class Population( return killed - def _check_binary_c_error(self, binary_c_output, system_dict): + def _check_binary_c_error(self, system_number, binary_c_output, system_dict): """ Function to check whether binary_c throws an error and handle accordingly. """ @@ -2423,6 +2418,22 @@ class Population( self.grid_options["_failed_count"] += 1 self.grid_options["_errors_found"] = True + try: + error_code = int( + binary_c_output.splitlines()[0] + .split("with error code")[-1] + .split(":")[0] + .strip() + ) + self.verbose_print(f"Have error code {error_code}", + self.grid_options["verbosity"], + 0,) + except ValueError: + self.verbose_print("Failed to extract error code", + self.grid_options["verbosity"], + 0,) + pass + # Try catching the error code and keep track of the unique ones. try: error_code = int( @@ -2436,21 +2447,55 @@ class Population( not error_code in self.grid_options["_failed_systems_error_codes"] ): + print(f"Caught error code {error_code}") self.grid_options["_failed_systems_error_codes"].append( error_code ) except ValueError: + error_code = None self.verbose_print( "Failed to extract the error-code", self.grid_options["verbosity"], 1, ) + # log failing args? 
+ if self.grid_options['log_failed_systems'] == True and \ + self.grid_options['log_failed_systems_dir'] is not None: + path = os.path.join( + self.grid_options['log_failed_systems_dir'] + ) + os.makedirs(path,exist_ok=True) + failed_systems_file = os.path.join( + self.grid_options["log_failed_systems_dir"], + "process_{}.txt".format(self.jobID()), + ) + if self.dir_ok(path): + with self.open( + failed_systems_file, + "a", # append + encoding="utf-8" + ) as f: + now = datetime.datetime.now() + binary_c_cmdline_string = \ + f"system {system_number} at " + \ + now.strftime("%d/%m/%Y %H:%M:%S\n") + \ + self._return_argline(system_dict) + "\n" + f.write(binary_c_cmdline_string) + f.close() + + # Check if we have exceeded the number of errors + print(f"Check failed count {self.grid_options['_failed_count']} vs max {self.grid_options['failed_systems_threshold']}") if ( self.grid_options["_failed_count"] > self.grid_options["failed_systems_threshold"] ): + + # stop evolving systems + self.grid_options['stop_queue'] = True + + # warn the user the first time we exceed failed_systems_threshold if not self.grid_options["_errors_exceeded"]: self.verbose_print( self._boxed( @@ -2464,21 +2509,7 @@ class Population( ) self.grid_options["_errors_exceeded"] = True - # If not, write the failing systems to files unique to each process - else: - # Write arg lines to file - argstring = self._return_argline(system_dict) - with self.open( - os.path.join( - self.grid_options["tmp_dir"], - "failed_systems", - "process_{}.txt".format(self.process_ID), - ), - "a+", - encoding="utf-8", - ) as f: - f.write(argstring + "\n") - f.close() + else: self.verbose_print( "binary_c output nothing - this is strange. 
If there is ensemble output being generated then this is fine.", diff --git a/binarycpython/utils/population_extensions/grid_options_defaults.py b/binarycpython/utils/population_extensions/grid_options_defaults.py index ac9da34ee1b11598f78f2804838f22136473e79d..23dcf325f8ba1079848a5e32109058134e8fbd82 100644 --- a/binarycpython/utils/population_extensions/grid_options_defaults.py +++ b/binarycpython/utils/population_extensions/grid_options_defaults.py @@ -163,6 +163,8 @@ class grid_options_defaults: "_failed_prob": 0, # Summed probability of failed systems "failed_systems_threshold": 20, # Maximum failed systems per process allowed to fail before the process stops logging the failing systems. "_failed_systems_error_codes": [], # List to store the unique error codes + "log_failed_systems" : False, # Flag to enable logging of failed systems... + "log_failed_systems_dir" : None, # log them to this dir "_population_id": 0, # Random id of this grid/population run, Unique code for the population. Should be set only once by the controller process. "_total_mass_run": 0, # To count the total mass that thread/process has ran "_total_probability_weighted_mass_run": 0, # To count the total mass * probability for each system that thread/process has ran