From ea373b68db046b260b552611cb205318f55bbd52 Mon Sep 17 00:00:00 2001
From: Izzard <ri0005@orca.eps.surrey.ac.uk>
Date: Thu, 18 Nov 2021 10:45:51 +0000
Subject: [PATCH] cleaned up HPC function names so they all start with HPC_

The joiningfile is now made by whichever job gets to this task first, so the error where it wasn't fully made (when jobs were slowly run from the HPC queue) should no longer happen unless it takes a long time to write that one short file. I guess we could put in a checkfile if required, but this should be *very* rare.
---
 binarycpython/utils/HPC.py                   | 179 ++++++++++++++-----
 binarycpython/utils/condor.py                |   2 +-
 binarycpython/utils/dataIO.py                |   6 +-
 binarycpython/utils/grid.py                  |  46 +++--
 binarycpython/utils/grid_options_defaults.py |   2 +-
 binarycpython/utils/gridcode.py              |   4 +-
 binarycpython/utils/metadata.py              |   2 +-
 binarycpython/utils/slurm.py                 |   2 +-
 8 files changed, 179 insertions(+), 64 deletions(-)

diff --git a/binarycpython/utils/HPC.py b/binarycpython/utils/HPC.py
index 83c41af8c..f91807df1 100644
--- a/binarycpython/utils/HPC.py
+++ b/binarycpython/utils/HPC.py
@@ -8,10 +8,11 @@
     other modules can use a single API rather than have to choose to use the Slurm or
     Condor API.
 """
-
+import datetime
 import glob
 import os
 import pathlib
+import time
 
 from binarycpython.utils.slurm import slurm
 from binarycpython.utils.condor import condor
@@ -22,6 +23,69 @@ class HPC(condor,slurm):
         # don't do anything: we just inherit from this class
         return
 
+    def HPC_njobs(self):
+        """
+        Function to return the number of jobs this HPC job will use, as an int.
+        """
+        if self.grid_options['slurm'] > 0:
+            n = self.grid_options['slurm_njobs']
+        elif self.grid_options['condor'] > 0:
+            n = self.grid_options['condor_njobs']
+        else:
+            n = None
+        return int(n)
+        
+    def HPC_make_joiningfile(self,
+                             id=None,
+                             dir=None,
+                             n=None,
+                             overwrite=False,
+                             error_on_overwrite=False):
+        """
+        Function to make the joiningfile that contains the filenames of results from each job. When all these exist, we can join.
+
+        Note: you normally don't need to set any of the option arguments.
+
+        Args:
+            dir : the HPC directory, or self.HPC_dir() if None (default=None).
+            n : the number of jobs, or self.HPC_njobs() if None (default=None).
+            id : the job ID number, or self.HPC_jobID_tuple()[0] if None (default=None).
+            overwrite : if True, overwrite an existing joiningfile (default=False)
+            error_on_overwrite : if True, and we try to overwrite, issue an error and exit (default=False)
+        
+        """
+
+        # defaults
+        if dir is None:
+            dir = self.HPC_dir()
+        if n is None:
+            n = self.HPC_njobs()
+        if id is None:
+            id = self.HPC_jobID_tuple()[0]
+
+        # make path and filename
+        prefix = os.path.join(dir,'results')
+        file = os.path.join(prefix,id + '.all')
+
+        # check the joiningfile doesn't exist
+        if not overwrite and os.path.isfile(file):
+            print("Cannot make joiningfile at {file} because it already exists.".format(file=file))
+            # perhaps cause an error if it does
+            if error_on_overwrite:
+                self.exit(code=1)
+            
+        # write the joiningfile
+        print("Making joiningfile at {file} with range 0 to {n}".format(
+            file=file,
+            n=n
+        ))
+        with open(file,"w",encoding="utf-8") as f:
+            for i in range(0,n):
+                f.write(os.path.join(prefix,
+                                     "{i}.gz\n".format(i=i)))
+            f.close()
+        return
+                    
     def HPC_joinfiles(self,joinlist=None):
         """
         Function to load in the joinlist to a list and return it.
@@ -45,8 +109,8 @@ class HPC(condor,slurm):
 
             if self.grid_options['HPC_prepend_dir_to_joinlist'] is True:
                 list = [os.path.join(prefix,file) for file in list]
-        except:
-            print("Failed to open joinlist at {list}".format(list=joinlist))
+        except Exception as e:
+            print("Failed to open joinlist at {list} : {e}".format(list=joinlist,e=e))
             self.exit(code=1)
         return list
 
@@ -65,10 +129,11 @@ class HPC(condor,slurm):
         Check the joinfiles to make sure they all exist
         and their .saved equivalents also exist
         """
+        print("HPC check if we can join at {}",self.now())
         if self.grid_options['HPC_force_join'] == 0 and \
            os.path.exists(joiningfile):
             if vb:
-                print("cannot join: joiningfile exists at {}".format(joiningfile))
+                print("cannot join : joiningfile exists at {} (check 1)".format(joiningfile))
             return False
         elif vb:
             print("joiningfile (at {}) does not exist".format(joiningfile))
@@ -77,14 +142,14 @@ class HPC(condor,slurm):
                 print("check for {}".format(file))
             if os.path.exists(file) == False:
                 if vb:
-                    print("cannot join: {} does not exist".format(file))
+                    print("cannot join : file \"{}\" does not exist".format(file))
                 return False
             savedfile = file + '.saved'
             if vb:
                 print("check for {}".format(savedfile))
             if os.path.exists(savedfile) == False:
                 if vb:
-                    print("cannot join: {} does not exist".format(savedfile))
+                    print("cannot join : savedfile \"{}\" does not exist".format(savedfile))
                 return False
 
             # found both files
@@ -97,17 +162,17 @@ class HPC(condor,slurm):
             x = True
         elif os.path.exists(joiningfile):
             if vb:
-                print("cannot join: joiningfile exists at {}".format(joiningfile))
+                print("cannot join: joiningfile exists at {} (check 2)".format(joiningfile))
             x = False
         elif vb:
-            print("joiningfile does not exist : can join")
+            print("joiningfile at {} does not exist : can join".format(joiningfile))
             x = True
 
         if vb:
             print("returning {} from HPC_can_join()".format(x))
         return x
 
-    def HPCjob(self):
+    def HPC_job(self):
         """
         Function to return True if we're running an HPC (Slurm or Condor) job, False otherwise.
         """
@@ -117,8 +182,21 @@ class HPC(condor,slurm):
         else:
             x = False
         return x
-
-    def HPCjobtype(self):
+    
+    def HPC_job_task(self):
+        """
+        Function to return the HPC task number, which is 1 when setting 
+        up and running the scripts, 2 when joining data.
+        """
+        if self.grid_options['slurm'] > 0:
+            x = self.grid_options['slurm']
+        elif self.grid_options['condor'] > 0:
+            x = self.grid_options['condor']
+        else:
+            x = 0
+        return x
+    
+    def HPC_job_type(self):
         """
         Function to return a string telling us the type of an HPC job, i.e.
         "slurm", "condor" or "None".
@@ -131,7 +209,7 @@ class HPC(condor,slurm):
             type = "None"
         return type
 
-    def HPCjobID(self):
+    def HPC_jobID(self):
         """
         Function to return an HPC (Slurm or Condor) job id in the form x.y. Returns None if not an HPC job.
         """
@@ -144,16 +222,22 @@ class HPC(condor,slurm):
             id = None
         return id
 
-    def HPCjobIDtuple(self):
+    def HPC_jobID_tuple(self):
         """
-        Return the job ID as a tuple, (x,y), or (None,None) on failure
+        Return the job ID as a tuple (x,y), or (None,None) on failure
         """
-        id = self.HPCjobID()
-        if id:
-            t = tuple(id.split('.'))
-        else:
-            t = (None,None)
-        return t
+        id = self.HPC_jobID()
+
+        if id.startswith('None'):
+            t = [None,None]
+        elif self.HPC_job():
+            print("JOBID",id)
+            t = id.split('.')
+            if not t[0]:
+                t[0] = None
+            if not t[1]:
+                t[1] = None
+        return tuple(t)
 
     def HPC_set_status(self,string):
         """
@@ -202,18 +286,33 @@ class HPC(condor,slurm):
             dirs = []
         return dirs
 
-    def HPCgrid(self):
+          
+    def HPC_grid(self,makejoiningfile=True):
         """
         Function to call the appropriate HPC grid function
-        (e.g. Slurm or Condor) and return what it returns
+        (e.g. Slurm or Condor) and return what it returns.
+
+        Args:
+            makejoiningfile : if True, and we're the first job with self.HPC_job_task() == 2, we build the joiningfile. (default=True) This option exists in case you don't want to overwrite an existing joiningfile, or want to build it in another way (e.g. in the HPC scripts).
         """
+        id = self.HPC_jobID_tuple()[0]
+
+
+        if makejoiningfile and \
+           self.HPC_job_task() == 2 and \
+           id is not None:
+            print("Making joiningfile from HPC_grid().")
+            self.HPC_make_joiningfile()
+        
         if self.grid_options['slurm'] > 0:
-            return self.slurm_grid()
+            x = self.slurm_grid()
         elif self.grid_options['condor'] > 0:
-            return self.condor_grid()
+            x = self.condor_grid()
         else:
-            return None # should not happen
+            x =None # should not happen
 
+        return x
+            
     def HPC_check_requirements(self):
         """
         Function to check HPC option requirements have been met. Returns a tuple: (True,"") if all is ok, (False,<warning string>) otherwise.
@@ -265,10 +364,10 @@ class HPC(condor,slurm):
         """
         Set grid_options['restore_from_snapshot_file'] so that we restore data from existing
         an HPC run if self.grid_options[type+'_restart_dir'], where type is "slurm" or "condor",
-        is provided, otherwise do nothing. This only works if grid_options[type] == 2, which is
+        is provided, otherwise do nothing. This only works if grid_options[type] == self.HPC_job_task() == 2, which is
         the run-grid stage of the process.
         """
-        type = self.HPCjobtype()
+        type = self.HPC_job_type()
         if type is None:
             return
         key = type + '_restart_dir'
@@ -279,10 +378,10 @@ class HPC(condor,slurm):
         if dir is None:
             return
         # get HPC job index
-        index = self.HPCjobIDtuple()[1]
+        index = self.HPC_jobID_tuple()[1]
         if index is None:
             return
-        if self.grid_options[type] == 2:
+        if self.HPC_job_task() == 2: # (same as) self.grid_options[type] == 2:
             old_id = self.HPC_id_from_dir(dir)
             print("Restart from dir {dir} which was has (old) ID {old_id}, we are job index {index}".format(
                 dir=dir,
@@ -292,11 +391,6 @@ class HPC(condor,slurm):
 
             # check status: if "finished", we don't have to do anything
             status = self.HPC_get_status(dir=dir)
-            #file = os.path.join(dir,
-            #                    'status',
-            #                    "{id}.{index}".format(id=old_id,
-            #                                          index=index))
-            #status = open(file,encoding='utf-8').read()
 
             if status == 'finished':
                 print("Status is finished, cannot and do not need to restart.")
@@ -334,15 +428,16 @@ class HPC(condor,slurm):
         else:
             # our job has finished
             joinfiles = self.HPC_joinfiles()
-            joiningfile = self.HPCpath('joining')
-            print("Joinfiles: ",joinfiles)
-            print("Joingingfiles: ",joiningfile)
+            joiningfile = self.HPC_path('joining')
+            print("Joinfiles : ",joinfiles)
+            print("Joingingfiles : ",joiningfile)
             if self.HPC_can_join(joinfiles,joiningfile,vb=True):
                 # join object files
                 print("We can join")
                 try:
                     # touch joiningfile
                     if self.grid_options['HPC_force_join'] == 0:
+                        print("Making joiningfile at {}",joiningfile)
                         pathlib.Path(joiningfile).touch(exist_ok=False)
                     try:
                         print("Calling HPC_join_from_files()")
@@ -353,15 +448,15 @@ class HPC(condor,slurm):
                         # values we just loaded
                     self.grid_options['do_analytics'] = False
                     return
-                except:
-                    print("pass")
+                except Exception as e:
+                    print("pass {}",e)
                     pass
             else:
                 print("cannot join : other tasks are not yet finished\n")
                 print("Finished this job : exiting")
         self.exit(code=1)
 
-    def HPCpath(self,path):
+    def HPC_path(self,path):
         """
         Function to file the filename of this HPC job's file at path.
         """
@@ -377,10 +472,10 @@ class HPC(condor,slurm):
         """
         Function to return an HPC job's snapshot filename.
         """
-        if self.HPCjob():
+        if self.HPC_job():
             file = os.path.join(self.HPC_dir,
                                 'snapshots',
-                                self.HPCjobID() + '.gz')
+                                self.HPC_jobID() + '.gz')
         else:
             file = None
         return file
diff --git a/binarycpython/utils/condor.py b/binarycpython/utils/condor.py
index 4f1d6eb5b..85e621210 100644
--- a/binarycpython/utils/condor.py
+++ b/binarycpython/utils/condor.py
@@ -283,7 +283,7 @@ export BINARY_C_PYTHON_ORIGINAL_SUBMISSION_TIME=`{date}`
 echo \"running\" > {condor_dir}/status/$ClusterID.$ProcessID
 
 # make list of files which is checked for joining
-echo {condor_dir}/results/$ClusterID.$Process.gz >> {condor_dir}/results/$ClusterID.all
+# echo {condor_dir}/results/$ClusterID.$Process.gz >> {condor_dir}/results/$ClusterID.all
 
 # run grid of stars and, if this returns 0, set status to finished
 {grid_command} condor=2 evolution_type=grid condor_ClusterID=$ClusterID condor_Process=$Process save_population_object={condor_dir}/results/$ClusterID.$Process.gz && echo -n \"finished\" > {condor_dir}/status/$ClusterID.$ProcessID && echo """.format(
diff --git a/binarycpython/utils/dataIO.py b/binarycpython/utils/dataIO.py
index 88396d84f..6bf7f703f 100644
--- a/binarycpython/utils/dataIO.py
+++ b/binarycpython/utils/dataIO.py
@@ -92,7 +92,7 @@ class dataIO():
                 object.grid_ensemble_results["metadata"] = {}
 
             # add datestamp
-            object.grid_ensemble_results["metadata"]['save_population_time'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
+            object.grid_ensemble_results["metadata"]['save_population_time'] = self.now() 
 
             # add extra metadata
             object.add_system_metadata()
@@ -230,7 +230,7 @@ class dataIO():
         """
         Automatically choose the snapshot filename.
         """
-        if self.HPCjob():
+        if self.HPC_job():
             return self.HPC_snapshot_filename()
         else:
             file = os.path.join(self.grid_options['tmp_dir'],
@@ -485,5 +485,5 @@ class dataIO():
                 f.close()
 
         # custom logging functions for HPC jobs
-        if self.HPCjob():
+        if self.HPC_job():
             self.HPC_set_status(string)
diff --git a/binarycpython/utils/grid.py b/binarycpython/utils/grid.py
index 5b6b1a6bb..bcd8861a7 100644
--- a/binarycpython/utils/grid.py
+++ b/binarycpython/utils/grid.py
@@ -212,10 +212,10 @@ class Population(analytics,
         Function to return the job ID number of this process
 
         Normal processes return their process ID (PID)
-        HPC processes return whatever HPCjobID() gives.
+        HPC processes return whatever HPC_jobID() gives.
         """
-        if self.HPCjob():
-            jobID = self.HPCjobID()
+        if self.HPC_job():
+            jobID = self.HPC_jobID()
         else:
             jobID = "{}".format(self.process_ID)
         return jobID
@@ -395,12 +395,13 @@ class Population(analytics,
                                 try:
                                     value = type(old_value)(value)
                                     self.verbose_print("Success!", self.grid_options["verbosity"], 2)
-                                except:
-                                    print("Failed to convert {param} value with type {type}: old_value is '{old}', new value is '{new}'".format(
+                                except Exception as e:
+                                    print("Failed to convert {param} value with type {type}: old_value is '{old}', new value is '{new}', {e}".format(
                                         param=parameter,
                                         old=old_value,
                                         type=type(old_value),
-                                        new=split[1]
+                                        new=split[1],
+                                        e=e
                                     ))
                                     self.exit(code=1)
                                     
@@ -581,9 +582,7 @@ class Population(analytics,
             if self.custom_options.get("data_dir", None):
                 if not self.custom_options.get("base_filename", None):
                     base_name = "simulation_{}".format(
-                        datetime.datetime.strftime(
-                            datetime.datetime.now(), "%Y%m%d_%H%M%S"
-                        )
+                        self.now(style="nospace")
                     )
                 else:
                     base_name = os.path.splitext(self.custom_options["base_filename"])[
@@ -721,7 +720,7 @@ class Population(analytics,
             self.exit(code=1)
 
         # check any HPC requirements are met
-        if self.HPCjob() and not self.HPC_check_requirements()[0]:
+        if self.HPC_job() and not self.HPC_check_requirements()[0]:
             print(self.HPC_check_requirements()[1])
             self.exit(code=1)
 
@@ -790,10 +789,10 @@ class Population(analytics,
         # Just to make sure we don't have stuff from a previous run hanging around
         self._pre_run_setup()
 
-        if self.HPCjob():
+        if self.HPC_job():
             # run HPC grid: if this returns True, then exit immediately
             self.grid_options["symlink_latest_gridcode"] = False
-            if self.HPCgrid():
+            if self.HPC_grid():
                 self.exit(code=0)
 
         if self.grid_options['evolution_type'] == 'join':
@@ -824,7 +823,7 @@ class Population(analytics,
 
         # if we're running an HPC grid, exit here
         # unless we're joining
-        if self.HPCgrid() and \
+        if self.HPC_grid() and \
            self.grid_options['evolution_type'] != 'join':
             self.exit()
 
@@ -2250,3 +2249,24 @@ class Population(analytics,
                 self.grid_options["verbosity"],
                 3,
             )
+            
+    def now(self,style=None):
+        """
+        convenience function to return a string of the current time, 
+        using the format "%m/%d/%Y %H:%M:%S"
+
+        Args: 
+            style : if "nospace" then return the date/time with the format 
+            "%Y%m%d_%H%M%S"
+        """
+        if style == 'nospace':
+            return datetime.datetime.strftime(
+                datetime.datetime.now(),
+                "%Y%m%d_%H%M%S"
+            )
+        else:
+            return datetime.datetime.strftime(
+                datetime.datetime.now(),
+                "%m/%d/%Y %H:%M:%S"
+            )
+        
diff --git a/binarycpython/utils/grid_options_defaults.py b/binarycpython/utils/grid_options_defaults.py
index e1cc9a492..cdca85074 100644
--- a/binarycpython/utils/grid_options_defaults.py
+++ b/binarycpython/utils/grid_options_defaults.py
@@ -65,7 +65,7 @@ class grid_options_defaults():
             "original_command_line":  os.getenv('BINARY_C_PYTHON_ORIGINAL_CMD_LINE'),
             "working_diretory": os.getcwd(),
             "original_working_diretory": os.getenv('BINARY_C_PYTHON_ORIGINAL_WD'),
-            "start_time": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
+            "start_time": self.now(),
             "original_submission_time" : os.getenv('BINARY_C_PYTHON_ORIGINAL_SUBMISSION_TIME'),
 
             ##########################
diff --git a/binarycpython/utils/gridcode.py b/binarycpython/utils/gridcode.py
index 1ec3fe9f8..03c9e9ff8 100644
--- a/binarycpython/utils/gridcode.py
+++ b/binarycpython/utils/gridcode.py
@@ -26,7 +26,7 @@ class gridcode():
         """
         Returns a filename for the gridcode.
         """
-        if self.HPCjob():
+        if self.HPC_job():
             filename = os.path.join(
                 self.grid_options["tmp_dir"],
                 "binary_c_grid_{population_id}.{jobid}.py".format(
@@ -614,7 +614,7 @@ class gridcode():
             file.write(self.code_string)
 
         # perhaps create symlink
-        if not self.HPCjob() and self.grid_options["symlink_latest_gridcode"]:
+        if not self.HPC_job() and self.grid_options["symlink_latest_gridcode"]:
             global _count
             symlink = os.path.join(
                 self.grid_options["tmp_dir"], "binary_c_grid-latest" + str(_count)
diff --git a/binarycpython/utils/metadata.py b/binarycpython/utils/metadata.py
index 45e6cbf7c..e35bdb995 100644
--- a/binarycpython/utils/metadata.py
+++ b/binarycpython/utils/metadata.py
@@ -28,7 +28,7 @@ class metadata():
             self.grid_ensemble_results["metadata"] = {}
 
         # add date
-        self.grid_ensemble_results["metadata"]['date'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
+        self.grid_ensemble_results["metadata"]['date'] = self.now()
 
         # add platform and build information
         try:
diff --git a/binarycpython/utils/slurm.py b/binarycpython/utils/slurm.py
index cd03e4166..8f2068393 100644
--- a/binarycpython/utils/slurm.py
+++ b/binarycpython/utils/slurm.py
@@ -299,7 +299,7 @@ export BINARY_C_PYTHON_ORIGINAL_SUBMISSION_TIME=`{date}`
 echo \"running\" > {slurm_dir}/status/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID
 
 # make list of files which is checked for joining
-echo {slurm_dir}/results/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID.gz >> {slurm_dir}/results/$SLURM_ARRAY_JOB_ID.all
+# echo {slurm_dir}/results/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID.gz >> {slurm_dir}/results/$SLURM_ARRAY_JOB_ID.all
 
 # run grid of stars and, if this returns 0, set status to finished
 {grid_command} slurm=2 evolution_type=grid slurm_jobid=$SLURM_ARRAY_JOB_ID slurm_jobarrayindex=$SLURM_ARRAY_TASK_ID save_population_object={slurm_dir}/results/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID.gz && echo -n \"finished\" > {slurm_dir}/status/$SLURM_ARRAY_JOB_ID.$SLURM_ARRAY_TASK_ID && echo """.format(
-- 
GitLab