Skip to content
Snippets Groups Projects
Commit efd60b25 authored by David Hendriks's avatar David Hendriks
Browse files

Updated code in grid, fixed bugs, and commented out some code that isn't working yet

parent 581cd002
No related branches found
No related tags found
No related merge requests found
...@@ -19,8 +19,10 @@ def autogen_C_logging_code(logging_dict: dict, verbose: int = 0) -> Optional[str ...@@ -19,8 +19,10 @@ def autogen_C_logging_code(logging_dict: dict, verbose: int = 0) -> Optional[str
Input is a dictionary where the key is the header of that logging line Input is a dictionary where the key is the header of that logging line
and items which are lists of parameters that will be put in that logging line and items which are lists of parameters that will be put in that logging line
The list elements are all appended to 'stardata->' in the autogenerated code.
Example: Example:
input dictionary should look like this:: Input dictionary should look like this::
{'MY_STELLAR_DATA': {'MY_STELLAR_DATA':
[ [
......
This diff is collapsed.
...@@ -129,43 +129,45 @@ grid_options_defaults_dict = { ...@@ -129,43 +129,45 @@ grid_options_defaults_dict = {
# Slurm stuff # Slurm stuff
######################################## ########################################
"slurm": 0, # dont use the slurm by default. 1 = use slurm "slurm": 0, # dont use the slurm by default. 1 = use slurm
"slurm_ntasks": 1, # CPUs required per array job: usually only need this # "slurm_ntasks": 1, # CPUs required per array job: usually only need this
"slurm_command": "", # Command that slurm runs (e.g. evolve or join_datafiles) # "slurm_command": "", # Command that slurm runs (e.g. evolve or join_datafiles)
"slurm_dir": "", # working directory containing scripts output logs etc. # "slurm_dir": "", # working directory containing scripts output logs etc.
"slurm_njobs": 0, # number of scripts; set to 0 as default # "slurm_njobs": 0, # number of scripts; set to 0 as default
"slurm_jobid": "", # slurm job id (%A) # "slurm_jobid": "", # slurm job id (%A)
"slurm_memory": 512, # in MB, the memory use of the job # "slurm_memory": 512, # in MB, the memory use of the job
"slurm_warn_max_memory": 1024, # in MB : warn if mem req. > this # "slurm_warn_max_memory": 1024, # in MB : warn if mem req. > this
"slurm_use_all_node_CPUs": 0, # 1 = use all of a node's CPUs. 0 = use a given amount of CPUs # "slurm_use_all_node_CPUs": 0, # 1 = use all of a node's CPUs. 0 = use a given amount of CPUs
"slurm_postpone_join": 0, # if 1 do not join on slurm, join elsewhere. want to do it off the slurm grid (e.g. with more RAM) # "slurm_postpone_join": 0, # if 1 do not join on slurm, join elsewhere. want to do it off the slurm grid (e.g. with more RAM)
"slurm_jobarrayindex": "", # slurm job array index (%a) # "slurm_jobarrayindex": "", # slurm job array index (%a)
"slurm_jobname": "binary_grid", # default # "slurm_jobname": "binary_grid", # default
"slurm_partition": None, # "slurm_partition": None,
"slurm_time": 0, # total time. 0 = infinite time # "slurm_time": 0, # total time. 0 = infinite time
"slurm_postpone_sbatch": 0, # if 1: don't submit, just make the script # "slurm_postpone_sbatch": 0, # if 1: don't submit, just make the script
"slurm_array": None, # override for --array, useful for rerunning jobs # "slurm_array": None, # override for --array, useful for rerunning jobs
"slurm_use_all_node_CPUs": 0, # if given nodes, set to 1 # "slurm_use_all_node_CPUs": 0, # if given nodes, set to 1
# if given CPUs, set to 0 # # if given CPUs, set to 0
# you will want to use this if your Slurm SelectType is e.g. linear # # you will want to use this if your Slurm SelectType is e.g. linear
# which means it allocates all the CPUs in a node to the job # # which means it allocates all the CPUs in a node to the job
"slurm_control_CPUs": 0, # if so, leave this many for Pythons control (0) # "slurm_control_CPUs": 0, # if so, leave this many for Pythons control (0)
"slurm_array": None, # override for --array, useful for rerunning jobs # "slurm_array": None, # override for --array, useful for rerunning jobs
"slurm_partition": None, # MUST be defined # "slurm_partition": None, # MUST be defined
"slurm_extra_settings": {}, # Place to put extra configuration for the SLURM batch file. The key and value of the dict will become the key and value of the line in te slurm batch file. Will be put in after all the other settings (and before the command). Take care not to overwrite something without really meaning to do so. # "slurm_extra_settings": {}, # Place to put extra configuration for the SLURM batch file. The key and value of the dict will become the key and value of the line in te slurm batch file. Will be put in after all the other settings (and before the command). Take care not to overwrite something without really meaning to do so.
######################################## ########################################
# Condor stuff # Condor stuff
######################################## ########################################
"condor": 0, # 1 to use condor, 0 otherwise "condor": 0, # 1 to use condor, 0 otherwise
"condor_command": "", # condor command e.g. "evolve", "join" # "condor_command": "", # condor command e.g. "evolve", "join"
"condor_dir": "", # working directory containing e.g. scripts, output, logs (e.g. should be NFS available to all) # "condor_dir": "", # working directory containing e.g. scripts, output, logs (e.g. should be NFS available to all)
"condor_njobs": "", # number of scripts/jobs that CONDOR will run in total # "condor_njobs": "", # number of scripts/jobs that CONDOR will run in total
"condor_jobid": "", # condor job id # "condor_jobid": "", # condor job id
"condor_postpone_join": 0, # if 1, data is not joined, e.g. if you want to do it off the condor grid (e.g. with more RAM) # "condor_postpone_join": 0, # if 1, data is not joined, e.g. if you want to do it off the condor grid (e.g. with more RAM)
# "condor_join_machine": None, # if defined then this is the machine on which the join command should be launched (must be sshable and not postponed) # # "condor_join_machine": None, # if defined then this is the machine on which the join command should be launched (must be sshable and not postponed)
"condor_join_pwd": "", # directory the join should be in (defaults to $ENV{PWD} if undef) # "condor_join_pwd": "", # directory the join should be in (defaults to $ENV{PWD} if undef)
"condor_memory": 1024, # in MB, the memory use (ImageSize) of the job # "condor_memory": 1024, # in MB, the memory use (ImageSize) of the job
"condor_universe": "vanilla", # usually vanilla universe # "condor_universe": "vanilla", # usually vanilla universe
"condor_extra_settings": {}, # Place to put extra configuration for the CONDOR submit file. The key and value of the dict will become the key and value of the line in te slurm batch file. Will be put in after all the other settings (and before the command). Take care not to overwrite something without really meaning to do so. # "condor_extra_settings": {}, # Place to put extra configuration for the CONDOR submit file. The key and value of the dict will become the key and value of the line in te slurm batch file. Will be put in after all the other settings (and before the command). Take care not to overwrite something without really meaning to do so.
# snapshots and checkpoints # snapshots and checkpoints
# condor_snapshot_on_kill=>0, # if 1 snapshot on SIGKILL before exit # condor_snapshot_on_kill=>0, # if 1 snapshot on SIGKILL before exit
# condor_load_from_snapshot=>0, # if 1 check for snapshot .sv file and load it if found # condor_load_from_snapshot=>0, # if 1 check for snapshot .sv file and load it if found
...@@ -456,7 +458,7 @@ grid_options_descriptions = { ...@@ -456,7 +458,7 @@ grid_options_descriptions = {
"_store_memaddr": "Memory adress of the store object for binary_c.", "_store_memaddr": "Memory adress of the store object for binary_c.",
"failed_systems_threshold": "Variable storing the maximum amount of systems that are allowed to fail before logging their commandline arguments to failed_systems log files", "failed_systems_threshold": "Variable storing the maximum amount of systems that are allowed to fail before logging their commandline arguments to failed_systems log files",
"parse_function": "Function that the user can provide to handle the output the binary_c. This function has to take the arguments (self, output). Its best not to return anything in this function, and just store stuff in the grid_options['results'] dictionary, or just output results to a file", "parse_function": "Function that the user can provide to handle the output the binary_c. This function has to take the arguments (self, output). Its best not to return anything in this function, and just store stuff in the grid_options['results'] dictionary, or just output results to a file",
"condor": "Int flag whether to use a condor type population evolution.", # TODO: describe this in more detail "condor": "Int flag whether to use a condor type population evolution. Not implemented yet.", # TODO: describe this in more detail
"slurm": "Int flag whether to use a slurm type population evolution.", # TODO: describe this in more detail "slurm": "Int flag whether to use a slurm type population evolution.", # TODO: describe this in more detail
"weight": "Weight factor for each system. The calculated probability is mulitplied by this. If the user wants each system to be repeated several times, then this variable should not be changed, rather change the _repeat variable instead, as that handles the reduction in probability per system. This is useful for systems that have a process with some random element in it.", # TODO: add more info here, regarding the evolution splitting. "weight": "Weight factor for each system. The calculated probability is mulitplied by this. If the user wants each system to be repeated several times, then this variable should not be changed, rather change the _repeat variable instead, as that handles the reduction in probability per system. This is useful for systems that have a process with some random element in it.", # TODO: add more info here, regarding the evolution splitting.
"repeat": "Factor of how many times a system should be repeated. Consider the evolution splitting binary_c argument for supernovae kick repeating.", # TODO: make sure this is used. "repeat": "Factor of how many times a system should be repeated. Consider the evolution splitting binary_c argument for supernovae kick repeating.", # TODO: make sure this is used.
......
""" # """
File containing functions for HPC computing, distributed tasks on clusters etc. # File containing functions for HPC computing, distributed tasks on clusters etc.
Functions that the slurm and condor subroutines of the population object use. # Functions that the slurm and condor subroutines of the population object use.
Mainly divided in 2 sections: Slurm and Condor # Mainly divided in 2 sections: Slurm and Condor
""" # """
import os # import os
import sys # import sys
import time # import time
import subprocess # import subprocess
from typing import Union # from typing import Union
import __main__ as main # import __main__ as main
def get_slurm_version() -> Union[str, None]:
    """
    Query the locally available Slurm installation for its version.

    Runs ``sinfo -V`` and returns the second whitespace-separated token of
    its output. Only tested with slurm v17+.

    Returns:
        slurm version, or None when the command could not be run or its
        output could not be parsed
    """

    try:
        completed = subprocess.run(
            ["sinfo", "-V"], stdout=subprocess.PIPE, check=True
        )
        tokens = completed.stdout.decode("utf-8").split()
        # The first token is skipped; the version is taken from the second one
        return tokens[1]
    except FileNotFoundError as err:
        # The `sinfo` executable could not be found
        print(err)
        print(err.args)
        print("Slurm is not installed or not loaded")
    except Exception as err:
        # Anything else (non-zero exit status, unexpected output, ...)
        print(err)
        print(err.args)
        print("Unknown error, contact me about this")

    return None
def get_condor_version() -> Union[str, None]:
    """
    Function that checks whether condor is installed and returns the version
    if it is installed, otherwise returns None.

    Runs ``condor_q --version`` and returns the second whitespace-separated
    token of its output.

    Result has to be condor v8 or higher

    Returns:
        condor version, or None when the command could not be run or its
        output could not be parsed
    """

    condor_version = None

    try:
        condor_version = (
            subprocess.run(
                ["condor_q", "--version"], stdout=subprocess.PIPE, check=True
            )
            .stdout.decode("utf-8")
            .split()
        )[1]
    except FileNotFoundError as err:
        # The `condor_q` executable could not be found. The message used to
        # say "Slurm" here (copy-paste from get_slurm_version).
        print("Condor is not installed or not loaded: ")
        print(err)
        print(err.args)
    except Exception as err:
        # Anything else (non-zero exit status, unexpected output, ...)
        print("Unknown error, contact me about this: ")
        print(err)
        print(err.args)

    return condor_version
def create_directories_hpc(working_dir: str) -> None:
    """
    Function to create a set of directories, given a root directory

    These directories will contain stuff for the HPC runs

    Args:
        working_dir: main working directory of the run. Under this directory all the dirs will be created

    Raises:
        ValueError: if working_dir is not an existing directory
    """

    # Check if working_dir exists
    if not os.path.isdir(working_dir):
        # Fill in the placeholder so the user can see which path was rejected
        message = "Error. Working directory {} does not exist! Aborting".format(
            working_dir
        )
        print(message)
        raise ValueError(message)

    directories_list = [
        "scripts",
        "stdout",
        "stderr",
        "results",
        "logs",
        "status",
        "joining",
    ]
    full_paths = [os.path.join(working_dir, subdir) for subdir in directories_list]

    # Make directories.
    for full_path in full_paths:
        os.makedirs(full_path, exist_ok=True)

    # Since the directories are probably made on some mount which has to go over NFS
    # we should explicitly check if they are created
    print("Checking if creating the directories has finished...")

    # Poll until every directory is visible. The previous loop condition was
    # inverted, so this check never actually ran.
    while not all(os.path.isdir(full_path) for full_path in full_paths):
        time.sleep(1)

    print("..Finished! Directories exist.")
def path_of_calling_script() -> str:
    """
    Return the ``__file__`` attribute of the ``__main__`` module, i.e. the
    script the user executes.

    TODO: fix this function. seems not to work properly.
    """

    calling_script = main.__file__
    return calling_script
def get_python_details() -> dict:
    """
    Collect some information about the running python interpreter.

    Returns:
        dictionary with the value of the VIRTUAL_ENV environment variable
        (``virtualenv``), the interpreter path (``executable``) and the
        interpreter version string (``version``).
    """

    return {
        "virtualenv": os.getenv("VIRTUAL_ENV"),
        "executable": sys.executable,
        "version": sys.version,
    }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment