package binary_grid::slurm;

$|=1;
use 5.16.0;
use strict;
use IO::Handle;
use common::sense;
use feature qw(say state);

################################################################
######### >>> slurm extensions for binary_grid <<< ############
################################################################
#
# Written by Robert Izzard 
# (r.izzard@surrey.ac.uk or via gmail: rob.izzard) 
# (c) 2000-2018 and onwards into the 21st century.

# version number should match the binary_grid version
our $VERSION = '2.1.5';

############################################################
#
# Changelog
#
# 2.0 Updated to use binary_grid2
# 2.0pre29 Updated to use slurm: note slurm ONLY works with binary_grid2
# 2.1.5 "say state" features added for perl5.31.6
#
# TODO:
#
# Snapshots are not yet allowed. The condor module does allow them
# and could probably be reasonably simply ported over.
#
############################################################

$|=1; # autoflush please (for log output)

# Standard modules:
# if you don't have them then check http://www.cpan.org/ or, better, 
# use cpanm or packages to install them.
use Carp qw(confess);
use IO::File;
use Time::HiRes qw(sleep);
use File::Spec;
use File::Basename;
use rob_misc qw(mkdirhier slurp);
use FindBin qw($Bin $Script $RealBin);
use lib "$Bin/../lib";
use IO::Interactive;

# module setup
# Inherit from Exporter so that packages which "use" this module get its
# import() method.
require Exporter;
our @ISA = qw(Exporter);

# Functions to export
# NOTE(review): in the code visible in this file, slurm_grid is installed as
# binary_grid2::slurm_grid rather than defined in this package, and no
# slurm_job_hook definition is visible at all - confirm that these exported
# names resolve to something callable for importers.
my @funcs= qw( &slurm_grid &slurm_job_hook);

# The ':all' tag, the on-request list (@EXPORT_OK) and the default list
# (@EXPORT) all contain the same names taken from @funcs.
our %EXPORT_TAGS = ( 'all' => [ @funcs ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = @funcs;
# These package variables are already declared with "our" above.
use vars qw(@ISA @EXPORT @EXPORT_OK);

# verbose logging: file-level switches read by vb()
my $vb=1; # 0 = no logging (vb() returns immediately), 1 = logging
# true when IO::Interactive::is_interactive() reports false (presumably: the
# output is not attached to a terminal - confirm against IO::Interactive);
# vb() then also copies every message to STDERR
my $vbmirror=(! IO::Interactive::is_interactive());
# Names in the binary_grid2 namespace that are cleared below before this
# module installs its own versions.
my @funcs_overridden = qw(
    slurm_grid
    slurm_workingdir
    write_slurm_script
    set_slurm_job_status
    get_slurm_job_status
    check_and_merge_slurm_jobs
    check_slurm_jobs_done
    merge_slurm_jobs
    slurm_submit
    jobid
    slurm_script_data
    datafiles_string_comma
    datafiles_string
    slurm_rerun_command
    slurm_check_joining_file
    slurm_job_hook
    grid_interrupted
    check_for_saved_snapshot
    pre_load_snapshot
    post_load_snapshot
    checkpoint
    increment_checkpoint_time
    output_allowed
);

# Undefine each of the above in binary_grid2, announcing every name on
# STDERR. The undef is done through a string eval; should that eval fail,
# its error is printed and the program exits.
for my $name (@funcs_overridden)
{
    print STDERR "Define binary_grid2::slurm\-\>$name\n";
    eval "undef \&binary_grid2::$name; ";

    # nothing went wrong : on to the next name
    next if(!$@);

    print $@;
    exit;
}
############################################################
# generic internal functions
############################################################


# vb(@message_parts)
#
# Verbose logger for this module. Does nothing when the file-level switch
# $vb is 0. Otherwise the arguments are joined into one string (by
# interpolating "@_", i.e. with the list separator, a space by default),
# the string is made to end in exactly one newline, and it is printed to
# STDOUT with a 'slurm_grid(vb,stdout): ' prefix. When $vbmirror is true
# the same text is also printed to STDERR, labelled as the stderr copy.
sub vb
{
    return if($vb==0);

    # set autoflush first time (just in case)
    state $inited=0;
    if($inited==0)
    {
        $inited=1;
        STDOUT->autoflush(1);
        STDERR->autoflush(1);
    }

    # join args to make string, force newline at the end (but just one):
    # strip every trailing newline (chomp would only remove a single one)
    # before appending our own
    my $s="@_";
    $s =~ s/\n+\z//;
    $s .= "\n";

    # output: the mirrored copy carries its own stream name so that the two
    # copies can be told apart in merged logs
    print STDOUT 'slurm_grid(vb,stdout): ',$s;
    print STDERR 'slurm_grid(vb,stderr): ',$s if($vbmirror);
}


############################################################
# slurm functions for binary_grid2
############################################################

# slurm_grid : installed as binary_grid2::slurm_grid, called as a method
# ($self->slurm_grid()).
#
# What it does is selected by the grid option slurm_command:
#
#   'run_flexigrid' : we are inside a slurm job. Sets the number of threads
#                     and returns the result of $self->flexigrid(1).
#   'join'          : if every job's status file says 'finished' and nobody
#                     else has started joining, sets up merge_datafiles and
#                     returns the result of $self->flexigrid(1). Otherwise
#                     returns nothing.
#   anything else   : we are the controller. Writes the sbatch array script
#                     to <slurm_dir>/slurm_script, submits it with sbatch
#                     (unless slurm_postpone_sbatch is set) and then exits
#                     the program - this branch never returns.
#
# The grid option slurm_njobs must be defined, otherwise a message is
# printed to STDERR and the program exits.
*binary_grid2::slurm_grid = sub
{
    my $self = shift;

    # grid wrapper for slurm : should be called exactly like
    # flexigrid() but instead distributes the work to slurm jobs
    vb("Bin $Bin : script $Script : RealBin $RealBin : perl executable $^X");

    # disable timeouts and non-stdout/stderr logging
    $self->{_grid_options}{timeout}=0;
    $self->{_grid_options}{thread_presleep}=0;
    $self->{_grid_options}{nfs_sleep}=0;
    $self->{_grid_options}{log_args}=0;
    $self->{_grid_options}{tvb}=0;
    $self->{_grid_options}{no_gridcode_dump}=1;
    $self->{_bse_options}{log_filename}='/dev/null';

    # slurm_command is either:
    #
    # empty : this is the controller script, run by the user
    #         on the slurm master machine
    #
    # run_flexigrid : in which case the flexigrid is called
    #                 to evolve the stellar population
    #
    #
    # join : this is called to join datafiles saved
    #        by previous 'run_flexigrid' calls
    #
    # (an unset slurm_command is treated as the empty string)
    my $slurm_command = $self->{_grid_options}{slurm_command} // '';

    vb("slurm_command is $slurm_command");

    # how many jobs in the array?
    my $njobs = $self->{_grid_options}{slurm_njobs};
    if(!defined $njobs)
    {
        print STDERR "You must specify, e.g. in grid_option slurm_njobs, the number of jobs\n";
        exit;
    }

    if($slurm_command eq 'run_flexigrid')
    {
        # in a job : just run flexigrid on one CPU
        vb("call flexigrid(1)");

        # how many CPUs?
        my $n;
        my $ncpus = rob_misc::ncpus();
        if($self->{_grid_options}{slurm_use_all_node_CPUs})
        {
            # all CPUs of the node except those reserved for control,
            # but at least one. The parentheses matter: // binds less
            # tightly than -, so without them the default of 0 would
            # be applied to the difference, not to slurm_control_CPUs.
            $n=rob_misc::MAX(1,$ncpus - ($self->{_grid_options}{slurm_control_CPUs}//0));
        }
        else
        {
            $n=1;
        }
        $self->{_grid_options}{nthreads} = $n;
        $self->{_flexigrid}{nthreads} = $n;
        printf "Use %d CPUs per node (slurm_use_all_node_CPUs = %d, ncpus = %d)\n",
            $n,
            $self->{_grid_options}{slurm_use_all_node_CPUs} // 0,
            $ncpus;
        return $self->flexigrid(1);
    }
    elsif($slurm_command eq 'join')
    {
        vb("Joining data with automatic algorithm");
        # (auto) join data : don't run the grid

        # the existence of this file marks "somebody is joining/has joined"
        my $joinfile = $self->{_grid_options}{slurm_dir}.'/joining/'.
            $self->{_grid_options}{slurm_jobid};

        # check that we're not already joining
        if(-f $joinfile)
        {
            vb("Another process is already joining : exiting (1)\n");
            return;
        }

        # check that we can join : every job's status file
        # must contain 'finished'
        vb("Checking jobs to see if we can join\n");
        for my $n (1..$njobs)
        {
            my $file = "$self->{_grid_options}{slurm_dir}/status/$self->{_grid_options}{slurm_jobid}.$n";
            print "Check file $file (jobid $self->{_grid_options}{slurm_jobid}, n $n)\n";
            # treat "nothing read" like any other non-finished status
            my $status = rob_misc::slurp($file) // '';
            chomp $status;
            if($status ne 'finished')
            {
                print "  ... not finished\n";
                return;
            }
            else
            {
                print "  ... finished\n";
            }
        }

        # check again that we're not already joining
        # NOTE(review): test-then-touch is not atomic, so two jobs that
        # finish at the same moment could both get past this check.
        if(-f $joinfile)
        {
            vb("Another process is already joining : exiting (2)\n");
            return;
        }

        # we are not joining, and can join : touch the joinfile
        rob_misc::touch($joinfile);
        vb("touched joinfile\n");

        $self->{_grid_options}{rungrid}=0;

        # make datafiles list from 1..$njobs
        my @datafiles;
        for my $n (1..$njobs)
        {
            push(@datafiles,
                 "$self->{_grid_options}{slurm_dir}/results/$self->{_grid_options}{slurm_jobid}.$n");
        }

        # make datafiles string
        $self->{_grid_options}{merge_datafiles} =
            join(',',@datafiles);
        print "merge_datafiles = $self->{_grid_options}{merge_datafiles}\n";

        # call flexigrid to join and process the data and output
        vb("Calling flexigrid");
        my @x = $self->flexigrid(1);

        # on return, leave the joinfile in place so we don't repeat,
        # and give control back to the grid script
        print "Slurm flexigrid done and joins, return @x to output\n";
        return @x;
    }
    else
    {
        # something else or no command
        # We're in the main controller script
        # so we have to set up the slurm batch script.
        vb("In main controller thread: set up slurm batch script");

        # check some variables are defined
        # NOTE(review): unlike the other checks this one only prints and
        # then carries on with an empty --partition - confirm whether it
        # should stop here.
        if(!defined $self->{_grid_options}{slurm_partition})
        {
            print "You must set the grid_option 'slurm_partition'\n";
        }

        # set up working directory
        vb("Set workingdir()");
        $self->slurm_workingdir();

        say "Use $njobs slurm jobs\n";
        vb("Use $njobs slurm jobs");

        # set defaults if not set
        $self->{_grid_options}{slurm_memory} //= '1024';

        if($self->{_grid_options}{slurm_memory} > $self->{_grid_options}{slurm_warn_max_memory})
        {
            printf "WARNING: you want to use > %d MB of RAM : this is unlikely to be correct. If you believe it is, set slurm_warn_max_memory to something very large (it is currently %d MB)\n",
            $self->{_grid_options}{slurm_memory},
            $self->{_grid_options}{slurm_warn_max_memory};
            exit;
        }

        # use megabytes : make the value end in exactly one 'M'
        $self->{_grid_options}{slurm_memory} =~s/M*$/M/;

        # make slurm script for job array control
        my $script = $self->{_grid_options}{slurm_dir}.'/slurm_script';
        open(my $slurm_fh,'>',$script)||
            die("cannot open $script : $!");

        # %A is the job id == $SLURM_ARRAY_JOB_ID
        # %a is the array id == $SLURM_ARRAY_TASK_ID

        # do not use SLURM_JOB_ID

        # use given slurm_array string, or default to 1-njobs%njobs
        my $slurm_array =
            $self->{_grid_options}{slurm_array} // "1-$njobs\%$njobs";

        # use passed in jobid (e.g. for rerun) or default to Slurm's
        # (the single-quoted default is expanded by the shell when the
        # script runs, not by perl)
        my $jobid = ($self->{_grid_options}{slurm_jobid} // '') ne '' ? $self->{_grid_options}{slurm_jobid} : '$SLURM_ARRAY_JOB_ID';

        # use passed in jobarrayindex (e.g. for rerun) or default to Slurm's
        my $jobarrayindex = ($self->{_grid_options}{slurm_jobarrayindex} // '') ne '' ? $self->{_grid_options}{slurm_jobarrayindex} : '$SLURM_ARRAY_TASK_ID';

        # command line which re-runs the grid script inside each job
        my $grid_command =
            join(" ",
                 '/usr/bin/env',
                 $^X,
                 $self->{_grid_options}{command_line},
                 'run_flexigrid=1',
                 'offset='.$jobarrayindex,
                 'modulo='.$njobs,
                 "results_hash_dumpfile=$self->{_grid_options}{slurm_dir}/results/$jobid.$jobarrayindex",
                 'slurm_jobid='.$jobid,
                 'slurm_jobarrayindex='.$jobarrayindex,
                 'slurm_jobname=binary_grid_'.$jobid.'.'.$jobarrayindex,
                 "slurm_njobs=$njobs",
                 "slurm_dir=$self->{_grid_options}{slurm_dir}",
                 "vb=$self->{_grid_options}{vb}"
            );


        print {$slurm_fh}
            "#!/bin/bash
# Slurm file for binary_grid2 and slurm
#SBATCH --error=$self->{_grid_options}{slurm_dir}/stderr/\%A.\%a
#SBATCH --output=$self->{_grid_options}{slurm_dir}/stdout/\%A.\%a
#SBATCH --job-name=$self->{_grid_options}{slurm_jobname}
#SBATCH --partition=$self->{_grid_options}{slurm_partition}
#SBATCH --time=$self->{_grid_options}{slurm_time}
#SBATCH --mem=$self->{_grid_options}{slurm_memory}
#SBATCH --ntasks=$self->{_grid_options}{slurm_ntasks}
#SBATCH --array=$slurm_array
		
# set status to \"running\"
echo \"running\" > $self->{_grid_options}{slurm_dir}/status/$jobid.$jobarrayindex

# run grid of stars
$grid_command rungrid=1 slurm_command=run_flexigrid

# set status to \"finished\"
echo \"finished\" > $self->{_grid_options}{slurm_dir}/status/$jobid.$jobarrayindex
";

        # perhaps join on the remote machine
        if(!$self->{_grid_options}->{slurm_postpone_join})
        {
            print {$slurm_fh} "
# check if we can join
$grid_command rungrid=0 results_hash_dumpfile=$self->{_grid_options}{slurm_dir}/results/$jobid.all slurm_command=join 
";
        }

        # buffered write errors only show up at close : do not submit
        # a script that may be incomplete
        close($slurm_fh)||
            die("cannot close $script : $!");

        if(!$self->{_grid_options}->{slurm_postpone_sbatch})
        {
            # submit slurm array
            my $cmd="sbatch $script";
            print "Running slurm script \"$cmd\"\n";
            my @sbatch_output = `$cmd`;
            my $sbatch_status = $?;
            print @sbatch_output;
            if($sbatch_status != 0)
            {
                # tell the user instead of silently claiming success
                printf STDERR "WARNING: \"%s\" failed (wait status %d)\n",
                    $cmd,
                    $sbatch_status;
            }
            vb("Scripts submitted : all done");
        }
        else
        {
            print "Slurm script is in \"$script\" but has not been launched\n";
            vb("Slurm script is in \"$script\" but has not been launched\n");
        }

        print "All done\n";
        exit;
    }
};


# checkpoint : placeholder installed as binary_grid2::checkpoint.
# It does no work yet and simply returns 1, whatever it is passed.
*binary_grid2::checkpoint = sub { return 1; };

# slurm_workingdir : installed as binary_grid2::slurm_workingdir, called as
# a method ($self->slurm_workingdir()).
#
# Checks that the grid option slurm_dir names an existing directory (if it
# does not, a message is printed and the program exits), creates the
# working subdirectories inside it with rob_misc::mkdirhier, waits until
# all of them are visible, and returns the slurm_dir path.
*binary_grid2::slurm_workingdir = sub
{
    my $self = shift;

    # set up the working directories

    my $slurm_dir = $self->{_grid_options}{slurm_dir};

    # the directory has to be made manually
    if(!defined $slurm_dir ||
       !-d $slurm_dir)
    {
        # an unset slurm_dir is shown as an empty name in the message
        my $shown = $slurm_dir // '';
        print "Slurm's working directory (set to $shown) is not a directory. Please make it, or set it to something else, and retry\n";
        exit;
    }

    say "Slurm working directory: $slurm_dir";

    # the subdirectories every slurm run needs, listed once so that the
    # "make" and "wait" loops below cannot drift apart
    my @subdirs = ('scripts','stdout','stderr','results','logs','status','joining');

    # make directory tree
    foreach my $subdir (@subdirs)
    {
        rob_misc::mkdirhier($slurm_dir.'/'.$subdir);
    }

    # wait for the dirs to be made (necessary on NFS)
    # NOTE(review): there is no time limit - if a directory can never be
    # created this loops forever.
    say "Waiting for dirs...";
    my $fail = 1;
    while($fail)
    {
        $fail = 0;
        foreach my $subdir (@subdirs)
        {
            my $path = $slurm_dir.'/'.$subdir;
            if(! -d $path)
            {
                # not there yet : pause, then rescan the whole list
                sleep 1;
                $fail = 1;
            }
        }
    }

    # return directory name
    return $slurm_dir;
};








1;

__END__
    
