package binary_grid::condor;

$|=1;
use 5.16.0;
use strict;
use IO::Handle;
use common::sense;
use 5.16.0;
use feature qw(say state);
################################################################
######### >>> condor extensions for binary_grid <<< ############
################################################################
#
# Written by Robert Izzard 
# (r.izzard@surrey.ac.uk or via gmail: rob.izzard) 
# (c) 2000-2019 and onwards into the 21st century.

# version number should match the binary_grid version
our $VERSION = '2.1.5';

############################################################
#
# Changelog
#
# 2.0 Updated to use binary_grid2
# 2.1.5 Add "say state" fudge for perl 5.31.6
#
############################################################

$|=1; # autoflush please (for log output)

# Standard modules:
# if you don't have them then check http://www.cpan.org/ or, better, 
# use cpanm or packages to install them.
use Carp qw(confess);
use Clone qw(clone);
use Config; # for signals
use Fcntl; # for file (un)blocking
use IO::File;
use IPC::Open3;
use Sort::Key qw(nsort);  
use Sys::Hostname;
use Time::HiRes qw(sleep gettimeofday tv_interval);
use File::Basename;
use rob_misc qw(mkdirhier slurp);
use FindBin qw($Bin $Script $RealBin);
use lib "$Bin/../lib";
use IO::Interactive;
use Data::Serializer;
use Data::Serializer::RobJSON;

# module setup
# Exporter setup: this package inherits import() from Exporter.
require Exporter;
our @ISA = qw(Exporter);

# Functions to export
# (both names are exported by default via @EXPORT and are also
# available through the ':all' tag / @EXPORT_OK)
my @funcs= qw( &condor_grid &condor_job_hook);

our %EXPORT_TAGS = ( 'all' => [ @funcs ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = @funcs;
# NOTE(review): the variables are already declared with "our" above, so
# this "use vars" looks redundant - confirm before removing.
use vars qw(@ISA @EXPORT @EXPORT_OK);

# verbose logging
# File-level switch read by vb(): 0 = no logging, 1 = logging
my $vb=1; # 0 = no logging, 1= logging
# When true, vb() also copies each message to STDERR; it is true when
# IO::Interactive reports that we are not running interactively.
my $vbmirror=(! IO::Interactive::is_interactive()); # mirror stderr if not a terminal
# Names of the binary_grid2 subroutines that this module (re)defines
# further down through *binary_grid2::NAME glob assignments.
my @funcs_overridden = qw(
        condor_grid
 condor_workingdir
 make_condor_script
 premake_condor_outfiles
 write_condor_script
 set_condor_job_status
 get_condor_job_status
 check_and_merge_condor_jobs
 check_condor_jobs_done
 merge_condor_jobs
 condor_submit
 jobid
 condor_script_data
 datafiles_string_comma
 datafiles_string
 condor_rerun_command
 condor_check_joining_file
 condor_job_hook
 grid_interrupted
 check_for_saved_snapshot
 pre_load_snapshot
 post_load_snapshot
 checkpoint
 increment_checkpoint_time
 output_allowed
        );
# delete the subroutines we will override
# (each name is announced on STDERR, then undefined in package
# binary_grid2 with a string eval; an eval error is printed and the
# program exits)
# NOTE(review): presumably this avoids "subroutine redefined" warnings
# when the new definitions are installed - confirm.
foreach my $subroutine (@funcs_overridden)
{
    print STDERR "Define binary_grid2::condor\-\>$subroutine\n";
    eval "undef \&binary_grid2::$subroutine; "; 
    if($@)
    {
        print $@;
        exit;
    }
}
############################################################
# generic internal functions
############################################################


sub vb
{
    # Verbose logger: joins its arguments into a single line, prints it
    # to STDOUT with a fixed prefix and, when $vbmirror is set, prints
    # the same line to STDERR as well. Does nothing when $vb is 0.
    return if($vb==0);

    # switch on autoflush for both streams the first time we get here
    # (a state initialiser is evaluated only once)
    state $inited = do {
        STDOUT->autoflush(1);
        STDERR->autoflush(1);
        1;
    };

    # build the message and make sure it ends with exactly one newline
    my $message = "@_";
    chomp $message;
    $message .= "\n";

    # output, optionally mirrored
    my $prefix = 'condor_grid(vb,stdout): ';
    print STDOUT $prefix,$message;
    if($vbmirror)
    {
        print STDERR $prefix,$message;
    }
}


############################################################
# condor functions for binary_grid2
############################################################

# condor_grid : condor front end for a binary_grid2 population run.
#
# What happens depends on $self->{_grid_options}{condor_command}:
#   'run_flexigrid'  : run flexigrid(1) with nthreads forced to 1
#   'join_datafiles' : set rungrid=0 and condor_jobid='join', then call
#                      flexigrid(1)
#   'join'           : as 'join_datafiles', but first point
#                      merge_datafiles_filelist at
#                      <condor_dir>/scripts/joinlist
#   anything else    : controller mode - create the working directory,
#                      write the join command file, write one condor
#                      script per job, submit those whose status allows
#                      it, then exit
#
# Optional argument: the number of jobs; falls back to
# _grid_options{condor_njobs}, then to 16.
#
# Returns whatever flexigrid() returns in the first three modes. In
# controller mode it never returns (it calls exit).
*binary_grid2::condor_grid = sub
{
    my $self = shift;

    # grid wrapper for condor : should be called exactly like
    # flexigrid() but instead distributes the work to condor jobs
    vb("Bin $Bin : script $Script : RealBin $RealBin : perl executable $^X");

    # disable timeouts and non-stdout/stderr logging
    $self->{_grid_options}{timeout}=0;
    $self->{_grid_options}{thread_presleep}=0;
    $self->{_grid_options}{nfs_sleep}=0;
    $self->{_grid_options}{log_args}=0;
    $self->{_grid_options}{tvb}=0;
    $self->{_grid_options}{no_gridcode_dump}=1;
    $self->{_bse_options}{log_filename}='/dev/null';

    # condor_command is either:
    #
    # empty : this is the controller script, run by the user
    #         on the condor master machine
    #
    # run_flexigrid : in which case the flexigrid is called
    #                 to evolve the stellar population
    #
    #
    # join : this is called to join datafiles saved
    #                  by previous 'run_flexigrid' calls
    #
    # join_datafiles : this should not be used manually 
 
    vb("condor_command is $self->{_grid_options}{condor_command}");

    if($self->{_grid_options}{condor_command} eq 'run_flexigrid')
    {
	# in a job : just run flexigrid on one CPU
	vb("call flexigrid(1)");
        $self->{_grid_options}{nthreads} = 1;
        $self->{_flexigrid}{nthreads} = 1;
	return $self->flexigrid(1);
    }
    elsif($self->{_grid_options}{condor_command} eq 'join_datafiles')
    {
	vb("Joining data");
	# join data : don't run the grid
	$self->{_grid_options}{rungrid}=0;
        $self->{_grid_options}{condor_jobid}='join';
	# but do call flexigrid to join the data
	vb("Calling flexigrid");
	my @x = $self->flexigrid(1);
	print "Flexigrid done, return @x\n";
	return @x;
    }
    elsif($self->{_grid_options}{condor_command} eq 'join')
    {
        vb("Joining data with automatic algorithm");
        # (auto) join data : don't run the grid
	$self->{_grid_options}{rungrid}=0;
        $self->{_grid_options}{condor_jobid}='join';
        # use the file list written by condor_rerun_command()
        $self->{_grid_options}{merge_datafiles_filelist} = 
            $self->{_grid_options}{condor_dir}.'/scripts/joinlist';
	# but do call flexigrid to join the data
	vb("Calling flexigrid");
	my @x = $self->flexigrid(1);
	print "Flexigrid done, return @x\n";
	return @x;
    }
    else
    {
	# something else or no command 
        # We're in the main controller script
	# so we have to set up the condor scripts.
	vb("In main controller thread: set up scripts : condor universe = $self->{_grid_options}{condor_universe}");

	# set up working directory
	vb("Set workingdir()");
	$self->condor_workingdir();

	# how many scripts? default to 16 if nothing is given
	# (// and || have equal precedence and associate left, so this
	#  is (arg // condor_njobs) || 16)
	my $njobs = $_[0] // $self->{_grid_options}{condor_njobs} || 16;
	say "Use $njobs condor jobs, universe = $self->{_grid_options}{condor_universe}\n";
	vb("Use $njobs condor jobs");

	# clean up joining files
	vb("Unlink joining files");
	unlink($self->{_grid_options}{condor_dir}.'/status/joining');
	
	# make scripts
	vb("Make condor scripts");

        # make join script
        # (condor_rerun_command also writes scripts/joinlist and
        #  scripts/join as a side effect)
        my $cmd = $self->condor_rerun_command();
        my $pwd = $self->{_grid_options}{condor_join_pwd} // $ENV{PWD}; 
        $cmd = "cd $pwd \&\& $cmd";
        
        # output the join command in case the join
        # is postponed
        my $f= $self->{_grid_options}{condor_dir}.'/scripts/joincmd';
        open(my $fp,'>'.$f)||confess("cannot open $f for writing");
        say {$fp} "# join command:\n";
        say {$fp} $cmd;
        close $fp;
        chmod 0700, $f;

        # make individual scripts
	foreach my $n (1..$njobs)
	{
	    # per-job options passed to the script/status helpers
	    my $opts={
		n=>$n,
		njobs=>$njobs,
		dir=>$self->{_grid_options}{condor_dir},
		cmdline=>$self->{_grid_options}{command_line},
		wd=>$self->{_grid_options}{working_directory},
		memory=>$self->{_grid_options}{condor_memory},
		streams=>$self->{_grid_options}{condor_streams},
	    };
	    
	    my $scriptfile = $self->make_condor_script($opts);

	    $self->premake_condor_outfiles($opts);
	    
            # check existing jobs : if 'finished' then skip it 
            my $status = $self->get_condor_job_status($opts);

	    # todo port this to binary_grid
	    
	    # decide whether to submit
	    # (always when there is no status yet; otherwise only when
	    #  the condor_resubmit_* option matching the status is set)
	    my $submit =
		(!defined $status || $status eq '') ||
		($status eq 'finished' && $self->{_grid_options}{condor_resubmit_finished}) ||
		($status eq 'submitted' && $self->{_grid_options}{condor_resubmit_submitted}) ||
		($status eq 'running' && $self->{_grid_options}{condor_resubmit_running}) ||
		($status =~/crash/ && $self->{_grid_options}{condor_resubmit_crashed});
            
            print "Current status is $status : submit = \"$submit\" (resubmit is: submitted=$self->{_grid_options}{condor_resubmit_submitted} running=$self->{_grid_options}{condor_resubmit_running} finished=$self->{_grid_options}{condor_resubmit_finished} crashed=$self->{_grid_options}{condor_resubmit_crashed})\n\n";
	    if($submit)
	    {
		# made script, submit it
		print "made script $n : submit\n";
		$self->condor_submit($opts,$scriptfile);
	    }
	    else
	    {
		print "Job $n has status $status but is being skipped\n";
	    }
        }

	vb("Scripts submitted : all done");
	# the controller's work is done once the jobs are submitted
	exit;
    }
};


# condor_workingdir : choose and create the condor working directory.
#
# Keeps _grid_options{condor_dir} if it is already set, otherwise sets
# it to <tmp>/condor/<random digits> where <tmp> is _grid_options{tmp}
# or /tmp. Creates the directory and its scripts, stdout, stderr,
# results, logs and status subdirectories, dying if one of them does
# not exist after the attempt.
#
# Returns the working directory path.
*binary_grid2::condor_workingdir = sub
{
    my $self = shift;

    # define a working directory for condor : this is the existing
    # condor_dir if there is one, otherwise a randomly named
    # directory (the digits after the decimal point of rand())
    $self->{_grid_options}{condor_dir} ||=    
	($self->{_grid_options}{tmp}//'/tmp').'/condor/'.(rand()=~/\d\.(\d+)/)[0];

    say "Condor working directory: $self->{_grid_options}{condor_dir}";

    # make directory tree
    mkdirhier($self->{_grid_options}{condor_dir});
    foreach my $subdir ('scripts','stdout','stderr','results','logs','status')
    {
	my $d = $self->{_grid_options}{condor_dir}.'/'.$subdir;
	mkdirhier($d);
	if(!-d $d)
	{
	    die("Directory $d does not exist, and I just tried to make it");
	}
	else
	{
	    vb("Made directory $d\n");
	}
    }

    # wait for the dirs to be made (necessary on NFS)
    {
	say "Waiting for dirs...";
	while(!(-d $self->{_grid_options}{condor_dir}.'/status'))
	{
	    sleep 1;
	}
    }

    return $self->{_grid_options}{condor_dir};
};

# make_condor_script : build the condor script text for one job and
# write it to disk. Arguments (after $self) are passed unchanged to
# condor_script_data() and write_condor_script(); the first one is the
# job options hash. Returns what write_condor_script() returns.
*binary_grid2::make_condor_script = sub
{
    my ($self, @args) = @_;
    say "Make condor_script $args[0]{n}, universe $self->{_grid_options}{condor_universe}";

    # script text first, then the original arguments
    my @scriptdata = $self->condor_script_data(@args);
    return $self->write_condor_script(@scriptdata, @args);
};

# premake_condor_outfiles : touch the stdout, stderr and logs files of
# one job, i.e. <dir>/<stdout|stderr|logs>/<jobid>, where dir comes
# from the job options hash given as first argument.
*binary_grid2::premake_condor_outfiles = sub
{
    my ($self, $opts) = @_;
    my $id = $self->jobid($opts);
    foreach my $subdir (qw(stdout stderr logs))
    {
	my $outfile = "$opts->{dir}/$subdir/$id";
	rob_misc::touch($outfile);
    }
};

# write_condor_script : write a condor submit script and its bash
# wrapper to <condor_dir>/scripts.
#
# Arguments (after $self):
#   1. the script text
#   2. the job options hash (passed to jobid())
#
# Files written:
#   <condor_dir>/scripts/<jobid>      : the script text (mode 0744)
#   <condor_dir>/scripts/<jobid>.wrap : bash wrapper (mode 0744) that
#       runs its arguments as a background command and records
#       "running", "finished" or "crashed <n>" in
#       <condor_dir>/status/<jobid>
#
# Returns the script file name. Confesses if a file cannot be opened
# or closed.
*binary_grid2::write_condor_script = sub
{
    my $self = shift;
    # write condor script to a file
    my $scriptdata=shift @_; # data is always first arg
    my $jobid = $self->jobid($_[0]);
    my $scriptfile=$self->{_grid_options}{condor_dir}.'/scripts/'.$jobid;
    
    open(my $script_fp,'>',$scriptfile)||
        confess("can't open condor script $scriptfile for writing");
    print {$script_fp} $scriptdata;
    # buffered write errors only show up at close
    close($script_fp)||
        confess("can't close condor script $scriptfile : $!");
    chmod 0744,$scriptfile;
    say "Script written to $scriptfile";

    # make the wrapper script
    my $wrapfile = $scriptfile.'.wrap';
    my $statusfile=$self->{_grid_options}{condor_dir}.'/status/'.$jobid;

    # separate handle name : declaring "my $fp" twice in one scope
    # masks the first declaration
    open(my $wrap_fp,'>',$wrapfile)||
        confess("cannot open $wrapfile for writing");
    say {$wrap_fp} "#!/bin/bash

echo \"running\" > $statusfile

function _term {
    echo \"Caught SIGTERM kill PID \$PID\"
    kill -TERM \$PID
    kill -KILL \$PID
    echo \"crashed 15\" > $statusfile
    exit
}
trap _term SIGTERM
function _int {
    echo \"Caught SIGINT kill PID \$PID\"
    kill -INT \$PID
    kill -TERM \$PID
    kill -KILL \$PID
    echo \"crashed 2\" > $statusfile
    exit
}
trap _int SIGINT

CMD=\"\$\@\"
\$CMD \&
PID=\$\!

wait \$PID
trap - SIGTERM SIGINT
wait \$PID

status=\$\(echo \$?\)
if \[\[ \$status -eq 0 \]\]; then
  echo \"finished\" > $statusfile
  exit 0
else
  echo \"crashed \$status\" > $statusfile
  exit \$status  
fi

";
    close($wrap_fp)||
        confess("cannot close $wrapfile : $!");
    chmod 0744, $wrapfile;
    say "Wrapper written to $wrapfile\n";


    return $scriptfile;
};

# set_condor_job_status : write a job's status string, followed by a
# newline, to <condor_dir>/status/<jobid>, replacing any previous
# content.
#
# Arguments (after $self): job id, status string.
# Confesses if the status file cannot be opened or closed.
*binary_grid2::set_condor_job_status = sub
{
    my ($self, $jobid, $status) = @_;
    my $statusfile=$self->{_grid_options}{condor_dir}.'/status/'.$jobid;

    # three-argument open : the mode cannot be changed by the filename
    open(my $fp,'>',$statusfile)||
        confess("cannot open status file $statusfile for writing");
    say {$fp} $status;

    # buffered write errors only show up at close
    close($fp)||
        confess("cannot close status file $statusfile : $!");
};

# get_condor_job_status : read the status of one job.
#
# Argument (after $self): the job options hash (passed to jobid()).
#
# Returns the first line of <condor_dir>/status/<jobid> without its
# trailing newline, or the empty string when the file cannot be opened
# or is empty.
*binary_grid2::get_condor_job_status = sub
{
    my ($self, $opts) = @_;
    my $jobid = $self->jobid($opts);
    my $statusfile=$self->{_grid_options}{condor_dir}.'/status/'.$jobid;

    # no readable status file : no status
    open(my $fp,'<',$statusfile) or return '';
    my $status = <$fp>;
    close $fp;

    # an empty file gives undef here : treat it as "no status" rather
    # than calling chomp on undef
    return '' if(!defined $status);

    chomp $status;
    return $status;
};

# check_and_merge_condor_jobs : when check_condor_jobs_done() reports
# that every job has finished, return what merge_condor_jobs() returns;
# otherwise return (undef,undef). The arguments after $self are passed
# unchanged to both methods.
*binary_grid2::check_and_merge_condor_jobs = sub
{
    my ($self, @args) = @_;

    # not all jobs are done : nothing to merge yet
    return (undef,undef) unless($self->check_condor_jobs_done(@args));

    return ($self->merge_condor_jobs(@args));
};

# check_condor_jobs_done : check whether every job has finished.
#
# Arguments (after $self):
#   1. the condor working directory
#   2. the number of jobs
#
# For each job 1..njobs the first line of <path>/status/<n>.<njobs> is
# read. Returns 1 if every one of them contains "finished"; returns 0
# at the first job whose status file is missing, empty or says
# anything else.
*binary_grid2::check_condor_jobs_done = sub
{
    my ($self, $path, $njobs) = @_;

    foreach my $n (1..$njobs)
    {
	my $jobid=$n.'.'.$njobs;
	my $f="$path/status/$jobid";

	my $fp;
	if(!open($fp,'<',$f))
	{
	    print "WARNING : Probable error : could not find status file $f - it should at least exist!\n"; 
	    vb("WARNING : Probable error : could not find status file $f - it should at least exist!\n"); 
	    return 0;
	}

	my $status = <$fp>;
	close $fp;

	# an empty status file gives undef : that is "not finished"
	if(!defined $status || $status !~ /finished/)
	{
	    say "Job $jobid has not finished\n";
	    return 0;
	}

	say "Job $jobid has finished\n";
    }

    say "All condor jobs have finished\n";

    # no error, all jobs are finished : return 1
    return 1;
};

# merge_condor_jobs : dump the results hash to <path>/results/joined
# and return merged job data.
#
# Arguments (after $self):
#   1. the condor working directory
#   2. the number of jobs
#
# Returns a list: whatever merge_results_hash_dumps() returns, followed
# by the name of the "joined" file.
*binary_grid2::merge_condor_jobs = sub
{
    my $self = shift;
    # merge all job data in $path
    my $path=$_[0];
    my $njobs=$_[1];
    
    # make file list
    # NOTE(review): @f is filled with <path>/results/<n>.<njobs> but is
    # never used below - merge_results_hash_dumps() is given @_ (path
    # and njobs) instead. Confirm which argument list it expects.
    my @f;
    foreach (1..$njobs)
    {
	my $jobid=$_.'.'.$njobs;
	push(@f,"$path/results/$jobid");
    }

    # save 

    # save data to file
    # NOTE(review): the dump is written before the merge call in the
    # return statement below - confirm this order is intended.
    my $f="$path/results/joined";
    $self->dump_results_hash($f);

    # return data and filename
    return ($self->merge_results_hash_dumps(@_),$f);
};

# condor_submit : submit one condor script and record the job as
# 'submitted'.
#
# Arguments (after $self):
#   1. the job options hash (passed to jobid())
#   2. the script file to submit
#
# condor_submit is run without a shell, so a script path containing
# spaces or shell metacharacters is passed through intact. Its standard
# output is captured and only shown if the submission fails. On failure
# a warning is issued, the job status is left untouched (so the job is
# not mistaken for a submitted one) and nothing is returned.
*binary_grid2::condor_submit = sub
{
    my ($self, $opts, $scriptfile) = @_;
    say "condor_submit: $scriptfile (size ",-s $scriptfile,")\n";

    # list-form pipe open : no shell is involved
    my $pid = open(my $pipe,'-|','condor_submit',$scriptfile);
    if(!$pid)
    {
	warn "condor_submit: could not run condor_submit for $scriptfile : $!\n";
	return;
    }
    my @output = <$pipe>;

    # closing a pipe waits for the command and sets $?
    close $pipe;
    if($? != 0)
    {
	warn "condor_submit: submission of $scriptfile failed (exit status ",
	    ($? >> 8),")\n",@output;
	return;
    }

    $self->set_condor_job_status($self->jobid($opts),
                                 'submitted');
};

# jobid : job identifier string "<n>.<njobs>", built from the n and
# njobs entries of the job options hash given as first argument
# (n is the job number, njobs the number of jobs in the group).
*binary_grid2::jobid = sub
{
    my ($self, $opts) = @_;
    return join('.', $opts->{n}, $opts->{njobs});
};

# condor_script_data : build the text of the condor submit script for
# one job.
#
# Argument (after $self): the job options hash; the keys read here are
# n, njobs, cmdline, dir, wd, memory and streams.
#
# The text contains, in this order:
#   - the fixed part (executable, arguments, environment, universe,
#     output/error/log files, directories, streaming, memory request
#     and Requirements)
#   - one "key = value" line per entry of _grid_options{condor_options}
#     whose key does not match /[Re]quirements/
#   - one line per optional parameter that is set as
#     _grid_options{condor_<parameter>} (currently request_memory only)
#   - the final "queue" statement
#
# Side effect: an undefined or empty _grid_options{condor_requirements}
# is set to '1'.
#
# Returns the script text.
*binary_grid2::condor_script_data = sub
{
    my $self = shift;
    # return condor script data

    my $universe = $self->{_grid_options}{condor_universe};
    say "Make condor script data, universe = $universe";

    # the perl script : the command line with any condor_njobs=N
    # removed and whitespace squeezed
    my $script=$_[0]{cmdline};
    $script=~s/condor_njobs=\d+//g;
    $script=~s/\s+/ /g; $script=~s/\s+$//;

    # job id
    my $id=$self->jobid($_[0]);

    # set up environment : pass on PATH, PERL5LIB and every PERLBREW
    # variable that is defined
    my @environment;
    foreach my $name ('PATH','PERL5LIB',(grep {/PERLBREW/} keys %ENV))
    {
	push(@environment, $name.'='.$ENV{$name}) if(defined($ENV{$name}));
    }

    my $environment = join(' ',@environment);

    # no (or empty) requirements : use the always-true '1'
    if(!defined $self->{_grid_options}{condor_requirements} ||
       $self->{_grid_options}{condor_requirements} eq '')
    {
	$self->{_grid_options}{condor_requirements} = '1';
    }
        
    my $stream = $_[0]{streams} ? 'True' : 'False';

    my $s= "
  #################################################
  #                       
  # Condor script to run a binary_grid2 (sub)grid
  #
  # script $_[0]{n} of $_[0]{njobs}
  #                   
  #################################################


  executable     = $self->{_grid_options}{condor_dir}/scripts/$id.wrap
  arguments      = /usr/bin/env PWD=$ENV{PWD} $^X $script run_flexigrid=1 offset=".($_[0]{n}-1)." modulo=$_[0]{njobs} rungrid=1 results_hash_dumpfile=$_[0]{dir}/results/$id condor_command=run_flexigrid condor_universe=$self->{_grid_options}{condor_universe} condor_jobid=$id condor_njobs=$_[0]{njobs} condor_dir=$_[0]{dir} vb=$self->{_grid_options}{vb}
  environment    = \"$environment\"
  universe       = $universe
  output         = $_[0]{dir}/stdout/$id
  error          = $_[0]{dir}/stderr/$id
  log            = $_[0]{dir}/logs/$id
  initialdir     = $_[0]{wd}
  remote_initialdir     = $_[0]{wd}
  stream_output  = $stream
  stream_error   = $stream
+WantCheckpoint = False

  request_memory = $_[0]{memory}
  ImageSize = $_[0]{memory}

  Requirements = (1) \&\& (".
  $self->{_grid_options}{condor_requirements}.")\n";

    # add other user-defined options (Requirements are handled above)
    foreach my $key (grep {!/[Re]quirements/} keys %{$self->{_grid_options}{condor_options}})
    {
	$s .= '  '.$key.' = '.$self->{_grid_options}{condor_options}{$key}."\n";
    }
    
    # optional condor parameters are given as condor_* in grid_options 
    foreach my $opt (grep {defined($self->{_grid_options}{'condor_'.$_})}
		     ('request_memory')
	)
    {
	# look the value up with the loop variable : $_ is not set to
	# the parameter name inside this loop body
	$s .= "  $opt     = ".$self->{_grid_options}{'condor_'.$opt}."\n";
    }
    
    $s.= "

    queue                         
";

    return $s;
};

# datafiles_string_comma : the result file names of all jobs,
# <condor_dir>/results/<n>.<condor_njobs> for n = 1..condor_njobs,
# joined with commas.
*binary_grid2::datafiles_string_comma = sub
{
    my ($self) = @_;
    my $njobs = $self->{_grid_options}{condor_njobs};
    my @datafiles = map
    {
	$self->{_grid_options}{condor_dir}.'/results/'.$_.'.'.$njobs
    } (1..$njobs);
    return join(',',@datafiles);
};

# datafiles_string : the result file names of all jobs,
# <condor_dir>/results/<n>.<condor_njobs> for n = 1..condor_njobs,
# one per line (joined with newlines, no trailing newline).
*binary_grid2::datafiles_string = sub
{
    my ($self) = @_;
    my $njobs = $self->{_grid_options}{condor_njobs};
    my @datafiles = map
    {
	$self->{_grid_options}{condor_dir}.'/results/'.$_.'.'.$njobs
    } (1..$njobs);
    return join("\n",@datafiles);
};

# condor_rerun_command : build the shell command that reruns the
# script in "join" mode and write the helper files that go with it.
#
# The command is _grid_options{command_line} with the per-job arguments
# removed (condor_jobid, condor_njobs, rungrid=1, run_flexigrid=1,
# offset, modulo, results_hash_dumpfile, condor_command) and the join
# arguments and output redirections appended. When
# _grid_options{condor_save_joined_file} is set, a
# results_hash_dumpfile=<condor_dir>/results/joined argument is added
# as well.
#
# Files written:
#   <condor_dir>/scripts/joinlist : the list from datafiles_string()
#   <condor_dir>/scripts/join     : executable sh script holding a
#                                   commented dump of the grid and bse
#                                   options followed by the command
#
# Returns the command string. Confesses if a file cannot be opened.
*binary_grid2::condor_rerun_command = sub
{
    my $self = shift;
    # make condor rerun command and list of datafiles to be rerun
    say STDERR "make condor rerun command : $self\n";
    my $cmd = $self->{_grid_options}{command_line};

    # remove unwanted arguments
    $cmd=~s/condor_jobid\S+//;
    $cmd=~s/condor_njobs\S+//;
    $cmd=~s/rungrid=1//;
    $cmd=~s/run_flexigrid=1//;
    $cmd=~s/offset=\d+//;
    $cmd=~s/modulo=\d+//;
    $cmd=~s/results_hash_dumpfile=\S+//;
    $cmd=~s/condor_command=\S+//;

    my $list_filename = $self->{_grid_options}{condor_dir}.'/scripts/joinlist';
    {
	# output list of datafiles
	open(my $fp,'>',$list_filename) || 
            confess("cannot open $list_filename to make list of datafiles that need joining");
	say {$fp} $self->datafiles_string();
	close $fp;
    }

    my @parts = (
	$cmd,
	'rungrid=0',
	'merge_datafiles_filelist='.$list_filename,
	'condor_command=join_datafiles',
	'2>'.$self->{_grid_options}{condor_dir}.'/stderr/join',
	'>'.$self->{_grid_options}{condor_dir}.'/stdout/join',
	'condor_jobid=join',
	);

    # the optional dump file is a separate argument : it must be joined
    # with a space like the others, not glued onto condor_jobid=join
    push(@parts,
	 "results_hash_dumpfile=$self->{_grid_options}{condor_dir}/results/joined")
	if($self->{_grid_options}{condor_save_joined_file});

    $cmd=join(' ',@parts);
    $cmd=~s/\s+/ /g;

    {
	# output command script for rerun
	my $joinscript = $self->{_grid_options}{condor_dir}.'/scripts/join';
	open(my $fp,'>',$joinscript) || 
	    confess("cannot open joinscript $joinscript ");
        say {$fp} "\#!/bin/sh
# Script to join binary_grid2 simulation with parameters: 
#";
        # show options : every line of the dump is turned into a
        # shell comment
        # (Data::Dumper is loaded here because this file does not
        #  otherwise load it)
        require Data::Dumper;
        foreach my $hash ('_grid_options',
                          '_bse_options')
        {
            my $d = Data::Dumper->Dump([$self->{$hash}]);
            $d=~s/\n/\n\#/g;
            $d=~s/^/\#/;
            say {$fp} $d;
        }
        say {$fp} "############################################################\n$cmd";
     	close $fp;
	
        chmod 0755, $joinscript;
	print "Join script which you should execute: \n$joinscript\n";
    }

    return $cmd;
};

# condor_check_joining_file : claim the right to join the data.
#
# Uses <condor_dir>/status/joining as a marker:
#   - if the file already exists, another process is joining : say so
#     and exit
#   - otherwise write our PID into it, sleep for
#     _grid_options{nfs_sleep}, and read it back; if it holds another
#     PID, exit
#
# Returns the result of condor_rerun_command() when this process owns
# the joining file. Confesses if the file cannot be written or read,
# or has disappeared.
*binary_grid2::condor_check_joining_file = sub
{
    my $self = shift;
    # check for joining file : 
    my $joining_file = $self->{_grid_options}{condor_dir}.'/status/joining';
    if(-f $joining_file)
    {
	# joining file exists : a different process is joining
	say 'Some other process (PID '.slurp($joining_file).') is joining : I shall not';
	exit;
    }
    else
    {
	# touch the "joining" file and put our PID in it
	open(my $touch,'>',$joining_file) ||
	    confess("cannot open joining file $joining_file for writing");
	print {$touch} $$;
	close($touch) ||
	    confess("cannot close joining file $joining_file : $!");
    }

    sleep($self->{_grid_options}{nfs_sleep} // 0); # wait for the touch to take effect (prevent NFS issues)

    # check joining file has our PID
    if(-f $joining_file)
    {
	# joining file exists : is it ours?
	open(my $fp,'<',$joining_file) ||
	    confess("cannot open joining file $joining_file for reading");
	my $owner = <$fp>;
	close $fp;

	# an empty file (undef) cannot be ours
	$owner //= '';
	chomp $owner;
	if($owner ne $$)
	{
	    say "Joining file belongs to a different process : I am doing nothing\n";
	    exit;
	}
    }
    else
    {
	confess("joining file has gone missing!\n");
    }
    return $self->condor_rerun_command();
};

# condor_job_hook : called after a condor binary_grid2 job has
# finished.
#
# Marks the job (_grid_options{condor_jobid}) as 'finished', then
# forks:
#   - the child returns at once so the caller carries on as usual
#   - the parent drops its results hash, waits for the child and,
#     unless _grid_options{condor_postpone_join} is set, checks whether
#     all jobs are done. If they are, it claims the joining file and
#     runs the join command, in condor_join_pwd (or $ENV{PWD}) and
#     through ssh when condor_join_machine is defined. The parent
#     always exits and never returns.
#
# If fork fails a warning is issued and the sub returns, so the caller
# carries on without a join attempt.
*binary_grid2::condor_job_hook = sub
{
    my $self = shift;
    
    #
    # subroutine called after a condor-binary_grid2 job has finished 
    #
    # find job id
    my $jobid = $self->{_grid_options}{condor_jobid};

    # set job status to finished
    $self->set_condor_job_status($jobid,'finished');
    
    # fork child process
    my $child_pid = fork();

    if(!defined $child_pid)
    {
	# fork failed : there is no child to hand over to, so carry
	# on in this process as the child would have done
	warn "condor_check : fork failed ($!) : continuing without join check\n";
	return;
    }
    elsif($child_pid!=0)
    {
	# parent process
	say "condor_check : Parent process $$";
	
	# free memory (no longer required, just annoying waste of space)
	$self->{_grid_options}{results_hash} = undef;

	# wait for the child to finish
	say "Parent: waitpid for child : $child_pid"; 
	my $r = waitpid $child_pid,0;
	
	say "Parent : waitpid returned $r\n";

	if($self->{_grid_options}{condor_postpone_join})
	{
	    print "Join postponed\n";
	}
	else
	{
	    print "Check whether we can join now (not postponed)\n";
	    say "Parent: call check_condor_jobs_done";

	    if($self->check_condor_jobs_done($self->{_grid_options}{condor_dir},
					     $self->{_grid_options}{condor_njobs}))
	    {
		my $cmd = $self->condor_rerun_command();

		# check if the joining file exists, if yes: exits (from condor.pm)
		# if no, returns the command to rerun the grid to join all the data
		$self->condor_check_joining_file();

		say "Join condor data sets with exec of:\n\n############################################################\n\n$cmd\n\n\n";
		# join in appropriate directory
		my $pwd = $self->{_grid_options}{condor_join_pwd} // $ENV{PWD}; 
		$cmd = "cd $pwd \&\& $cmd";
		
		# join on another machine if required
		if(defined $self->{_grid_options}{condor_join_machine})
		{
		    $cmd = "ssh $self->{_grid_options}{condor_join_machine} \"$cmd\"";
		}
		
		# run the join (the command carries its own output
		# redirections)
		`$cmd`;
	    }
	    else
	    {
		say 'Parent : condor jobs not finished';
	    }
	}
	say 'Parent: exit';
	
	# exit : do not even return!
	exit;
    }
    else
    {
	# child process: just return and finish off as per usual
	say "condor_check : Child process $$ continuing until output() and end.";
	return;
    }
};

############################################################
############################################################

# grid_interrupted : react to an interrupt signal by asking flexigrid
# to shut down.
#
#   - grid already finished (_grid_options{flexigrid_finished}) :
#     only print a message
#   - _grid_options{condor_snapshot_on_kill} set : request a snapshot
#     to <condor_dir>/results/<condor_jobid>.sv, then request quit
#   - otherwise : request quit
#
# Always returns 0.
*binary_grid2::grid_interrupted = sub
{
    my ($self) = @_;
    my $jobid = $self->{_grid_options}{condor_jobid};
    my $path = $self->{_grid_options}{condor_dir};
    
    # SIGKILL caught, probably from condor_master
    print "grid_interrupted(): Caught SIGKILL at ",scalar localtime(),"\n";

    # nothing to ask for if the grid is already done
    if($self->{_grid_options}{flexigrid_finished})
    {
	print "Grid has finished : wait for it rather than attempt a snapshot\n";
	return 0;
    }

    if($self->{_grid_options}{condor_snapshot_on_kill})
    { 
	# ask for a snapshot before quitting
	my $snapshot = "$path/results/$jobid.sv";
	$self->{_grid_options}{snapshot_file} = $snapshot; 
	print "Sending save_snapshot action to $snapshot at ",scalar localtime(),"\n";
	$self->{_flexigrid}{actions}{save_snapshot} = 1;
    }

    # grid has not finished, but we want to shut down :
    # try to do so gracefully
    print "Sending quit action at ",scalar localtime(),"\n";
    $self->{_flexigrid}{actions}{quit} = 1;

    return 0;
};

# check_for_saved_snapshot : look for a saved snapshot of this job.
#
# Returns undef when _grid_options{condor_load_from_snapshot} is not
# set. Otherwise returns the first of
#   <condor_dir>/results/<condor_jobid>.sv
#   <condor_dir>/results/<condor_jobid>.ck
# that exists and is not empty, or undef if neither does.
*binary_grid2::check_for_saved_snapshot = sub
{
    my ($self) = @_;
    return undef if(!$self->{_grid_options}{condor_load_from_snapshot});

    my $jobid = $self->{_grid_options}{condor_jobid};
    my $path = $self->{_grid_options}{condor_dir};

    # restart file first, then checkpoint, because the restart
    # is likely to be newer
    my @candidates = map { "$path/results/$jobid.$_" } ('sv','ck');

    # NB can be a symlink
    my @found = grep { -e -s $_ } @candidates;

    return $found[0] if(@found);
    return undef; # nothing found
};

# pre_load_snapshot : name of the snapshot file that should be loaded.
#
# Returns undef when _grid_options{condor_load_from_snapshot} is not
# set. Otherwise returns the first of
#   <condor_dir>/results/<condor_jobid>.sv
#   <condor_dir>/results/<condor_jobid>.ck
# that exists and is not empty - the same test as
# check_for_saved_snapshot() - so a checkpoint is chosen when there is
# no restart file. If neither exists the .sv name is returned, as it
# always was before.
*binary_grid2::pre_load_snapshot = sub
{
    my $self = shift;
    return undef if(!$self->{_grid_options}{condor_load_from_snapshot});

    # return the name of the snapshot file to be loaded
    my $jobid = $self->{_grid_options}{condor_jobid};
    my $path = $self->{_grid_options}{condor_dir};

    # first check for restart file, then checkpoint
    # because the restart is likely to be newer
    my @candidates = ("$path/results/$jobid.sv",
		      "$path/results/$jobid.ck");

    # NB can be a symlink
    my ($file) = grep { -e -s $_ } @candidates;

    # nothing on disk : keep the previous behaviour of returning
    # the restart file name
    $file //= $candidates[0];

    print "Return snapshot filename = $file at ",scalar localtime(),"\n";
    return $file;
};

# post_load_snapshot : hook called after a snapshot has been loaded.
#
# Argument (after $self): the name of the file that was loaded; the
# sub does nothing if it is undefined.
#
#   - a name ending in ".sv" : the file is unlinked
#   - a name ending in ".ck" : the file is kept and the next checkpoint
#     time is pushed back with increment_checkpoint_time()
#   - anything else : the file is kept and a message is printed
*binary_grid2::post_load_snapshot = sub
{
    my $self = shift;
    # post snapshot load hook : remove old file
    my $file = $_[0];
    return if(!defined $file);

    if($file =~/\.sv$/)
    {
	print "Snapshot loaded : unlink $file at ",scalar localtime(),"\n";
	unlink $file;
    }
    elsif($file=~/\.ck$/)
    {
	# (the dot is escaped so that only a real ".ck" suffix matches)
	print "Checkpoint loaded : $file remains\n";

	# don't allow an immediate checkpoint
    	$self->increment_checkpoint_time();
    }
    else
    {
	print "Snapshot loaded : $file not unlinked because it is a checkpoint\n"; 
    }
};

# checkpoint : request a timed checkpoint of the grid when one is due.
#
# Does nothing unless _grid_options{condor_checkpoint_interval} is set.
# On the first call the next checkpoint time is set. On later calls,
# once that time has passed:
#   - _grid_options{snapshot_file} is set to
#     <condor_dir>/results/<condor_jobid>.ck, with a
#     -DDMMYYYY-HHMM.SS suffix when
#     _grid_options{condor_checkpoint_stamp_times} is set
#   - the save_snapshot and relaunch_threads actions are requested
#   - the next checkpoint time is moved on
#   - with stamped names, <condor_dir>/results/<condor_jobid>.ck is
#     (re)made as a symlink to the stamped file, unless something that
#     is not a symlink is already there
*binary_grid2::checkpoint = sub
{
    my $self = shift;
    return if(!$self->{_grid_options}{condor_checkpoint_interval});

    # first time : set next checkpoint time
    if(! defined $self->{_grid_options}{condor_next_checkpoint})
    {
	$self->increment_checkpoint_time();
    	print "Set first checkpoint time at $self->{_grid_options}{condor_next_checkpoint}\n";
    }

    # other times, check if we should save a checkpoint
    elsif(time() > $self->{_grid_options}{condor_next_checkpoint})
    {
	my $jobid = $self->{_grid_options}{condor_jobid};
	my $path = $self->{_grid_options}{condor_dir};
	
	# timed checkpoint file
	my @time = localtime();

	# decide file name : if condor_checkpoint_stamp_times is set
	# then they are timestamped, otherwise just use $jobid.ck 
	# (localtime months run from 0 to 11, hence the +1)
	my $stamp = $self->{_grid_options}{condor_checkpoint_stamp_times} ?
	    sprintf('-%02d%02d%04d-%02d%02d.%02d',
		    $time[3],$time[4]+1,$time[5]+1900,
		    $time[2],$time[1],$time[0]) : '';
	my $f = "$path/results/$jobid.ck" . $stamp;

	$self->{_grid_options}{snapshot_file} = $f; 
	print "checkpoint(): Sending save_snapshot action to $f at ",scalar localtime(),"\n";

	# do the save and relaunch threads
	$self->{_flexigrid}{actions}{save_snapshot}=1;
	$self->{_flexigrid}{actions}{relaunch_threads}=1;
	
	# update next time
	$self->increment_checkpoint_time();
	print "Set next checkpoint time at $self->{_grid_options}{condor_next_checkpoint}\n";

	# make symlink for easy restart if required
	if($self->{_grid_options}{condor_checkpoint_stamp_times})
	{
	    my $sym = "$path/results/$jobid.ck";

	    # -l looks at the link itself, -e and -f at its target : test
	    # -l so that our own symlink is not mistaken for a real file
	    # and a dangling symlink is still replaced
	    if(!-l $sym && -e $sym)
	    {
		say "Warning : cannot make symlink at $sym because it is an existing file";
	    }
	    else
	    {
		unlink $sym if(-l $sym); # remove existing symlink
		symlink $f, $sym; # make new one
	    }
	}
    }
};

# increment_checkpoint_time : set _grid_options{condor_next_checkpoint}
# to the current time plus _grid_options{condor_checkpoint_interval}
# and return the new value.
*binary_grid2::increment_checkpoint_time = sub
{
    my ($self) = @_;
    my $interval = $self->{_grid_options}{condor_checkpoint_interval};
    return $self->{_grid_options}{condor_next_checkpoint} = time() + $interval;
};



# output_allowed : return 1 if we're allowed to output, 0 otherwise.
#
# Without _grid_options{condor} output is always allowed. With it,
# output is allowed only when _grid_options{condor_command} is
# 'join_datafiles' or 'join'.
*binary_grid2::output_allowed = sub
{
    my ($self) = @_;

    # not a condor run : no restriction
    return 1 if(!$self->{_grid_options}{condor});

    # condor run : only the joining commands may output
    my $allowed =
	$self->{_grid_options}{condor_command} eq 'join_datafiles' ||
	$self->{_grid_options}{condor_command} eq 'join';
    return $allowed ? 1 : 0;
};




######################################################################
# The following are binary_grid::condor functions only
#
# They are not guaranteed to work: please use binary_grid2::condor 
######################################################################



# condor_grid (legacy, binary_grid version) : condor front end for a
# population run, driven by the %binary_grid::grid_options and
# %binary_grid::bse_options package hashes.
#
# What happens depends on $binary_grid::grid_options{condor_command}:
#   'run_flexigrid'  : call binary_grid::flexigrid(1)
#   'join_datafiles' : set rungrid=0, then call binary_grid::flexigrid(1)
#   'join'           : as 'join_datafiles', but first point
#                      merge_datafiles_filelist at
#                      <condor_dir>/scripts/joinlist
#   anything else    : controller mode - create the working directory,
#                      write one condor script per job, submit those
#                      whose status allows it, then exit
#
# Optional argument: the number of jobs; falls back to
# $binary_grid::grid_options{condor_njobs}, then to 16.
sub condor_grid
{
    # grid wrapper for condor : should be called exactly like
    # flexigrid() but instead distributes the work to condor jobs
    vb("Bin $Bin : script $Script : RealBin $RealBin : perl executable $^X");

    # disable timeouts and non-stdout/stderr logging
    $binary_grid::grid_options{timeout}=0;
    $binary_grid::grid_options{thread_presleep}=0;
    $binary_grid::grid_options{nfs_sleep}=0;
    $binary_grid::grid_options{log_args}=0;
    $binary_grid::grid_options{tvb}=0;
    $binary_grid::grid_options{no_gridcode_dump}=1;
    
    # evolution-code specific options 
    if($binary_grid::grid_options{code} eq 'binary_c')
    {
	$binary_grid::bse_options{log_filename}='/dev/null'
    }
    elsif($binary_grid::grid_options{code} eq 'bonnfires')
    {
	delete $binary_grid::bse_options{log_filename};
    }

    print "LOG FILENAME $binary_grid::bse_options{log_filename}\n";
 
    vb("condor_command is $binary_grid::grid_options{condor_command}");

    if($binary_grid::grid_options{condor_command} eq 'run_flexigrid')
    {
	# in a job : just run flexigrid on one CPU
	vb("call flexigrid(1)");
	return binary_grid::flexigrid(1);
    }
    elsif($binary_grid::grid_options{condor_command} eq 'join_datafiles')
    {
	vb("Joining data");
	# join data : don't run the grid
	$binary_grid::grid_options{rungrid}=0;
	# but do call flexigrid to join the data
	vb("Calling flexigrid");
	my @x=binary_grid::flexigrid(1);
	print "Flexigrid done, return @x\n";
	return @x;
    }
    elsif($binary_grid::grid_options{condor_command} eq 'join')
    {
	vb("Joining data");
	# (auto) join data : don't run the grid
	$binary_grid::grid_options{rungrid}=0;
        $binary_grid::grid_options{merge_datafiles_filelist} = 
            $binary_grid::grid_options{condor_dir}.'/scripts/joinlist';
	# but do call flexigrid to join the data
	vb("Calling flexigrid");
	my @x=binary_grid::flexigrid(1);
	print "Flexigrid done, return @x\n";
	return @x;
    }
    else
    {
	# no condor command : we're in the main controller script
	# so we have to set up the condor scripts
	vb("In main controller thread: set up scripts");

	# set up working directory
	vb("Set workingdir()");
	condor_workingdir();

	# how many scripts? default to 16 if nothing is given
	# (// and || have equal precedence and associate left, so this
	#  is (arg // condor_njobs) || 16)
	my $njobs=$_[0] // $binary_grid::grid_options{condor_njobs} || 16;
	say "Use $njobs condor jobs, in universe $binary_grid::grid_options{condor_universe} \n";
	vb("Use $njobs condor jobs");

	# clean up joining files
	vb("Unlink joining files");
	unlink($binary_grid::grid_options{condor_dir}.'/status/joining');
	
	# make scripts
	vb("Make condor scripts");
	foreach my $n (1..$njobs)
	{
	    # per-job options passed to the script/status helpers
	    my $opts={
		n=>$n,
		njobs=>$njobs,
		dir=>$binary_grid::grid_options{condor_dir},
		cmdline=>$binary_grid::grid_options{command_line},
		wd=>$binary_grid::grid_options{working_directory},
		memory=>$binary_grid::grid_options{condor_memory},
		streams=>$binary_grid::grid_options{condor_streams},

	    };
	    
	    my $scriptfile=make_condor_script($opts);

	    premake_condor_outfiles($opts);

            # check existing jobs : if 'finished' then skip it 
            my $status = get_condor_job_status($opts);

	    # decide whether to submit
	    # (always when there is no status yet; otherwise only when
	    #  the condor_resubmit_* option matching the status is set)
	    my $submit =
		(!defined $status || $status eq '') ||
		($status eq 'finished' && $binary_grid::grid_options{condor_resubmit_finished}) ||
		($status eq 'submitted' && $binary_grid::grid_options{condor_resubmit_submitted}) ||
		($status eq 'running' && $binary_grid::grid_options{condor_resubmit_running}) ||
		($status =~/crash/ && $binary_grid::grid_options{condor_resubmit_crashed});

	    print "Current status is $status : submit = \"$submit\" (resubmit is: submitted=$binary_grid::grid_options{condor_resubmit_submitted} running=$binary_grid::grid_options{condor_resubmit_running} finished=$binary_grid::grid_options{condor_resubmit_finished} crashed=$binary_grid::grid_options{condor_resubmit_crashed})\n\n";
	    if($submit)
	    {
		vb("made script $n : submit");
                condor_submit($opts,$scriptfile);
	    }
	    else
	    {
		print "Job $n has status $status\n";
	    }
	}
	vb("Scripts submitted : all done");
	# the controller's work is done once the jobs are submitted
	exit;
    }
}


sub condor_workingdir
{
    # Choose, create and return the condor working directory.
    #
    # If grid_options{condor_dir} is already set (to a true value) it is
    # kept; otherwise <tmp>/condor/<random digits> is used, where <tmp> is
    # grid_options{tmp} (default /tmp). The chosen path is stored back in
    # grid_options{condor_dir}.
    #
    # The directory and its subdirectories scripts, stdout, stderr,
    # results, logs and status are created with mkdirhier().
    #
    # Returns: the working directory path.
    $binary_grid::grid_options{condor_dir} ||=
        ($binary_grid::grid_options{tmp} // '/tmp') . '/condor/' .
        (rand() =~ /\d\.(\d+)/)[0];

    my $dir = $binary_grid::grid_options{condor_dir};

    # (the original message had a stray closing parenthesis)
    say "Condor working directory: $dir";

    # make directory tree
    mkdirhier($dir);
    foreach my $subdir ('scripts','stdout','stderr','results','logs','status')
    {
        mkdirhier($dir.'/'.$subdir);
    }

    return $binary_grid::grid_options{condor_dir};
}

sub make_condor_script
{
    # Generate the condor script text for one job and write it to disk.
    #
    # All arguments are handed unchanged to condor_script_data(); the
    # resulting text, followed by the same arguments, is then passed to
    # write_condor_script(), whose return value is returned.
    my @args = @_;
    say "Make condor_script $args[0]{n}";
    my $scripttext = condor_script_data(@args);
    return write_condor_script($scripttext, @args);
}

sub premake_condor_outfiles
{
    # Touch the stdout, stderr and logs files of one job.
    #
    # Argument: job options hash ref; its dir entry is the base directory
    # and jobid() supplies the file name used in each subdirectory.
    my ($job) = @_;
    my $id = jobid($job);
    my @outfiles = map { "$job->{dir}/$_/$id" } ('stdout','stderr','logs');
    foreach my $outfile (@outfiles)
    {
        rob_misc::touch($outfile);
    }
}

sub write_condor_script
{
    # Write a condor script and its bash wrapper to disk.
    #
    # Arguments: the script text (always first), then the job options
    # hash ref, which jobid() turns into the file name.
    #
    # Files written below grid_options{condor_dir}:
    #   scripts/<jobid>       the script text, as given
    #   scripts/<jobid>.wrap  a bash wrapper that writes "running" to
    #                         status/<jobid>, runs its arguments, then
    #                         writes "finished" or "crashed <code>" and
    #                         exits with the same code
    #
    # Returns: the path of the script file.
    # Confesses if either file cannot be opened or closed.
    my $scriptdata=shift @_; # data is always first arg
    my $jobid = jobid($_[0]);
    my $scriptfile=$binary_grid::grid_options{condor_dir}.'/scripts/'.$jobid;

    # three-argument open so that odd characters in the path cannot be
    # mistaken for part of the open mode
    open(my $script_fh,'>',$scriptfile)||
        confess("can't open condor script $scriptfile for writing: $!");
    print {$script_fh} $scriptdata;
    # buffered write errors only show up at close
    close($script_fh)||
        confess("can't close condor script $scriptfile: $!");
    say "Script written to $scriptfile (size ",(-s $scriptfile),")";

    # make the wrapper script (separate handle: the original declared
    # a second "my $fp" in the same scope)
    my $wrapfile = $scriptfile.'.wrap';
    my $statusfile = $binary_grid::grid_options{condor_dir}.'/status/'.$jobid;
    open(my $wrap_fh,'>',$wrapfile)||
        confess("cannot open $wrapfile for writing: $!");
    say {$wrap_fh} "#!/bin/bash

echo \"running\" > $statusfile
\"\$\@\"

status=\$\(echo \$?\)
if \[\[ \$status -eq 0 \]\]; then
  echo \"finished\" > $statusfile
  exit 0
else
  echo \"crashed \$status\" > $statusfile
  exit \$status
fi
";
    close($wrap_fh)||
        confess("cannot close $wrapfile: $!");
    say "Wrapper written to $wrapfile\n";

    return $scriptfile;
}

sub set_condor_job_status
{
    # Write a job's status string to its status file.
    #
    # Arguments: job id string, status string.
    # The file is grid_options{condor_dir}/status/<jobid>; it is
    # overwritten with the status followed by a newline.
    #
    # Returns 1. Confesses if the file cannot be opened or closed.
    my ($jobid,$status) = @_;
    my $statusfile=$binary_grid::grid_options{condor_dir}.'/status/'.$jobid;

    # three-argument open: the path is never parsed as part of the mode
    open(my $fh,'>',$statusfile)||
        confess("cannot open status file $statusfile for writing: $!");
    say {$fh} $status;
    # buffered write errors only show up at close
    close($fh)||
        confess("cannot close status file $statusfile: $!");
    return 1;
}

sub get_condor_job_status
{
    # Read a job's status string from its status file.
    #
    # Argument: job options hash ref (jobid() gives the file name).
    # The file is grid_options{condor_dir}/status/<jobid>.
    #
    # Returns the first line of the file without its newline, or undef
    # when the file does not exist or is empty. Confesses if the file
    # exists but cannot be opened.
    my $opts = $_[0];
    my $jobid = jobid($opts);
    my $statusfile=$binary_grid::grid_options{condor_dir}.'/status/'.$jobid;

    # A job that has never been submitted has no status file yet. The
    # caller in this file treats an undefined status as "submit", so
    # report undef instead of dying.
    return undef if(!-e $statusfile);

    open(my $fh,'<',$statusfile)||
        confess("cannot open status file $statusfile for reading: $!");
    my $status = <$fh>;
    close $fh;
    chomp $status if(defined $status);
    return $status;
}


sub check_and_merge_condor_jobs
{
    # Merge the job data only once every job has finished.
    #
    # Arguments are passed unchanged to check_condor_jobs_done() and,
    # if that is true, to merge_condor_jobs(), whose result is returned.
    # Otherwise returns (undef,undef).
    return (undef,undef) if(!check_condor_jobs_done(@_));
    return (merge_condor_jobs(@_));
}

sub check_condor_jobs_done
{
    # Check whether every job has a status file saying it is finished.
    #
    # Arguments: $path (condor working directory), $njobs (job count).
    # For each n in 1..$njobs the first line of $path/status/n.$njobs
    # must match /finished/.
    #
    # Returns 1 when all jobs have finished. Returns 0 at the first job
    # that has not finished, or whose status file cannot be opened
    # (a warning is printed and logged through vb() in that case).
    my ($path,$njobs) = @_;

    foreach my $n (1..$njobs)
    {
        my $jobid = $n.'.'.$njobs;
        my $statusfile = "$path/status/$jobid";

        my $fh;
        if(!open($fh,'<',$statusfile))
        {
            print "WARNING : Probable error : could not find status file $statusfile - it should at least exist!\n";
            vb("WARNING : Probable error : could not find status file $statusfile - it should at least exist!\n");
            return 0;
        }

        my $status = <$fh>;
        close $fh;

        # an empty status file has no first line: treat as not finished
        if(!defined $status || $status !~ /finished/)
        {
            say "Job $jobid has not finished\n";
            return 0;
        }
        say "Job $jobid has finished\n";
    }

    say "All condor jobs have finished\n";

    # no error, all jobs are finished : return 1
    return 1;
}

sub merge_condor_jobs
{
    # Dump the results hash to $path/results/joined and merge job data.
    #
    # Arguments: $path (condor working directory), $njobs (job count).
    # Both are passed on, unchanged, to
    # binary_grid::merge_results_hash_dumps().
    #
    # Returns: (whatever merge_results_hash_dumps returns, joined filename).
    #
    # NOTE(review): the original built a list of the per-job result files
    # ($path/results/<n>.$njobs for n in 1..$njobs) but never used it;
    # that dead code has been removed. If the merge routine was meant to
    # receive that file list instead of ($path,$njobs), fix it there.
    # NOTE(review): the dump to "joined" happens before the merge call;
    # confirm this order is intended.
    my $path=$_[0];

    # save data to file
    my $joined="$path/results/joined";
    binary_grid::dump_results_hash($joined);

    # return data and filename
    return (binary_grid::merge_results_hash_dumps(@_),$joined);
}


sub condor_submit
{
    # Submit one condor script and mark the job as submitted.
    #
    # Arguments: job options hash ref, path of the script to submit.
    #
    # condor_submit is run through a list-form pipe open, so the script
    # path is passed as a single argument and never parsed by a shell.
    # Its standard output is captured and shown only on failure. As in
    # the original, the job status is set to 'submitted' regardless, but
    # a failed submission is now reported instead of silently ignored.
    #
    # Returns: the return value of set_condor_job_status().
    my ($opts,$scriptfile) = @_;
    say "condor_submit: $scriptfile (size ",-s $scriptfile,")\n";

    if(open(my $pipe,'-|','condor_submit',$scriptfile))
    {
        my @output = <$pipe>;
        # close on a pipe is false when the command exits non-zero
        if(!close($pipe))
        {
            print "WARNING : condor_submit $scriptfile failed (wait status $?)\n",@output;
        }
    }
    else
    {
        print "WARNING : could not run condor_submit for $scriptfile : $!\n";
    }

    return set_condor_job_status(jobid($opts),
                                 'submitted');
}

sub jobid
{
    # Build the job identifier "m.n" from a job options hash ref, where
    # m is its n entry (job number) and n is its njobs entry (number of
    # jobs in the group).
    my ($job) = @_;
    return join('.', $job->{n}, $job->{njobs});
}

sub condor_script_data
{
    # Return the text of the condor script for one job.
    #
    # Argument: job options hash ref with entries
    #   cmdline  command line to run (any condor_njobs=<digits> is removed)
    #   n, njobs job number and total number of jobs
    #   dir      condor working directory
    #   wd       initial directory for the job
    #   memory   value used for request_memory and ImageSize
    #   streams  true to stream stdout/stderr
    #
    # Also reads grid_options condor_universe, condor_requirements
    # (set to '1' here if undefined), condor_options (hash of extra
    # "key = value" lines), condor_request_memory and vb, plus PATH,
    # PERL5LIB and any *PERLBREW* variables from %ENV.
    my $job = $_[0];

    # the perl script
    my $script = $job->{cmdline};
    $script=~s/condor_njobs=\d+//g;
    $script=~s/\s+/ /g; $script=~s/\s+$//;

    # job id
    my $id = jobid($job);

    # set up environment
    my @environment;
    foreach my $var ('PATH','PERL5LIB',(grep {/PERLBREW/} keys %ENV))
    {
        push(@environment, $var.'='.$ENV{$var}) if(defined($ENV{$var}));
    }

    my $environment = join(' ',@environment);

    $binary_grid::grid_options{condor_requirements} //= '1';

    my $stream = $job->{streams} ? 'True' : 'False';

    my $s= "
  #################################################
  #                       
  # Condor script to run a binary_grid flexigrid
  #
  # script $job->{n} of $job->{njobs}
  #                   
  #################################################


  executable     = /usr/bin/env
#/usr/bin/cgexec 
#  arguments      = -g cpu:Background:izzard $^X 
  arguments      = PWD=$ENV{PWD} $^X $script run_flexigrid=1 offset=".($job->{n}-1)." modulo=$job->{njobs} rungrid=1 results_hash_dumpfile=$job->{dir}/results/$id condor_command=run_flexigrid condor_jobid=$id condor_njobs=$job->{njobs} condor_dir=$job->{dir} vb=$binary_grid::grid_options{vb}
  environment    = \"$environment\"
  universe       = $binary_grid::grid_options{condor_universe}
  output         = $job->{dir}/stdout/$id
  error          = $job->{dir}/stderr/$id
  log            = $job->{dir}/logs/$id
  initialdir     = $job->{wd}
  remote_initialdir     = $job->{wd}
  stream_output  = $stream
  stream_error   = $stream
+WantCheckpoint = False

  request_memory = $job->{memory}
  ImageSize = $job->{memory}

  Requirements = (1) && (".
  $binary_grid::grid_options{condor_requirements}.")\n";

    # add other user-defined options, except requirements which are
    # handled above. NB [Re]quirements matches both "Requirements" and
    # "requirements" (the latter through its substring "equirements").
    # Keys are sorted so the generated script is reproducible.
    foreach my $key (sort
                     grep {!/[Re]quirements/}
                     keys %{$binary_grid::grid_options{condor_options}})
    {
        $s .= '  '.$key.' = '.$binary_grid::grid_options{condor_options}{$key}."\n";
    }

    # optional condor parameters are given as condor_* in grid_options
    foreach my $opt (grep {defined($binary_grid::grid_options{'condor_'.$_})}
                     ('request_memory')
        )
    {
        # the original read 'condor_'.$_ here, but $_ is not the loop
        # variable in this body, so the wrong key was looked up
        $s .= "  $opt     = ".$binary_grid::grid_options{'condor_'.$opt}."\n";
    }

    $s.= "

    queue                         
";
    return $s;
}

sub datafiles_string_comma
{
    # Return the per-job result file names as one comma-separated string.
    # The names are <condor_dir>/results/<n>.<condor_njobs> for n from 1
    # to condor_njobs, both taken from grid_options.
    my $njobs = $binary_grid::grid_options{condor_njobs};
    my $prefix = $binary_grid::grid_options{condor_dir}.'/results/';
    return join(',', map { $prefix.$_.'.'.$njobs } 1..$njobs);
}

sub datafiles_string
{
    # Return the per-job result file names as one string, one name per
    # line (newline-separated, no trailing newline). The names are
    # <condor_dir>/results/<n>.<condor_njobs> for n from 1 to
    # condor_njobs, both taken from grid_options.
    my $njobs = $binary_grid::grid_options{condor_njobs};
    my $prefix = $binary_grid::grid_options{condor_dir}.'/results/';
    return join("\n", map { $prefix.$_.'.'.$njobs } 1..$njobs);
}

sub condor_rerun_command
{
    # Build the shell command that reruns the grid to join the datafiles.
    #
    # Starting from grid_options{command_line}, the per-job arguments are
    # removed and the join arguments and output redirections are added.
    #
    # Side effects, all below grid_options{condor_dir}:
    #   scripts/joinlist  list of datafiles to join (from datafiles_string())
    #   scripts/join      executable /bin/sh script containing the command
    #
    # Returns: the command string.
    # Confesses if either file cannot be opened.
    say "Parent: condor jobs are done, call grid again to join datafiles";
    my $cmd = $binary_grid::grid_options{command_line};
    my $dir = $binary_grid::grid_options{condor_dir};

    # remove unwanted arguments (first occurrence of each)
    $cmd=~s/condor_jobid\S+//;
    $cmd=~s/condor_njobs\S+//;
    $cmd=~s/rungrid=1//;
    $cmd=~s/run_flexigrid=1//;
    $cmd=~s/offset=\d+//;
    $cmd=~s/modulo=\d+//;
    $cmd=~s/results_hash_dumpfile=\S+//;
    $cmd=~s/condor_command=\S+//;

    my $list_filename = $dir.'/scripts/joinlist';
    {
        # output list of datafiles
        open(my $fp,'>',$list_filename) || confess("cannot open $list_filename to make list of datafiles that need joining");
        say {$fp} datafiles_string();
        close $fp;
    }

    my @parts = (
        $cmd,
        'rungrid=0',
        'merge_datafiles_filelist='.$list_filename,
        'condor_command=join_datafiles',
        '2>'.$dir.'/stderr/join',
        '>'.$dir.'/stdout/join',
        'condor_jobid=join',
        );

    # Optional extra argument. The original appended this straight onto
    # the joined string with no separating space, which fused it with
    # condor_jobid=join; as a list element it gets its own space.
    push(@parts,"results_hash_dumpfile=$dir/results/joined")
        if($binary_grid::grid_options{condor_save_joined_file});

    $cmd = join(' ',@parts);
    $cmd=~s/\s+/ /g;

    {
        # output command script for rerun
        my $joinscript = $dir.'/scripts/join';
        open(my $fp,'>',$joinscript) ||
            confess("cannot open joinscript $joinscript ");
        say {$fp} "\#!/bin/sh\n# Script to join binary_grid2 simulation \n$cmd\n";
        close $fp;
        chmod 0755, $joinscript;
        print "Join script which you should execute: \n$joinscript\n";
    }

    return $cmd;
}

sub condor_check_joining_file
{
    # Claim the right to join the data, using a "joining" file as a lock.
    #
    # The lock file is grid_options{condor_dir}/status/joining:
    #  - if it already exists, another process is joining: exit;
    #  - otherwise write our PID to it, sleep grid_options{nfs_sleep}
    #    seconds, then read it back: if the PID is not ours, exit.
    #
    # Returns: the result of condor_rerun_command() when this process
    # holds the lock. Confesses if the file cannot be opened or has
    # disappeared.
    my $joining_file=$binary_grid::grid_options{condor_dir}.'/status/joining';

    if(-f $joining_file)
    {
        # joining file exists : a different process is joining
        # (the closing parenthesis was missing from the original message)
        say 'Some other process (PID '.slurp($joining_file).') is joining : I shall not';
        exit;
    }

    # write our PID to the "joining" file; the original ignored open
    # failures and would then have printed to an unopened handle
    open(my $touch,'>',$joining_file)||
        confess("cannot open joining file $joining_file for writing: $!");
    print {$touch} $$;
    close($touch)||
        confess("cannot close joining file $joining_file: $!");

    sleep $binary_grid::grid_options{nfs_sleep}; # wait for the touch to take effect (prevent NFS issues)

    # check joining file has our PID
    if(-f $joining_file)
    {
        # joining file exists : is it ours?
        open(my $fp,'<',$joining_file)||
            confess("cannot open joining file $joining_file for reading: $!");
        my $pid=<$fp>;
        close $fp;
        chomp $pid if(defined $pid);

        # an empty file cannot be ours either
        if(!defined $pid || $pid!=$$)
        {
            say "Joining file belongs to a different process : I am doing nothing\n";
            exit;
        }
    }
    else
    {
        confess("joining file has gone missing!\n");
    }
    return condor_rerun_command();
}


sub condor_job_hook
{
    # Hook called after a condor-flexigrid job has finished.
    #
    # Marks the job (grid_options{condor_jobid}) as finished, then forks:
    #  - the child returns to the caller and carries on as usual;
    #  - the parent drops its copy of the results hash, waits for the
    #    child, and if all jobs are done either runs the join command or,
    #    when grid_options{condor_postpone_join} is set, prints it and
    #    saves it to scripts/joincmd. The parent always exits and never
    #    returns to the caller.
    #
    # If fork fails the hook reports it and returns, so the process
    # carries on like the child would; no join check is made.

    # find job id
    my $jobid=$binary_grid::grid_options{condor_jobid};

    # set job status to finished
    set_condor_job_status($jobid,
                          'finished');

    # fork child process
    my $child_pid=fork();

    if(!defined $child_pid)
    {
        # The original compared undef with 0 and so silently took the
        # child branch; same path here, but the failure is reported.
        say "condor_check : fork failed ($!) : continuing without checking whether to join";
        return;
    }

    if($child_pid!=0)
    {
        # parent process
        say "condor_check : Parent process $$";

        # free memory (no longer required, just annoying waste of space)
        $binary_grid::grid_options{results_hash}=undef;

        # wait for the child to finish
        say "Parent: waitpid for child : $child_pid";
        my $r=waitpid $child_pid,0;

        say "Parent : waitpid returned $r\nParent: check_condor_jobs_done";

        if(check_condor_jobs_done($binary_grid::grid_options{condor_dir},
                                  $binary_grid::grid_options{condor_njobs}))
        {
            # check if the joining file exists, if yes: exits (from condor.pm)
            # if no, returns the command to rerun the grid to join all the data
            my $cmd=binary_grid::condor::condor_check_joining_file();

            # if condor_postpone_join is 1 then do not do the join
            # just output the command which would do so, this is so
            # you don't overload the RAM of the condor machines
            if($binary_grid::grid_options{condor_postpone_join})
            {
                say "Join postponed : later join the condor data sets with the following command:\n############################################################\n\n$cmd\n\n##################################################\n";

                # and save the command in the file scripts/joincmd
                my $f= $binary_grid::grid_options{condor_dir}.'/scripts/joincmd';
                open(my $fp,'>',$f)||confess("cannot open $f for writing: $!");
                say {$fp} $cmd;
                close $fp;
                chmod 0700, $f;
            }
            else
            {
                # join in appropriate directory
                my $pwd = $binary_grid::grid_options{condor_join_pwd} // $ENV{PWD};
                $cmd = "cd $pwd \&\& $cmd";

                # join on another machine if required
                if(defined $binary_grid::grid_options{condor_join_machine})
                {
                    $cmd = "ssh $binary_grid::grid_options{condor_join_machine} \"$cmd\"";
                }

                say "Join condor data sets with exec of:\n\n############################################################\n\n$cmd\n\n\n";

                # must go through a shell: the command contains cd, &&
                # and output redirections
                `$cmd`;
            }
        }
        else
        {
            say 'Parent : condor jobs not finished';
        }
        say 'Parent: exit';

        # exit : do not even return!
        exit;
    }
    else
    {
        # child process: just return and finish off as per usual
        say "condor_check : Child process $$ continuing until output() and end.";
        return;
    }
}

############################################################
############################################################

sub grid_interrupted
{
    # Handler for an interrupting signal. Always returns 0.
    #
    #  - grid already finished (flexigrid_finished): only report it;
    #  - condor_snapshot_on_kill set: name the snapshot file
    #    <condor_dir>/results/<condor_jobid>.sv, request save_snapshot,
    #    then request quit;
    #  - otherwise: request quit only.
    # Requests are made by setting flags in %binary_grid::flexigrid.
    my $opts = \%binary_grid::grid_options;
    my ($jobid,$path) = @{$opts}{'condor_jobid','condor_dir'};

    print "grid_interrupted(): Caught SIGKILL at ",scalar localtime(),"\n";

    if($opts->{flexigrid_finished})
    {
        print "Grid has finished : wait for it rather than attempt a snapshot\n";
        return 0;
    }

    if($opts->{condor_snapshot_on_kill})
    {
        # snapshot the grid before quitting
        my $snapshot = "$path/results/$jobid.sv";
        $opts->{snapshot_file} = $snapshot;
        print "Sending save_snapshot action to $snapshot at ",scalar localtime(),"\n";
        $binary_grid::flexigrid{actions}{save_snapshot}=1;
    }

    # in both remaining cases, try to shut down gracefully
    print "Sending quit action at ",scalar localtime(),"\n";
    $binary_grid::flexigrid{actions}{quit}=1;

    return 0;
}

sub check_for_saved_snapshot
{
    # Look for a saved snapshot of this job.
    #
    # Returns undef unless grid_options{condor_load_from_snapshot} is
    # set. Otherwise returns the name of the first existing, non-empty
    # file among <condor_dir>/results/<condor_jobid>.sv (restart file,
    # checked first as it is likely to be newer) and the same name with
    # .ck (checkpoint), or undef if neither qualifies.
    my $opts = \%binary_grid::grid_options;

    if($opts->{condor_load_from_snapshot})
    {
        my $jobid = $opts->{condor_jobid};
        my $path = $opts->{condor_dir};

        foreach my $suffix ('sv','ck')
        {
            # NB can be a symlink
            my $candidate = "$path/results/$jobid.$suffix";
            return $candidate if(-s $candidate && -e $candidate);
        }
    }

    return undef;
}

sub pre_load_snapshot
{
    # Return the name of the snapshot file to be loaded.
    #
    # Returns undef unless grid_options{condor_load_from_snapshot} is
    # set. Otherwise the candidates are
    # <condor_dir>/results/<condor_jobid>.sv (restart file, preferred
    # because it is likely to be newer) and the same name with .ck
    # (checkpoint); the first one that exists and is non-empty is
    # returned. If neither exists the .sv name is returned, as before.
    #
    # The original loop returned unconditionally on its first iteration,
    # so the .ck candidate could never be chosen even when it was the
    # only file present.
    return undef if(!$binary_grid::grid_options{condor_load_from_snapshot});

    my $jobid = $binary_grid::grid_options{condor_jobid};
    my $path = $binary_grid::grid_options{condor_dir};

    my @candidates = ("$path/results/$jobid.sv",
                      "$path/results/$jobid.ck");

    # NB candidates can be symlinks; -s follows them
    my ($file) = grep { -s $_ } @candidates;

    # NOTE(review): falling back to the .sv name keeps the old return
    # value when nothing exists - confirm callers expect a name here
    $file //= $candidates[0];

    print "Return snapshot filename = $file at ",scalar localtime(),"\n";
    return $file;
}

sub post_load_snapshot
{
    # Hook run after a snapshot has been loaded.
    #
    # Argument: the name of the file that was loaded (nothing is done
    # if it is undefined).
    #  - *.sv (restart file): the file is unlinked;
    #  - *.ck (checkpoint): the file is kept and the next checkpoint
    #    time is pushed back with increment_checkpoint_time();
    #  - anything else: the file is kept and a message is printed.
    my $file = $_[0];
    return if(!defined $file);

    if($file =~/\.sv$/)
    {
        print "Snapshot loaded : unlink $file at ",scalar localtime(),"\n";
        unlink $file;
    }
    # the dot is escaped: the original /.ck$/ also matched names that
    # merely end in "ck" after any character
    elsif($file =~/\.ck$/)
    {
        print "Checkpoint loaded : $file remains\n";

        # don't allow an immediate checkpoint
        increment_checkpoint_time();
    }
    else
    {
        print "Snapshot loaded : $file not unlinked because it is a checkpoint\n";
    }
}

sub checkpoint
{
    # Request a timed checkpoint (snapshot) of the grid when one is due.
    #
    # Does nothing unless grid_options{condor_checkpoint_interval} is
    # set. On the first call only the next checkpoint time is set. On
    # later calls, once that time has passed:
    #  - the snapshot file name is stored in grid_options{snapshot_file}:
    #    <condor_dir>/results/<condor_jobid>.ck, with a
    #    -DDMMYYYY-HHMM.SS suffix when condor_checkpoint_stamp_times is set;
    #  - the save_snapshot and relaunch_threads actions are flagged in
    #    %binary_grid::flexigrid;
    #  - the next checkpoint time is advanced;
    #  - with time stamps, a symlink <jobid>.ck is pointed at the new
    #    file for easy restart.
    return if(!$binary_grid::grid_options{condor_checkpoint_interval});

    # first time : set next checkpoint time
    if(! defined $binary_grid::grid_options{condor_next_checkpoint})
    {
        increment_checkpoint_time();
        print "Set first checkpoint time at $binary_grid::grid_options{condor_next_checkpoint}\n";
    }

    # other times, check if we should save a checkpoint
    elsif(time() > $binary_grid::grid_options{condor_next_checkpoint})
    {
        my $jobid = $binary_grid::grid_options{condor_jobid};
        my $path = $binary_grid::grid_options{condor_dir};
        my $stamped = $binary_grid::grid_options{condor_checkpoint_stamp_times};

        # timed checkpoint file
        my @time = localtime();

        # decide file name : if condor_checkpoint_stamp_times is set
        # then they are timestamped, otherwise just use $jobid.ck
        # (localtime months run 0..11, hence the +1 : the original
        # stamped January as 00)
        my $f = "$path/results/$jobid.ck" .
            ($stamped ?
             sprintf('-%02d%02d%04d-%02d%02d.%02d',
                     $time[3],$time[4]+1,$time[5]+1900,
                     $time[2],$time[1],$time[0]) : '');

        $binary_grid::grid_options{snapshot_file} = $f;
        print "checkpoint(): Sending save_snapshot action to $f at ",scalar localtime(),"\n";

        # do the save and relaunch threads
        $binary_grid::flexigrid{actions}{save_snapshot}=1;
        $binary_grid::flexigrid{actions}{relaunch_threads}=1;

        # update next time
        increment_checkpoint_time();
        print "Set next checkpoint time at $binary_grid::grid_options{condor_next_checkpoint}\n";

        # make symlink for easy restart if required
        if($stamped)
        {
            my $sym = "$path/results/$jobid.ck";

            # -l tests the link itself. The original used -f and -s,
            # which follow symlinks: a link to an existing checkpoint
            # looked like a plain file and was never refreshed, and a
            # dangling link was never removed, so symlink() failed.
            if(-e $sym && !-l $sym)
            {
                say "Warning : cannot make symlink at $sym because it is an existing file";
            }
            else
            {
                unlink $sym if(-l $sym); # remove existing symlink
                symlink $f, $sym; # make new one
            }
        }
    }
}

sub increment_checkpoint_time
{
    # Schedule the next checkpoint: grid_options{condor_next_checkpoint}
    # becomes the current time plus
    # grid_options{condor_checkpoint_interval} (seconds).
    my $interval = $binary_grid::grid_options{condor_checkpoint_interval};
    $binary_grid::grid_options{condor_next_checkpoint} = time() + $interval;
}

sub output_allowed
{
    # Say whether this process may produce output: returns 1 or 0.
    #
    # Without condor (grid_options{condor} false) output is always
    # allowed. With condor it is allowed only when
    # grid_options{condor_command} is 'join_datafiles' or 'join'.
    return 1 if(!$binary_grid::grid_options{condor});

    my $command = $binary_grid::grid_options{condor_command};
    my $joining = ($command eq 'join_datafiles' || $command eq 'join');
    return $joining ? 1 : 0;
}




1;

__END__
    
