amp_stats.py

#!/usr/bin/env python3

"""

amp_stats.py
------------

Post-MD Analysis with ASE & AMP

Help & Usage: python amp_stats.py -h

- Max Winokan

[ Part of GromacsOnEUREKA 
  https://gitlab.eps.surrey.ac.uk/mw00368/GromacsOnEUREKA ]

"""

import argparse

import mout # https://github.com/mwinokan/MPyTools
import mcol # https://github.com/mwinokan/MPyTools
import mplot # https://github.com/mwinokan/MPyTools

import asemolplot as amp # https://github.com/mwinokan/AseMolPlot

from ase.io.trajectory import Trajectory

import numpy as np

##########################################################################

argparser = argparse.ArgumentParser(description='Various methods for analysing GROMACS output with ASE & AMP')

argparser.add_argument("-v","--verbosity",metavar="LEVEL",type=int,default=2,help="Verbosity. Default = 2")

argparser.add_argument("-convert", type=str,metavar='PDB',help="Convert a configuration/trajectory (.pdb/.gro from gmx trjconv) to an ASE Trajectory (.traj)")
argparser.add_argument("-dhb","--dna-hydrogen-bonds",type=str,metavar="PDB",help="Plot the hydrogen bond lengths between DNA chains")
argparser.add_argument("-rcom","--residue-centre-of-mass",type=str,metavar="PDB",help="Get, set, and shift residue CoM's")

argparser.add_argument("-ts","--time-step",type=float,metavar='STEP',help="Time step in picoseconds. Mostly for graph axes.")

argparser.add_argument("-o","--output",help="Output keyword")

argparser.add_argument("-ps","--print-script", type=mout.str2bool,nargs='?',const=True,default=False,help="Print the script name in console output.")

argparser.add_argument("-rpf","--respair-file",type=str,metavar="RPF",help="Specify a file containing pairs of hydrogen bonded residues.")

argparser.add_argument("-g","--gui",help="Open an ASE GUI.",action='store_true')
argparser.add_argument("-s","--show",help="Show plots.",action='store_true')
argparser.add_argument("-pa","--plot-alternate",help="Plot alternate h-bond. [-dhb]",action='store_true')
argparser.add_argument("-pt","--plot-total",help="Plot total h-bond. [-dhb]",action='store_true')
argparser.add_argument("-on","--optimise-naughty",help="Intelligently swap bonding with H2 groups. [-dhb]",action='store_true')
# argparser.add_argument("-sh41","--swap-H41",help="Swap H41 and H42",action='store_true')
argparser.add_argument("-nt","--no-tag",help="Don't attempt to tag atoms",action='store_true')
argparser.add_argument("-set",metavar="DAT",type=str,help="Set the CoM's of two residues to values in file.")
argparser.add_argument("-shift",type=float,help="Shift the CoM's of two residues apart by the given value.")
argparser.add_argument("-fmin","--fit-min",type=float,help="Begin all fitting at. [-dhb]")
argparser.add_argument("-fmax","--fit-max",type=float,help="End all fitting at. [-dhb]")

args = argparser.parse_args()

##########################################################################

printScript = args.print_script
verbosity = args.verbosity

#### Convert PDB's to ASE Trajectories

if args.convert is not None:

  # get the input filename
  infile = args.convert

  if args.output is None:
    out_prefix = "amp_out"
    mout.warningOut("Defaulted to output keyword 'amp_out'.",printScript=printScript,code=3)
  else:
    out_prefix = args.output

  tagging = not args.no_tag

  if infile.endswith(".pdb"):
    amp.pdb2traj(infile,out_prefix+".traj",verbosity=2,tagging=tagging,printScript=printScript)
  elif infile.endswith(".gro"):
    amp.gro2traj(infile,out_prefix+".traj",verbosity=2,tagging=tagging,printScript=printScript)
  elif infile.endswith(".xyz"):
    amp.xyz2traj(infile,out_prefix+".traj",verbosity=2,printScript=printScript)
  else:
    mout.errorOut("Unrecognised file type: "+mcol.file+infile,fatal=True)

  if args.gui:
    import os
    os.system("ase gui "+out_prefix+".traj")

  exit()

#### DNA Hydrogen Bonds:

if args.dna_hydrogen_bonds is not None:

  if args.optimise_naughty is None: args.optimise_naughty = False

  ##### <ARGUMENTS> ##### 
  print_non_match = False
  print_match = False
  print_bond_candidates = False
  show_plots = args.show
  plot_alternate = args.plot_alternate
  plot_total = args.plot_total
  optimise_naughty = args.optimise_naughty
  fit_min = args.fit_min
  fit_max = args.fit_max
  fitOrder=1
  tag_warning = False
  index_adj = -1
  ##### </ARGUMENTS> ##### 

  # processes CLI arguments

  infile = args.dna_hydrogen_bonds

  if args.output is not None:
    datfile = args.output+".dat"
  else:
    mout.warningOut("Defaulted to datafile 'dhb.dat'.",printScript=printScript)
    datfile = "dhb.dat"

  output_dat = open(datfile,'w')
  output_dat.write('# name, value, (error), unit \n')
  output_dat.close()
  output_dat = open(datfile,'a')

  if infile.endswith(".pdb"):
    file_is_gro = False
  elif infile.endswith(".gro"):
    file_is_gro = True
    if args.respair_file is None:
      mout.errorOut("RPF required for .gro files, specify with "+mcol.arg+"-rpf <FILE>",fatal=True)
  else:
    mout.errorOut("Unsupported file type.",fatal=True)

  if args.output is None:
    out_prefix = "amp_out"
    mout.warningOut("Defaulted to output keyword 'amp_out'.",printScript=printScript,code=3)
  else:
    out_prefix = args.output

  mout.headerOut("Looking for DNA hydrogen-bonds in"+mcol.file+" "+infile)

  searching = True

  # initialise lists:
  dt_chainlist=[]
  dt_reslist=[]
  dt_O4_indexlist=[]
  dt_H3_indexlist=[]
  dt_N3_indexlist=[] # alternate

  da_chainlist=[]
  da_reslist=[]
  da_H61_indexlist=[]
  da_H62_indexlist=[] # naughty
  da_N1_indexlist=[]
  da_N6_indexlist=[] # alternate

  dc_chainlist=[]
  dc_reslist=[]
  dc_H41_indexlist=[]
  dc_H42_indexlist=[] # naughty
  dc_N3_indexlist=[]
  dc_O2_indexlist=[]
  dc_N4_indexlist=[] # alternate

  dg_chainlist=[]
  dg_reslist=[]
  dg_O6_indexlist=[]
  dg_H1_indexlist=[]
  dg_H21_indexlist=[]
  dg_H22_indexlist=[] # naughty
  dg_N1_indexlist=[] # alternate
  dg_N2_indexlist=[] # alternate

  # tautomers
  dt_is_tautomer=[] # tautomer
  da_is_tautomer=[] # tautomer

  taglist=[]

  """

  Naughty list:

  * DA/ADE9 H61 & H62
  * DC/CTSN H41 & H42
  * DG/GUA9 H21 & H22

  """
  
  if not file_is_gro:

    # Parse the PDB to get the properties of relevant atoms
    with open(infile,"r") as input_pdb:
      for line in input_pdb:
        if searching:
          if line.startswith("MODEL"):
            searching = False
        else:
          if line.startswith("ENDMDL"):
            break
          if line.startswith("TER"):
            continue
          else:
            stripped_line = line.strip()
            split_line = stripped_line.split()
            if len(split_line) == 10:
              # res_string = split_line[3][:-1]+split_line[4]
              res_string = line[17:22].strip()+split_line[4]
              chain_string = split_line[4]
            elif len(split_line) == 11:
              # res_string = split_line[3][:-1]+split_line[4]
              res_string = line[17:22].strip()+split_line[4]
              chain_string = split_line[3][-1]
            elif len(split_line) == 12:
              res_string = split_line[3]+split_line[5]
              chain_string = split_line[4]
            else:
              mout.errorOut("Weird PDB! # Data Columns = "+str(len(split_line)),fatal=True)
            atom_index = int(split_line[1])
            if ("ADTH" in res_string or "THTH" in res_string) and args.respair_file is None:
              mout.errorOut("Use a respair-file with tautomers!",fatal=True)
            if atom_index == 0:
              mout.warningOut("Atom indices don't start with 1!")
              index_adj=0
            if " DT" in stripped_line or "THMN" in stripped_line or "TCHG" in stripped_line or "TMUL" in stripped_line or "THCH" in stripped_line or "THTH" in stripped_line:
              if " O4 " in stripped_line:
                dt_reslist.append(res_string)
                dt_chainlist.append(chain_string)
                dt_O4_indexlist.append(atom_index)
                if res_string.startswith("THTH"):
                  dt_is_tautomer.append(True)
                else:
                  dt_is_tautomer.append(False)
                if print_match: print(mcol.bold+stripped_line+mcol.clear)
              elif " H3 " in stripped_line:
                dt_H3_indexlist.append(atom_index)
                if print_match: print(mcol.bold+stripped_line+mcol.clear)
              elif " N3 " in stripped_line:
                dt_N3_indexlist.append(atom_index)
                if print_match: print(mcol.bold+stripped_line+mcol.clear)
              elif print_non_match:
                print(stripped_line)
            elif "DA" in stripped_line or "ADE9" in stripped_line or "ACHG" in stripped_line or "AMUL" in stripped_line or "ADCH" in stripped_line or "ADTH" in stripped_line:
              if " H61 " in stripped_line:
                da_H61_indexlist.append(atom_index)
                if print_match: print(mcol.error+stripped_line+mcol.clear)
              elif " H62 " in stripped_line:
                da_H62_indexlist.append(atom_index)
                if print_match: print(mcol.error+stripped_line+mcol.clear)                
              elif " N1 " in stripped_line:
                da_reslist.append(res_string)
                if res_string.startswith("ADTH"):
                  da_is_tautomer.append(True)
                else:
                  da_is_tautomer.append(False)
                da_chainlist.append(chain_string)
                da_N1_indexlist.append(atom_index)
                if print_match: print(mcol.error+stripped_line+mcol.clear)
              elif " N6 " in stripped_line:
                da_N6_indexlist.append(atom_index)
                if print_match: print(mcol.error+stripped_line+mcol.clear)
              elif print_non_match:
                print(stripped_line)
            elif "DC" in stripped_line or "CTSN" in stripped_line:
              if " H41 " in stripped_line:
                dc_reslist.append(res_string)
                dc_chainlist.append(chain_string)
                dc_H41_indexlist.append(atom_index)
                if print_match: print(mcol.success+stripped_line+mcol.clear)
              elif " H42 " in stripped_line:
                dc_H42_indexlist.append(atom_index)
                if print_match: print(mcol.success+stripped_line+mcol.clear)
              elif " N3 " in stripped_line:
                dc_N3_indexlist.append(atom_index)
                if print_match: print(mcol.success+stripped_line+mcol.clear)
              elif " O2 " in stripped_line:
                dc_O2_indexlist.append(atom_index)
                if print_match: print(mcol.success+stripped_line+mcol.clear)
              elif " N4 " in stripped_line:
                dc_N4_indexlist.append(atom_index)
                if print_match: print(mcol.success+stripped_line+mcol.clear)
              elif print_non_match:
                print(stripped_line)
            elif "DG" in stripped_line or "GUA9" in stripped_line:
              if " O6 " in stripped_line:
                dg_reslist.append(res_string)
                dg_chainlist.append(chain_string)
                dg_O6_indexlist.append(atom_index)
                if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
              elif " H1 " in stripped_line:
                dg_H1_indexlist.append(atom_index)
                if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
              elif " H21 " in stripped_line:
                dg_H21_indexlist.append(atom_index)
                if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
              elif " H22 " in stripped_line:
                dg_H22_indexlist.append(atom_index)
                if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
              elif " N1 " in stripped_line:
                dg_N1_indexlist.append(atom_index)
                if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
              elif " N2 " in stripped_line:
                dg_N2_indexlist.append(atom_index)
                if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
              elif print_non_match:
                print(stripped_line)
            if args.no_tag is not None:
              tag_string = split_line[2].replace("'","")
              try:
                tag = int(''.join(filter(lambda i: i.isdigit(), tag_string)))
              except:
                if tag_warning: mout.warningOut("Untaggable atom '"+tag_string+"'")
                tag = 0  
              taglist.append(tag)

  else:

    # There is no chain information in a .gro so an RPF is also needed!

    line_number = 0
    max_line_number = 1000

    # Parse the GRO to get the properties of relevant atoms
    with open(infile,"r") as input_gro:
      for line in input_gro:
        if line_number == 1:
          max_line_number = int(line.strip().split()[0]) + 2
        if line_number == max_line_number:
          break
        if line_number > 1:
          stripped_line = line.strip()
          split_line = stripped_line.split()
          res_string = line[5:9].strip()+line[0:5].strip()
          atom_index = int(split_line[2])
          chain_string = "?"
          tag_string = split_line[1].replace("'","")
          if args.no_tag is None:
            atom_tag = int(''.join(filter(lambda i: i.isdigit(),tag_string)))
            taglist.append(atom_tag)
          
          if "DT" in stripped_line or "THMN" in stripped_line:
            if " O4 " in stripped_line:
              dt_reslist.append(res_string)
              dt_chainlist.append(chain_string)
              dt_O4_indexlist.append(atom_index)
              if print_match: print(mcol.bold+stripped_line+mcol.clear)
            elif " H3 " in stripped_line:
              dt_H3_indexlist.append(atom_index)
              if print_match: print(mcol.bold+stripped_line+mcol.clear)
            elif " N3 " in stripped_line:
              dt_N3_indexlist.append(atom_index)
              if print_match: print(mcol.bold+stripped_line+mcol.clear)
            elif print_non_match:
              print(stripped_line)
          elif "DA" in stripped_line or "ADE9" in stripped_line:
            if " H61 " in stripped_line:
              da_reslist.append(res_string)
              da_chainlist.append(chain_string)
              da_H61_indexlist.append(atom_index)
              if print_match: print(mcol.error+stripped_line+mcol.clear)
            elif " H62 " in stripped_line:
              da_H62_indexlist.append(atom_index)
              if print_match: print(mcol.error+stripped_line+mcol.clear)
            elif " N1 " in stripped_line:
              da_N1_indexlist.append(atom_index)
              if print_match: print(mcol.error+stripped_line+mcol.clear)
            elif " N6 " in stripped_line:
              da_N6_indexlist.append(atom_index)
              if print_match: print(mcol.error+stripped_line+mcol.clear)
            elif print_non_match:
              print(stripped_line)
          elif "DC" in stripped_line or "CTSN" in stripped_line:
            if " H41 " in stripped_line:
              dc_reslist.append(res_string)
              dc_chainlist.append(chain_string)
              dc_H41_indexlist.append(atom_index)
              if print_match: print(mcol.success+stripped_line+mcol.clear)
            elif " H42 " in stripped_line:
              dc_H42_indexlist.append(atom_index)
              if print_match: print(mcol.success+stripped_line+mcol.clear)
            elif " N3 " in stripped_line:
              dc_N3_indexlist.append(atom_index)
              if print_match: print(mcol.success+stripped_line+mcol.clear)
            elif " O2 " in stripped_line:
              dc_O2_indexlist.append(atom_index)
              if print_match: print(mcol.success+stripped_line+mcol.clear)
            elif " N4 " in stripped_line:
              dc_N4_indexlist.append(atom_index)
              if print_match: print(mcol.success+stripped_line+mcol.clear)
            elif print_non_match:
              print(stripped_line)
          elif "DG" in stripped_line or "GUA9" in stripped_line:
            if " O6 " in stripped_line:
              dg_reslist.append(res_string)
              dg_chainlist.append(chain_string)
              dg_O6_indexlist.append(atom_index)
              if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
            elif " H1 " in stripped_line:
              dg_H1_indexlist.append(atom_index)
              if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear+" "+str(atom_tag)+" "+str(atom_index))
            elif " H21 " in stripped_line:
              dg_H21_indexlist.append(atom_index)
              if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
            elif " H22 " in stripped_line:
              dg_H22_indexlist.append(atom_index)
              if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
            elif " N1 " in stripped_line:
              dg_N1_indexlist.append(atom_index)
              if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
            elif " N2 " in stripped_line:
              dg_N2_indexlist.append(atom_index)
              if print_match: print(mcol.file+mcol.bold+stripped_line+mcol.clear)
            elif print_non_match:
              print(stripped_line)

        line_number = line_number + 1

  # print DNA residue count
  mout.out("Found "+mcol.result+str(len(da_chainlist))+mcol.arg+" DA Residues")
  mout.out("Found "+mcol.result+str(len(dt_chainlist))+mcol.arg+" DT Residues")
  mout.out("Found "+mcol.result+str(len(dg_chainlist))+mcol.arg+" DG Residues")
  mout.out("Found "+mcol.result+str(len(dc_chainlist))+mcol.arg+" DC Residues")

  no_AT_pairs=False
  no_GC_pairs=False

  # Do some checks:
  if len(da_chainlist) + len(dt_chainlist) + len(dg_chainlist) + len(dc_chainlist) < 2:
    mout.errorOut("Less than two bases. Maybe there are no models defined in the PDB?",fatal=True)
  if len(da_chainlist) == 0 ^ len(dt_chainlist) == 0:
    mout.warningOut("Not enough bases for AT pairs")
    no_AT_pairs=True
  if len(dg_chainlist) == 0 ^ len(dc_chainlist) == 0:
    mout.warningOut("Not enough bases for GC pairs")
    no_GC_pairs=True
  if no_AT_pairs and no_GC_pairs:
    mout.errorOut("No matching base pairs",fatal=True)

  # read in the first image of the pdb
  mout.out("Reading timestep into Atoms object...")
  atoms = amp.read(infile,index="0",printScript=False,verbosity=verbosity-1)

  if args.no_tag is not None:
    # set the tags
    for index,tag in enumerate(taglist):
      if atoms[index].tag == 0:
        atoms[index].tag = tag
      elif atoms[index].tag != tag:
        atoms[index].tag = tag
        if tag_warning: mout.warningOut("Existing tag overwritten, index="+str(index)+" old_tag="+str(atoms[index].tag)+" new_tag="+str(tag))

  if args.gui:
    amp.view(atoms)

  if args.respair_file is not None:
    # Use respairs defined in a file instead of searching.

    # Open the file
    respair_file_read = open(args.respair_file,'r').read()

    ### A-T ###

    # initialise lists:
    dt_O4_da_H61_respairs=[]
    dt_H3_da_N1_respairs=[]
    dt_O4_da_H61_bondpairs=[] # normal
    dt_H3_da_N1_bondpairs=[] # normal
    da_N6_da_H61_bondpairs=[] # alternate
    dt_H3_dt_N3_bondpairs=[] # alternate
    dt_O4_da_H62_bondpairs=[] # naughty
    dt_O4_da_N6_bondpairs=[] # total
    dt_N3_da_N1_bondpairs=[] # total

    # Loop over all the DT residues
    for dt_index,dt_residue in enumerate(dt_reslist):

      # Find the number of definitions in the RPF
      num_defs = respair_file_read.count(dt_residue+" ")

      # Correct number of definitions:
      if num_defs == 1:

        mout.out("RPF definition found for "+mcol.varName+dt_residue)

        # Loop over the RPF again
        respair_file = open(args.respair_file)
        for num,line in enumerate(respair_file,1):

          # Find the correct line in the RPF:
          if dt_residue+" " in line:

            # Get the the other residue
            residues = line.split()
            for index,residue in enumerate(residues):
              if residue != dt_residue:
                paired_res = residue
                dt_O4_da_H61_respairs.append([dt_residue,residue])
                dt_H3_da_N1_respairs.append([dt_residue,residue])
                if not residue.startswith("DA") and not residue.startswith("ADE9") and not residue.startswith("ADTH"):
                  mout.errorOut("Bad RPF definition in "+mcol.file+args.respair_file+mcol.error+". "+mcol.varName+dt_residue+mcol.error+" cannot be paired with "+mcol.varName+residue,fatal=True)

        respair_file.close()

        # Get the index of the bonded DA residue
        for da_index,da_residue in enumerate(da_reslist):
          if paired_res == da_residue:
            if not dt_is_tautomer[dt_index]: 
              dt_O4_da_H61_bondpairs.append([dt_O4_indexlist[dt_index]+index_adj,da_H61_indexlist[da_index]+index_adj])
              dt_O4_da_H62_bondpairs.append([dt_O4_indexlist[dt_index]+index_adj,da_H62_indexlist[da_index]+index_adj])
              dt_H3_da_N1_bondpairs.append([dt_H3_indexlist[dt_index]+index_adj,da_N1_indexlist[da_index]+index_adj])
              da_N6_da_H61_bondpairs.append([da_N6_indexlist[da_index]+index_adj,da_H61_indexlist[da_index]+index_adj]) # alternate
              dt_H3_dt_N3_bondpairs.append([dt_H3_indexlist[dt_index]+index_adj,dt_N3_indexlist[dt_index]+index_adj]) # alternate
            dt_O4_da_N6_bondpairs.append([dt_O4_indexlist[dt_index]+index_adj,da_N6_indexlist[da_index]+index_adj]) # total
            dt_N3_da_N1_bondpairs.append([dt_N3_indexlist[dt_index]+index_adj,da_N1_indexlist[da_index]+index_adj]) # total

      elif num_defs > 1:
        mout.errorOut(str(num_defs)+" respair definitions for "+mcol.varName+dt_residue+mcol.error+" in "+mcol.file+args.respair_file,fatal=True)
      elif num_defs < 1:
        mout.errorOut("No respair definition for "+mcol.varName+dt_residue+mcol.error+" in "+mcol.file+args.respair_file,fatal=True)

    ### G-C ###

    # initialise lists:
    dc_H41_dg_O6_respairs=[]
    dc_N3_dg_H1_respairs=[]
    dc_O2_dg_H21_respairs=[]
    dc_H41_dg_O6_bondpairs=[] # normal
    dc_N3_dg_H1_bondpairs=[] # normal
    dc_O2_dg_H21_bondpairs=[] # normal
    dc_H42_dg_O6_bondpairs=[] # naughty
    dc_O2_dg_H22_bondpairs=[] # naughty
    dc_H41_dc_N4_bondpairs=[] # alternate
    dg_N1_dg_H1_bondpairs=[] # alternate
    dg_N2_dg_H21_bondpairs=[] # alternate
    dc_N4_dg_O6_bondpairs=[] # total
    dc_N3_dg_N1_bondpairs=[] # total
    dc_O2_dg_N2_bondpairs=[] # total

    # Loop over all the DC residues
    for dc_index,dc_residue in enumerate(dc_reslist):

      # Find the number of definitions in the RPF
      num_defs = respair_file_read.count(dc_residue+" ")

      # Correct number of definitions:
      if num_defs == 1:

        mout.out("RPF definition found for "+mcol.varName+dc_residue)

        # Loop over the RPF again
        respair_file = open(args.respair_file)
        for num,line in enumerate(respair_file,1):

          # Find the correct line in the RPF:
          if dc_residue+" " in line:

            # Get the the other residue
            residues = line.split()
            for residue in residues:
              if residue != dc_residue:
                paired_res = residue
                dc_H41_dg_O6_respairs.append([dc_residue,residue])
                dc_N3_dg_H1_respairs.append([dc_residue,residue])
                dc_O2_dg_H21_respairs.append([dc_residue,residue])
                if not residue.startswith("DG") and not residue.startswith("GUA9"):
                  mout.errorOut("Bad RPF definition in "+mcol.file+args.respair_file+mcol.error+". "+mcol.varName+dt_residue+mcol.error+" cannot be paired with "+mcol.varName+residue,fatal=True)

        respair_file.close()

        # Get the index of the bonded DG residue
        for dg_index,dg_residue in enumerate(dg_reslist):
          if paired_res == dg_residue:
            dc_H41_dg_O6_bondpairs.append([dc_H41_indexlist[dc_index]+index_adj,dg_O6_indexlist[dg_index]+index_adj])
            dc_H42_dg_O6_bondpairs.append([dc_H42_indexlist[dc_index]+index_adj,dg_O6_indexlist[dg_index]+index_adj])
            dc_N3_dg_H1_bondpairs.append([dc_N3_indexlist[dc_index]+index_adj,dg_H1_indexlist[dg_index]+index_adj])
            dc_O2_dg_H21_bondpairs.append([dc_O2_indexlist[dc_index]+index_adj,dg_H21_indexlist[dg_index]+index_adj])
            dc_O2_dg_H22_bondpairs.append([dc_O2_indexlist[dc_index]+index_adj,dg_H22_indexlist[dg_index]+index_adj])
            dc_H41_dc_N4_bondpairs.append([dc_H41_indexlist[dc_index]+index_adj,dc_N4_indexlist[dc_index]+index_adj]) # alternate
            dg_N1_dg_H1_bondpairs.append([dg_N1_indexlist[dg_index]+index_adj,dg_H1_indexlist[dg_index]+index_adj]) # alternate
            dg_N2_dg_H21_bondpairs.append([dg_N2_indexlist[dg_index]+index_adj,dg_H21_indexlist[dg_index]+index_adj]) # alternate

            dc_N4_dg_O6_bondpairs.append([dc_N4_indexlist[dc_index]+index_adj,dg_O6_indexlist[dg_index]+index_adj]) # total
            dc_N3_dg_N1_bondpairs.append([dc_N3_indexlist[dc_index]+index_adj,dg_N1_indexlist[dg_index]+index_adj]) # total
            dc_O2_dg_N2_bondpairs.append([dc_O2_indexlist[dc_index]+index_adj,dg_N2_indexlist[dg_index]+index_adj]) # total

      elif num_defs > 1:
        mout.errorOut(str(num_defs)+" respair definitions for "+mcol.varName+dc_residue+mcol.error+" in "+mcol.file+args.respair_file,fatal=True)
      elif num_defs < 1:
        mout.warningOut("No respair definition for "+mcol.varName+dc_residue+mcol.warning+" in "+mcol.file+args.respair_file)

  else:

    ##################### A-T ##########################################
    ##################### A-T ##########################################
    ##################### A-T ##########################################

    mout.headerOut("Looking for A-T base pairs:")

    dt_da_maxbondnum = min([len(da_chainlist),len(dt_chainlist)])

    ###### DT:O4 ... DA:H61 ######
    
    # initialise lists
    dt_O4_da_H61_bondpairs=[]
    dt_O4_da_H61_chainpairs=[]
    dt_O4_da_H61_respairs=[]
    dt_O4_da_H61_distances=[]
    da_N6_da_H61_bondpairs=[] # alternate
    dt_O4_da_H62_bondpairs=[] # naughty
    dt_O4_da_N6_bondpairs=[] # total

    # loop over all DT residues
    for dt_index,dt_chain in enumerate(dt_chainlist):

      # get the O4 Atom object
      dt_O4_atom = atoms[dt_O4_indexlist[dt_index]+index_adj]

      # initialise arrays
      closest_distance=None

      # loop over all DA residues
      for da_index,da_H61_index in enumerate(da_H61_indexlist):

        # get the H61 Atom object
        da_H61_atom = atoms[da_H61_indexlist[da_index]+index_adj]

        # find and append the distance between the O4 and H61
        this_dist=atoms.get_distance(dt_O4_atom.index,da_H61_atom.index)

        # store information about leading candidate
        if closest_distance is None or this_dist < closest_distance:
          if dt_chainlist[dt_index] != da_chainlist[da_index]:
            closest_distance   = this_dist
            closest_index      = da_index
            closest_atom_index = da_H61_index

      # check if in same chain
      if dt_chainlist[dt_index] == da_chainlist[closest_index]:
        mout.warningOut("Residues "+dt_reslist[dt_index]+" and "+da_reslist[closest_index]+" are in the same chain!")

      # print possible bond pairs
      if print_bond_candidates:
        print(dt_reslist[dt_index]," bonded to ",da_reslist[closest_index],"? dist(O4...H61) =",closest_distance)

      # store the possible bond pairs
      dt_O4_da_H61_bondpairs.append([dt_O4_atom.index,closest_atom_index+index_adj])
      dt_O4_da_H61_chainpairs.append([dt_chainlist[dt_index],da_chainlist[closest_index]])
      dt_O4_da_H61_respairs.append([dt_reslist[dt_index],da_reslist[closest_index]])
      dt_O4_da_H61_distances.append(closest_distance)
      da_N6_da_H61_bondpairs.append([da_N6_indexlist[closest_index]+index_adj,closest_atom_index+index_adj]) # alternate
      dt_O4_da_H62_bondpairs.append([dt_O4_atom.index,da_H62_indexlist[closest_index]+index_adj]) # naughty
      dt_O4_da_N6_bondpairs.append([dt_O4_atom.index,da_N6_indexlist[closest_index]+index_adj]) # total
    
    mout.headerOut("Identified "+mcol.result+str(len(dt_O4_da_H61_bondpairs))+mcol.arg+" possible O4...H61 bonds")
    if optimise_naughty:
      mout.headerOut("Identified "+mcol.result+str(len(dt_O4_da_H62_bondpairs))+mcol.arg+" possible O4...H62 bonds"+mcol.warning+" *")

    ###### DT:H3 ... DA:N1 ######

    # initialise lists
    dt_H3_da_N1_bondpairs=[]
    dt_H3_da_N1_chainpairs=[]
    dt_H3_da_N1_respairs=[]
    dt_H3_da_N1_distances=[]
    dt_H3_dt_N3_bondpairs=[] # alternate
    dt_N3_da_N1_bondpairs=[] # total

    # loop over all DT residues
    for dt_index,dt_chain in enumerate(dt_chainlist):

      # get the H3 Atom object
      dt_H3_atom = atoms[dt_H3_indexlist[dt_index]+index_adj]

      # initialise arrays
      closest_distance=None

      # loop over all DA residues
      for da_index,da_N1_index in enumerate(da_N1_indexlist):

        # get the N1 Atom object
        da_N1_atom = atoms[da_N1_indexlist[da_index]+index_adj]

        # find and append the distance between the H3 and N1
        this_dist=atoms.get_distance(dt_H3_atom.index,da_N1_atom.index)

        # store information about leading candidate
        if closest_distance is None or this_dist < closest_distance:
          if dt_chainlist[dt_index] != da_chainlist[da_index]:
            closest_distance   = this_dist
            closest_index      = da_index
            closest_atom_index = da_N1_index

      # check if in same chain
      if dt_chainlist[dt_index] == da_chainlist[closest_index]:
        mout.warningOut("Residues "+dt_reslist[dt_index]+" and "+da_reslist[closest_index]+" are in the same chain!")

      # print possible bond pairs
      if print_bond_candidates:
        print(dt_reslist[dt_index]," bonded to ",da_reslist[closest_index],"? dist(H3...N1) =",closest_distance)

      # store the possible bond pairs
      dt_H3_da_N1_bondpairs.append([dt_H3_atom.index,closest_atom_index+index_adj])
      dt_H3_da_N1_chainpairs.append([dt_chainlist[dt_index],da_chainlist[closest_index]])
      dt_H3_da_N1_respairs.append([dt_reslist[dt_index],da_reslist[closest_index]])
      dt_H3_da_N1_distances.append(closest_distance)
      dt_H3_dt_N3_bondpairs.append([dt_H3_atom.index,dt_N3_indexlist[dt_index]+index_adj]) # alternate
      dt_N3_da_N1_bondpairs.append([dt_N3_indexlist[dt_index]+index_adj,closest_atom_index+index_adj]) # total

    mout.headerOut("Identified "+mcol.result+str(len(dt_H3_da_N1_bondpairs))+mcol.arg+" possible H3...N1 bonds")

    ###### Check for problems

    # i.e. when there are unequal numbers of bonds between A-T's
    if len(dt_O4_da_H61_bondpairs) != len(dt_H3_da_N1_bondpairs):
      mout.errorOut("Number of bondpairs not equal!",fatal=True)

    # i.e. identify when a DNA base wants to bond to more than one complementary base
    for index,respair in enumerate(dt_O4_da_H61_respairs):

      # get the other residue pair
      dt_H3_da_N1_respair = dt_H3_da_N1_respairs[index]

      if respair != dt_H3_da_N1_respair:
        mout.warningOut("Bad bond match!")
    
    ###### Remove longest bonds

    # when there are too many base-pairs
    if len(dt_O4_da_H61_bondpairs) > dt_da_maxbondnum:

      # number of bonds to remove
      bondtrimnum=-dt_da_maxbondnum+len(dt_O4_da_H61_bondpairs)

      # write warning
      mout.warningOut("Should be no more than "+str(dt_da_maxbondnum)+"! Will remove longest "+str(bondtrimnum)+" bonds.")

      # initialise lists
      combined_distances=[]

      # sum distances for the two hydrogen bonds
      for index,distance in enumerate(dt_O4_da_H61_distances):
        combined_distances.append(distance+dt_H3_da_N1_distances[index])

      # remove the longest bonds
      for index in range(0,bondtrimnum):

        # find longest bond
        longest_distance=max(combined_distances)
        longest_index=combined_distances.index(longest_distance)

        # remove the longest bond
        del combined_distances[longest_index]
        del dt_O4_da_H61_bondpairs[longest_index]
        del dt_O4_da_H61_chainpairs[longest_index]
        del dt_O4_da_H61_respairs[longest_index]
        del dt_O4_da_H61_distances[longest_index]
        del dt_H3_da_N1_bondpairs[longest_index]
        del dt_H3_da_N1_chainpairs[longest_index]
        del dt_H3_da_N1_respairs[longest_index]
        del dt_H3_da_N1_distances[longest_index]
        del dt_H3_dt_N3_bondpairs[longest_index] # alternate
        del da_N6_da_H61_bondpairs[longest_index] # alternate
        del dt_O4_da_H62_bondpairs[longest_index] # naughty
        del dt_O4_da_N6_bondpairs[longest_distance] # total
        del dt_N3_da_N1_bondpairs[longest_distance] # total

    ##################### G-C ##########################################
    ##################### G-C ##########################################
    ##################### G-C ##########################################

    mout.headerOut("Looking for G-C base pairs:")

    dc_gc_maxbondnum = min([len(dc_chainlist),len(dg_chainlist)])

    ###### DC:H41 ... DG:O6 ######
    
    # initialise lists
    dc_H41_dg_O6_bondpairs=[]
    dc_H41_dg_O6_chainpairs=[]
    dc_H41_dg_O6_respairs=[]
    dc_H41_dg_O6_distances=[]
    dc_H41_dc_N4_bondpairs=[] # alternate
    dc_H42_dg_O6_bondpairs=[] # naughty
    dc_N4_dg_O6_bondpairs=[] # total

    # loop over all DC residues
    for dc_index,dc_chain in enumerate(dc_chainlist):

      # get the O4 Atom object
      dc_H41_atom = atoms[dc_H41_indexlist[dc_index]+index_adj]

      # initialise arrays
      closest_distance=None

      # loop over all DG residues
      for dg_index,dg_O6_index in enumerate(dg_O6_indexlist):

        # get the H61 Atom object
        dg_O6_atom = atoms[dg_O6_indexlist[dg_index]+index_adj]

        # find the distance between the H41 and O6 atoms
        this_dist=atoms.get_distance(dc_H41_atom.index,dg_O6_atom.index)

        # store information about leading candidate
        if closest_distance is None or this_dist < closest_distance:
          if dc_chainlist[dc_index] != dg_chainlist[dg_index]:
            closest_distance   = this_dist
            closest_index      = dg_index
            closest_atom_index = dg_O6_index

      # check if in same chain
      if dc_chainlist[dc_index] == dg_chainlist[closest_index]:
        mout.warningOut("Residues "+dc_reslist[dc_index]+" and "+dg_reslist[closest_index]+" are in the same chain!")

      # print possible bond pairs
      if print_bond_candidates:
        print(dc_reslist[dc_index]," bonded to ",dg_reslist[closest_index],"? dist(H41...O6) =",closest_distance)

      # store the possible bond pairs
      dc_H41_dg_O6_bondpairs.append([dc_H41_atom.index,closest_atom_index+index_adj])
      dc_H41_dg_O6_chainpairs.append([dc_chainlist[dc_index],dg_chainlist[closest_index]])
      dc_H41_dg_O6_respairs.append([dc_reslist[dc_index],dg_reslist[closest_index]])
      dc_H41_dg_O6_distances.append(closest_distance)
      dc_H41_dc_N4_bondpairs.append([dc_H41_atom.index,dc_N4_indexlist[dc_index]+index_adj]) # alternate
      dc_H42_dg_O6_bondpairs.append([dc_H42_indexlist[dc_index]+index_adj,closest_atom_index+index_adj]) # naughty
      dc_N4_dg_O6_bondpairs.append([dc_N4_indexlist[dc_index]+index_adj,closest_atom_index+index_adj]) # total
    
    mout.headerOut("Identified "+mcol.result+str(len(dc_H41_dg_O6_bondpairs))+mcol.arg+" possible H41...O6 bonds")
    if optimise_naughty:
      mout.headerOut("Identified "+mcol.result+str(len(dc_H42_dg_O6_bondpairs))+mcol.arg+" possible H42...O6 bonds"+mcol.warning+" *")

    ###### DC:N3 ... DG:H1 ######

    # initialise lists
    dc_N3_dg_H1_bondpairs=[]
    dc_N3_dg_H1_chainpairs=[]
    dc_N3_dg_H1_respairs=[]
    dc_N3_dg_H1_distances=[]
    dg_N1_dg_H1_bondpairs=[] # alternate
    dc_N3_dg_N1_bondpairs=[] # total

    # loop over all DC residues
    for dc_index,dc_chain in enumerate(dc_chainlist):

      # get the N3 Atom object
      dc_N3_atom = atoms[dc_N3_indexlist[dc_index]+index_adj]

      # initialise arrays
      closest_distance=None

      # loop over all DG residues
      for dg_index,dg_H1_index in enumerate(dg_H1_indexlist):

        # get the H61 Atom object
        dg_H1_atom = atoms[dg_H1_indexlist[dg_index]+index_adj]

        # find the distance between the N3 and H1 atoms
        this_dist=atoms.get_distance(dc_N3_atom.index,dg_H1_atom.index)

        # store information about leading candidate
        if closest_distance is None or this_dist < closest_distance:
          if dc_chainlist[dc_index] != dg_chainlist[dg_index]:
            closest_distance   = this_dist
            closest_index      = dg_index
            closest_atom_index = dg_H1_index

      # check if in same chain
      if dc_chainlist[dc_index] == dg_chainlist[closest_index]:
        mout.warningOut("Residues "+dc_reslist[dc_index]+" and "+dg_reslist[closest_index]+" are in the same chain!")

      # print possible bond pairs
      if print_bond_candidates:
        print(dc_reslist[dc_index]," bonded to ",dg_reslist[closest_index],"? dist(N3...H1) =",closest_distance)

      # store the possible bond pairs
      dc_N3_dg_H1_bondpairs.append([dc_N3_atom.index,closest_atom_index+index_adj])
      dc_N3_dg_H1_chainpairs.append([dc_chainlist[dc_index],dg_chainlist[closest_index]])
      dc_N3_dg_H1_respairs.append([dc_reslist[dc_index],dg_reslist[closest_index]])
      dc_N3_dg_H1_distances.append(closest_distance)
      dg_N1_dg_H1_bondpairs.append([dg_N1_indexlist[closest_index]+index_adj,closest_atom_index+index_adj]) # alternate
      dc_N3_dg_N1_bondpairs.append([dc_N3_atom.index,dg_N1_indexlist[closest_index]+index_adj]) # total
    
    mout.headerOut("Identified "+mcol.result+str(len(dc_N3_dg_H1_bondpairs))+mcol.arg+" possible N3...H1 bonds")

    ###### DC:O2 ... DG:H21 ######

    # initialise lists
    dc_O2_dg_H21_bondpairs=[]
    dc_O2_dg_H21_chainpairs=[]
    dc_O2_dg_H21_respairs=[]
    dc_O2_dg_H21_distances=[]
    dg_N2_dg_H21_bondpairs=[] # alternate
    dc_O2_dg_H22_bondpairs=[] # naughty
    dc_O2_dg_N2_bondpairs=[] # total

    # loop over all DC residues
    for dc_index,dc_chain in enumerate(dc_chainlist):

      # get the O2 Atom object
      dc_O2_atom = atoms[dc_O2_indexlist[dc_index]+index_adj]

      # initialise arrays
      closest_distance=None

      # loop over all DG residues
      for dg_index,dg_H21_index in enumerate(dg_H21_indexlist):

        # get the H61 Atom object
        dg_H21_atom = atoms[dg_H21_indexlist[dg_index]+index_adj]

        # find the distance between the O2 and H21 atoms
        this_dist=atoms.get_distance(dc_O2_atom.index,dg_H21_atom.index)

        # store information about leading candidate
        if closest_distance is None or this_dist < closest_distance:
          if dc_chainlist[dc_index] != dg_chainlist[dg_index]:
            closest_distance   = this_dist
            closest_index      = dg_index
            closest_atom_index = dg_H21_index

      # check if in same chain
      if dc_chainlist[dc_index] == dg_chainlist[closest_index]:
        mout.warningOut("Residues "+dc_reslist[dc_index]+" and "+dg_reslist[closest_index]+" are in the same chain!")

      # print possible bond pairs
      if print_bond_candidates:
        print(dc_reslist[dc_index]," bonded to ",dg_reslist[closest_index],"? dist(O2...H21) =",closest_distance)

      # store the possible bond pairs
      dc_O2_dg_H21_bondpairs.append([dc_O2_atom.index,closest_atom_index+index_adj])
      dc_O2_dg_H21_chainpairs.append([dc_chainlist[dc_index],dg_chainlist[closest_index]])
      dc_O2_dg_H21_respairs.append([dc_reslist[dc_index],dg_reslist[closest_index]])
      dc_O2_dg_H21_distances.append(closest_distance)
      dg_N2_dg_H21_bondpairs.append([dg_N2_indexlist[closest_index]+index_adj,closest_atom_index+index_adj]) # alternate
      dc_O2_dg_H22_bondpairs.append([dc_O2_atom.index,dg_H22_indexlist[closest_index]+index_adj]) # naughty
      dc_O2_dg_N2_bondpairs.append([dc_O2_atom.index,dg_N2_indexlist[closest_index]+index_adj]) # total
    
    mout.headerOut("Identified "+mcol.result+str(len(dc_O2_dg_H21_bondpairs))+mcol.arg+" possible O2...H21 bonds")
    if optimise_naughty:
      mout.headerOut("Identified "+mcol.result+str(len(dc_O2_dg_H22_bondpairs))+mcol.arg+" possible O2...H22 bonds"+mcol.warning+" *")

    ###### Check for problems

    # i.e. when there are unequal numbers of bonds between A-T's
    if len(dc_H41_dg_O6_bondpairs) != len(dc_N3_dg_H1_bondpairs) or len(dc_N3_dg_H1_bondpairs) != len(dc_O2_dg_H21_bondpairs):
      mout.errorOut("Number of bondpairs not equal!",fatal=True)

    # i.e. identify when a DNA base wants to bond to more than one complementary base
    for index,respair in enumerate(dc_H41_dg_O6_respairs):

      # get the other residue pair
      dc_N3_dg_H1_respair = dc_N3_dg_H1_respairs[index]
      dc_O2_dg_H21_respair = dc_O2_dg_H21_respairs[index]

      if respair != dc_N3_dg_H1_respair or dc_N3_dg_H1_respair != dc_O2_dg_H21_respair:
        mout.warningOut("Bad bond match!")

    ###### Remove longest bonds

    # when there are too many base-pairs
    if len(dc_H41_dg_O6_bondpairs) > dc_gc_maxbondnum:

      # number of bonds to remove
      bondtrimnum=-dc_gc_maxbondnum+len(dc_H41_dg_O6_bondpairs)

      # write warning
      mout.warningOut("Should be no more than "+str(dc_gc_maxbondnum)+"! Will remove longest "+str(bondtrimnum)+" bonds.")

      # initialise lists
      combined_distances=[]

      # sum distances for the three hydrogen bonds
      for index,distance in enumerate(dc_H41_dg_O6_distances):
        combined_distances.append(distance+dc_N3_dg_H1_distances[index]+dc_O2_dg_H21_distances[index])

      # remove the longest bonds
      for index in range(0,bondtrimnum):

        # find longest bond
        longest_distance=max(combined_distances)
        longest_index=combined_distances.index(longest_distance)

        # remove the longest bond
        del combined_distances[longest_index]
        del dc_H41_dg_O6_bondpairs[longest_index]
        del dc_H41_dg_O6_chainpairs[longest_index]
        del dc_H41_dg_O6_respairs[longest_index]
        del dc_H41_dg_O6_distances[longest_index]
        del dc_N3_dg_H1_bondpairs[longest_index]
        del dc_N3_dg_H1_chainpairs[longest_index]