Mercurial > repos > rliterman > csp2

//------------------------------------------------------------------------------
//   Programmer: Adam M Phillippy, The Institute for Genomic Research
//         File: postpro.cc
//         Date: 08 / 20 / 2002
//
//      Purpose: To translate the coordinates referencing the concatenated
//              reference sequences back to the individual sequences and deal
//             with any conflict that may have arisen (i.e. a MUM that spans
//            the boundry between two sequences). Then to extend each cluster
//           via Smith-Waterman techniques to expand the alignment coverage.
//          Alignments which encounter each other will be fused to form one
//         encompasing alignment when appropriate.
//
//        Input: Input is the output of the .mgaps program from stdin. On the
//              command line, the file names of the two original sequence files
//             should come first, followed by the prefix <pfx> that should be
//            placed in front of the two output filenames <pfx>.cluster and
//           <pfx>.delta
//
// NOTE: Cluster file is now suppressed by default (see -d option).
//
//       Output: Output is to two output files, <pfx>.cluster and <pfx>.delta.
//              <pfx>.cluster lists MUM clusters as identified by "mgaps".
//             However, the clusters now reference their corresponding
//            sequence and are all listed under headers that specify this
//           sequence. In addition, the coordinates now reference the original
//          DNA input and not the amino acid translations. The <pfx>.delta file
//         is the alignment object that contains all the information necessary
//        to reproduce the alignments generated by the MUM extension process.
//       The coordinates in this file reference the original DNA input, however
//      the delta values represent amino acids, so 1 delta = 3 nucleotides.
//     Please refer to the output file documentation for an in-depth
//    description of these file formats.
//
//        Usage: postpro  <reference>  <query>  <pfx>  <  <input>
//           Where <reference> and <query> are the original input sequences of
//          PROmer and <pfx> is the prefix that should be added to the
//         beginning of the <pfx>.cluster and <pfx>.delta output filenames.
//
//------------------------------------------------------------------------------

//-- NOTE: this option will significantly hamper program performance,
//         mostly the alignment extension performance (sw_align.h)
//#define _DEBUG_ASSERT       // self testing assert functions

#include "tigrinc.hh"
#include "sw_align.hh"
#include "translate.hh"
#include <vector>
#include <algorithm>
using namespace std;


//------------------------------------------------------ Globals -------------//
bool DO_DELTA = true;
bool DO_EXTEND = true;
bool TO_SEQEND = false;


//------------------------------------------------------ Type Definitions ----//
enum LineType
     //-- The type of input line from <stdin>
{
  NO_LINE, HEADER_LINE, MATCH_LINE
};


struct FastaRecord
     //-- The essential data of a sequence
{
  char * Id;                 // the fasta ID header tag
  long int len;              // the length of the sequence
  char * seq;                // the sequence data
};


struct Match
     //-- An exact match between two sequences A and B
{
  long int sA, sB, len;      // start coordinate in A, in B and the length
};


struct Cluster
     //-- An ordered list of matches between two sequences A and B
{
  bool wasFused;          // have the cluster matches been extended yet?
  int frameA;                // the reference sequence frame (1-6)
  int frameB;                // the query sequence frame
  vector<Match> matches;     // the ordered set of matches in the cluster
};


struct Synteny
     //-- An ordered list of clusters between two sequences A and B
{
  FastaRecord * AfP;         // a pointer to the reference sequence record
  FastaRecord Bf;            // the query sequence record (w/o the sequence)
  vector<Cluster> clusters;  // the ordered set of clusters between A and B
};


struct Alignment
     //-- An alignment object between two sequences A and B
{
  int frameA;                // the reference sequence frame (1-6)
  int frameB;                // the query sequence frame
  long int sA, sB, eA, eB;   // the start in A, B and the end in A, B
  vector<long int> delta;         // the delta values, with NO zero at the end
  long int deltaApos;        // sum of abs(deltas) - #of negative deltas
                             //      trust me, it is a very helpful value
  long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
};


struct AscendingClusterSort
     //-- For sorting clusters in ascending order of their sA coordinate
{
  bool operator() (const Cluster & pA, const Cluster & pB)
  {
    return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA );
  }
};


//------------------------------------------------- Function Declarations ----//
void addNewAlignment
     (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
      vector<Match>::iterator Mp);

bool extendBackward
     (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
      vector<Alignment>::iterator TargetAp, const char * A, const char * B);

bool extendForward
     (vector<Alignment>::iterator Ap, const char * A, long int targetA,
      const char * B, long int targetB, unsigned int m_o);

void extendClusters
     (vector<Cluster> & Clusters,
      const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile);

void flushAlignments
     (vector<Alignment> & Alignments,
      const FastaRecord * Af, const FastaRecord * Bf,
      FILE * DeltaFile);

void flushSyntenys
     (vector<Synteny> & Syntenys, FILE * ClusterFile);

vector<Cluster>::iterator getForwardTargetCluster
     (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
      long int & targetA, long int & targetB);

vector<Alignment>::iterator getReverseTargetAlignment
     (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp);

bool isShadowedCluster
     (vector<Cluster>::iterator CurrCp,
      vector<Alignment> & Alignments, vector<Alignment>::iterator Ap);

void parseAbort
     (const char *);

void parseDelta
     (vector<Alignment> & Alignments,
      const FastaRecord * Af, const FastaRecord *Bf);

void processSyntenys
     (vector<Synteny> & Syntenys,
      FastaRecord * Af, long int As,
      FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile);

inline long int transC
     (long int Coord, int Frame, long int Len);

inline long int refLen
     (long int ntLen);

inline long int revC
     (long int Coord, long int Len);

void printHelp
     (const char *);

void printUsage
     (const char *);

void validateData
     (vector<Alignment> Alignments, vector<Cluster> Clusters,
      const FastaRecord * Af, const FastaRecord * Bf);


//-------------------------------------------------- Function Definitions ----//
int main
     (int argc, char ** argv)
{
  FastaRecord * Af;           // array of all the reference sequences

  vector<Synteny> Syntenys;                  // vector of all sets of clusters
  vector<Synteny>::reverse_iterator CurrSp;  // current set of clusters
  vector<Synteny>::reverse_iterator Sp;      // temporary synteny pointer

  Synteny Asyn;               // a single set of clusters
  Cluster Aclu;               // a single cluster of matches
  Match Amat;                 // a single match

  LineType PrevLine;          // the current input line

  bool Found;                 // temporary search flag

  char Line [MAX_LINE];       // a single line of input
  char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE];   // fasta ID headers
  char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names
  char RefFileName [MAX_LINE], QryFileName [MAX_LINE];

  int FrameA, FrameB;                // reading frames (1-6)
  int CurrFrameA, CurrFrameB;

  long int i;                        // temporary counter
  long int Seqi;                     // current reference sequence index
  long int InitSize;                 // initial sequence memory space
  long int As = 0;                   // the size of the reference array
  long int Ac = 100;                 // the capacity of the reference array
  long int sA, sB, len;              // current match start in A, B and length

  FILE * RefFile, * QryFile;         // reference and query input files
  FILE * ClusterFile, * DeltaFile;   // cluster and delta output files

  //-- Set the alignment data type and break length (sw_align.h)
  setMatrixType ( BLOSUM62 );
  setBreakLen ( 60 );

  //-- Parse the command line arguments
  {
    optarg = NULL;
    int ch, errflg = 0;
    while ( !errflg  &&  ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) )
      switch (ch)
        {
        case 'b' :
          setBreakLen (atoi (optarg));
          break;

	case 'd' :
	  DO_DELTA = false;
	  break;

	case 'e' :
	  DO_EXTEND = false;
	  break;

        case 'h' :
          printHelp (argv[0]);
          exit (EXIT_SUCCESS);
          break;

	case 't' :
	  TO_SEQEND = true;
	  break;

	case 'x' :
	  setMatrixType ( atoi (optarg) );
	  if ( getMatrixType( ) == NUCLEOTIDE )
	    {
	      fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n",
		       getMatrixType( ));
	      setMatrixType ( BLOSUM62 );
	    }
	  break;

        default :
          errflg ++;
        }

    if ( errflg > 0 || argc - optind != 3 )
      {
        printUsage (argv[0]);
        exit (EXIT_FAILURE);
      }
  }

  //-- Read and create the I/O file names
  strcpy (RefFileName, argv[optind ++]);
  strcpy (QryFileName, argv[optind ++]);
  strcpy (ClusterFileName, argv[optind ++]);
  strcpy (DeltaFileName, ClusterFileName);
  strcat (ClusterFileName, ".cluster");
  strcat (DeltaFileName, ".delta");

  //-- Open all the files
  RefFile = File_Open (RefFileName, "r");
  QryFile = File_Open (QryFileName, "r");
  if ( DO_DELTA )
    {
      ClusterFile = NULL;
      DeltaFile = File_Open (DeltaFileName, "w");
    }
  else
    {
      ClusterFile = File_Open (ClusterFileName, "w");
      DeltaFile = NULL;
    }

  //-- Print the headers of the output files
  if ( DO_DELTA )
    fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
  else
    fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName);

  //-- Generate the array of the reference sequences
  InitSize = INIT_SIZE;
  Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac );
  Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
  while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) )
    {
      Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1));
      strcpy (Af[As].Id, IdA);

      Af[As].len = strlen (Af[As].seq + 1);

      if ( ++ As >= Ac )
        {
          Ac *= 2;
          Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac );
        }

      InitSize = INIT_SIZE;
      Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
    }
  fclose (RefFile);
  if ( As <= 0 )
    parseAbort ( RefFileName );


  //-- Process the input from <stdin> line by line
  PrevLine = NO_LINE;
  IdA[0] = '\0';
  IdB[0] = '\0';
  FrameA = 0;
  FrameB = 0;
  CurrFrameA = -1;
  CurrFrameB = -1;
  CurrIdB[0] = '\0';
  while ( fgets(Line, MAX_LINE, stdin) != NULL )
    {

      //-- If the current line is a fasta HEADER_LINE
      if ( Line[0] == '>' )
        {
          if ( sscanf (Line + 1, "%s", CurrIdB) != 1 )
            parseAbort ("stdin");

	  sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB);
	  CurrIdB [strlen(CurrIdB) - 2] = '\0';

          PrevLine = HEADER_LINE;
        }


      //-- If the current line is a cluster HEADER_LINE
      else if ( Line[0] == '#' )
        {
          PrevLine = HEADER_LINE;
        }


      //-- If the current line is a MATCH_LINE
      else
        {
          if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 )
            parseAbort ("stdin");

          //-- Re-map the reference coordinate back to its original sequence
	  //   NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length
	  //   including the appended 'X' on each frame translation
          for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ )
	    sA -= refLen (Af[Seqi].len);

	  assert ( Seqi < As );

	  //-- Get the correct frame, translate startA coordinate to frame
	  for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ )
	    sA -= (Af[Seqi].len - (i % 3)) / 3 + 1;
	  CurrFrameA = i + 1;
	  assert ( CurrFrameA > 0  &&  CurrFrameA < 7 );

	  //-- If the match spans across a frame boundry
	  if ( CurrFrameA < 1  ||  CurrFrameA > 6  ||
	       sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 ||
	       sA <= 0 )
	    {
	      fprintf (stderr,
                 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
                 "         Reference sequence '>%s', frame %d\n"
                 "Please file a bug report\n"
                 "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA);
              continue;
	    }

          //-- If the match spans across a sequence boundry
          if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 )
            {
              fprintf (stderr,
                 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
                 "         Reference sequence '>%s'\n"
                 "Please file a bug report\n"
                 "Attempting to continue.\n", Af[Seqi].Id);
              continue;
            }

          //-- Check and update the current synteny region
          if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 ||
	       CurrFrameA != FrameA || CurrFrameB != FrameB )
            {
              Found = false;
              if ( strcmp (IdB, CurrIdB) == 0 )
                {
                  //-- Has this header been seen before?
                  for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ )
                    {
                      if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 )
                        {
                          assert ( Sp->AfP->len == Af[Seqi].len );
                          assert ( strcmp (Sp->Bf.Id, IdB) == 0 );
                          CurrSp = Sp;
                          Found = true;
                          break;
                        }
                    }
                }
              else
                {
                  //-- New B sequence header, process all the old synteny's
                  processSyntenys (Syntenys, Af, As,
                                   QryFile, ClusterFile, DeltaFile);

                }

              strcpy (IdA, Af[Seqi].Id);
              strcpy (IdB, CurrIdB);
	      FrameA = CurrFrameA;
	      FrameB = CurrFrameB;

              //-- If not seen yet, create a new synteny region
              if ( ! Found )
                {
                  Asyn.AfP = Af + Seqi;
                  Asyn.Bf.len = -1;
		  Asyn.Bf.Id = (char *) Safe_malloc
		    (sizeof(char) * (strlen(IdB) + 1));
                  strcpy (Asyn.Bf.Id, IdB);

                  Syntenys.push_back (Asyn);
                  CurrSp = Syntenys.rbegin( );
                }

              //-- Add a new cluster to the current synteny
              if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
		if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
		  CurrSp->clusters.pop_back( ); // hack to remove empties
	      Aclu.frameA = FrameA;
	      Aclu.frameB = FrameB;
              Aclu.wasFused = false;
              CurrSp->clusters.push_back (Aclu);
            }
          else if ( PrevLine == HEADER_LINE )
            {
              //-- Add a new cluster to the current synteny
              if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
		if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
		  CurrSp->clusters.pop_back( );
	      Aclu.frameA = FrameA;
	      Aclu.frameB = FrameB;
              Aclu.wasFused = false;
              CurrSp->clusters.push_back (Aclu);
            }

          //-- Add a new match to the current cluster
	  //   NOTE: A and B coordinates still reference the appropriate
	  //   amino acid sequence frame, not the DNA (same with len)
	  if ( len > 1 )
	    {
	      Amat.sA = sA;
	      Amat.sB = sB;
	      Amat.len = len;
	      CurrSp->clusters.rbegin( )->matches.push_back (Amat);
	    }

          PrevLine = MATCH_LINE;
        }
    }

  //-- Process the left-over syntenys
  if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
    if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
      CurrSp->clusters.pop_back( );
  processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile);
  fclose (QryFile);

  //-- Free the reference sequences
  for ( i = 0; i < As; i ++ )
    {
      free (Af[i].Id);
      free (Af[i].seq);
    }
  free (Af);

  return EXIT_SUCCESS;
}


void addNewAlignment
     (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
      vector<Match>::iterator Mp)

     //  Create and add a new alignment object based on the current match
     //  and cluster information pointed to by Cp and Mp.

{
  Alignment Align;

  //-- Initialize the data
  Align.sA = Mp->sA;
  Align.sB = Mp->sB;
  Align.eA = Mp->sA + Mp->len - 1;
  Align.eB = Mp->sB + Mp->len - 1;
  Align.frameA = Cp->frameA;
  Align.frameB = Cp->frameB;
  Align.delta.clear( );
  Align.deltaApos = 0;

  //-- Add the alignment object
  Alignments.push_back (Align);

  return;
}


bool extendBackward
     (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
      vector<Alignment>::iterator TargetAp, const char * A, const char * B)

     //  Extend an alignment backwards off of the current alignment object.
     //  The current alignment object must be freshly created and consist
     //  only of an exact match (i.e. the delta vector MUST be empty).
     //  If the TargetAp alignment object is reached by the extension, it will
     //  be merged with CurrAp and CurrAp will be destroyed. If TargetAp is
     //  NULL the function will extend as far as possible. It is a strange
     //  and dangerous function because it can delete CurrAp, so edit with
     //  caution. Returns true if TargetAp was reached and merged, else false
     //  Designed only as a subroutine for extendClusters, should be used
     //  nowhere else.

{
  bool target_reached = false;
  bool overflow_flag = false;
  bool double_flag = false;

  vector<long int>::iterator Dp;

  unsigned int m_o;
  long int targetA, targetB;

  m_o = BACKWARD_SEARCH;

  //-- Set the target coordinates
  if ( TargetAp != Alignments.end( ) )
    {
      targetA = TargetAp->eA;
      targetB = TargetAp->eB;
    }
  else
    {
      targetA = 1;
      targetB = 1;
      m_o |= OPTIMAL_BIT;
    }

  //-- If alignment is too long, bring with bounds and set overflow_flag true
  if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH )
    {
      targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1;
      overflow_flag = true;
      m_o |= OPTIMAL_BIT;
    }
  if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH )
    {
      targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1;
      if ( overflow_flag )
	double_flag = true;
      else
	overflow_flag = true;
      m_o |= OPTIMAL_BIT;
    }


  if ( TO_SEQEND && !double_flag )
    m_o |= SEQEND_BIT;


  //-- Attempt to reach the target
  target_reached = alignSearch (A, CurrAp->sA, targetA,
                                B, CurrAp->sB, targetB, m_o);

  if ( overflow_flag  ||  TargetAp == Alignments.end( ) )
    target_reached = false;

  if ( target_reached )
    {
      //-- assert that CurrAp is new and at the end of the Alignment vector
      assert ( CurrAp->delta.empty( ) );
      assert ( CurrAp == Alignments.end( ) - 1 );

      //-- Merge the two alignment objects
      extendForward (TargetAp, A, CurrAp->sA,
                               B, CurrAp->sB, FORCED_FORWARD_ALIGN);
      TargetAp->eA = CurrAp->eA;
      TargetAp->eB = CurrAp->eB;
      Alignments.pop_back( );
    }
  else
    {
      alignTarget (A, targetA, CurrAp->sA,
                   B, targetB, CurrAp->sB,
                   CurrAp->delta, FORCED_FORWARD_ALIGN);
      CurrAp->sA = targetA;
      CurrAp->sB = targetB;

      //-- Update the deltaApos value for the alignment object
      for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ )
        CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
    }

  return target_reached;
}


bool extendForward
     (vector<Alignment>::iterator CurrAp, const char * A, long int targetA,
      const char * B, long int targetB, unsigned int m_o)

     //  Extend an alignment forwards off the current alignment object until
     //  target or end of sequence is reached, and merge the delta values of the
     //  alignment object with the new delta values generated by the extension.
     //  Return true if the target was reached, else false

{
  long int ValA;
  long int ValB;
  unsigned int Di;
  bool target_reached;
  bool overflow_flag = false;
  bool double_flag = false;
  vector<long int>::iterator Dp;

  //-- Set Di to the end of the delta vector
  Di = CurrAp->delta.size( );

  //-- Calculate the distance between the start and end positions
  ValA = targetA - CurrAp->eA + 1;
  ValB = targetB - CurrAp->eB + 1;

  //-- If the distance is too long, shrink it and set the overflow_flag
  if ( ValA > MAX_ALIGNMENT_LENGTH )
    {
      targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1;
      overflow_flag = true;
      m_o |= OPTIMAL_BIT;
    }
  if ( ValB > MAX_ALIGNMENT_LENGTH )
    {
      targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1;
      if ( overflow_flag )
	double_flag = true;
      else
	overflow_flag = true;
      m_o |= OPTIMAL_BIT;
    }

  if ( double_flag )
    m_o &= ~SEQEND_BIT;

  target_reached = alignTarget (A, CurrAp->eA, targetA,
                                B, CurrAp->eB, targetB,
                                CurrAp->delta, m_o);

  //-- Notify user if alignment was chopped short
  if ( target_reached  &&  overflow_flag )
    target_reached = false;

  //-- Pick up delta where left off (Di) and merge with new deltas
  if ( Di < CurrAp->delta.size( ) )
    {
      //-- Merge the deltas
      ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1;
      CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA);
      if ( CurrAp->delta[Di] == 0  ||  ValA < 0 )
	{
	  fprintf(stderr,
                  "ERROR: failed to merge alignments at position %ld\n"
                  "       Please file a bug report\n",
                  CurrAp->eA);
          exit (EXIT_FAILURE);
	}

      //-- Update the deltaApos
      for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ )
        CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
    }

  //-- Set the alignment coordinates
  CurrAp->eA = targetA;
  CurrAp->eB = targetB;

  return target_reached;
}


void extendClusters
     (vector<Cluster> & Clusters,
      const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile)

     //  Connect all the matches in every cluster between sequences A and B.
     //  Also, extend alignments off of the front and back of each cluster to
     //  expand total alignment coverage. When these extensions encounter an
     //  adjacent cluster (in a consistent frame), fuse the two regions to
     //  create one single encompassing region. This routine will create
     //  alignment objects from these extensions and output the resulting
     //  delta information to the delta output file. Also, all coordintes
     //  for the clusters and the alignment regions are translated to reference
     //  the original DNA sequecne.

{
  //-- Sort the clusters (ascending) by their start coordinate in sequence A
  sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( ));

  //-- If no delta file is requested
  if ( ! DO_DELTA )
    return;


  bool target_reached = false;         // reached the adjacent match or cluster
                                       // the amino acid sequences for A and B
  char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
  char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};

  long int Alen [7], Blen [7];         // the length of the amino acid sequences
  int i;

  unsigned int m_o;
  long int targetA, targetB;           // alignment extension targets in A and B

  vector<Match>::iterator Mp;          // match pointer

  vector<Cluster>::iterator PrevCp;    // where the extensions last left off
  vector<Cluster>::iterator CurrCp;    // the current cluster being extended
  vector<Cluster>::iterator TargetCp = Clusters.end( );   // the target cluster

  vector<Alignment> Alignments;        // the vector of alignment objects
  vector<Alignment>::iterator CurrAp = Alignments.begin( );   // current align
  vector<Alignment>::iterator TargetAp;                       // target align

  //-- Calculate the length of each amino acid frame translation
  for ( i = 0; i < 6; i ++ )
    {
      Alen [i + 1] = (Af->len - (i % 3)) / 3;
      Blen [i + 1] = (Bf->len - (i % 3)) / 3;
    }

  //-- Extend each cluster
  PrevCp = Clusters.begin( );
  CurrCp = Clusters.begin( );
  while ( CurrCp < Clusters.end( ) )
    {
      if ( DO_EXTEND )
	{
	  //-- Ignore if shadowed or already extended
	  if ( ! target_reached )
	    if ( CurrCp->wasFused  ||
		 isShadowedCluster (CurrCp, Alignments, CurrAp) )
	      {
		CurrCp->wasFused = true;
		CurrCp = ++ PrevCp;
		continue;
	      }
	}

      //-- Initialize the right amino acid sequence for A and B if need be
      if ( A [CurrCp->frameA] == NULL )
        {
	  A [CurrCp->frameA] = (char *) Safe_malloc
	    ( sizeof(char) * ( (Af->len / 3) + 2) );
	  A [CurrCp->frameA] [0] = '\0';
	  Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA],
						  CurrCp->frameA );
        }
      if ( B [CurrCp->frameB] == NULL )
        {
	  B [CurrCp->frameB] = (char *) Safe_malloc
	    ( sizeof(char) * ( (Bf->len / 3) + 2) );
	  B [CurrCp->frameB] [0] = '\0';
	  Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB],
						  CurrCp->frameB );
        }

      //-- Extend each match in the cluster
      for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ )
        {
          //-- Try to extend the current match backwards
          if ( target_reached )
            {
              //-- Merge with the previous match
              if ( CurrAp->eA != Mp->sA  ||  CurrAp->eB != Mp->sB )
                {
		  //-- Strip matches until the targeted match is found
                  assert (Mp < CurrCp->matches.end( ) - 1);
                  continue;
                }

              CurrAp->eA += Mp->len - 1;
              CurrAp->eB += Mp->len - 1;
            }
          else
            {
              //-- Create a new alignment object
              addNewAlignment (Alignments, CurrCp, Mp);
              CurrAp = Alignments.end( ) - 1;

	      if ( DO_EXTEND  ||  Mp != CurrCp->matches.begin ( ) )
		{
		  //-- Target the closest/best alignment object
		  TargetAp = getReverseTargetAlignment (Alignments, CurrAp);

		  //-- Extend the new alignment object backwards
		  if ( extendBackward (Alignments, CurrAp, TargetAp,
				       A [CurrCp->frameA], B [CurrCp->frameB]) )
		    CurrAp = TargetAp;
		}
            }

	  m_o = FORWARD_ALIGN;

          //-- Try to extend the current match forwards
          if ( Mp < CurrCp->matches.end( ) - 1 )
            {
              //-- Target the next match in the cluster
              targetA = (Mp + 1)->sA;
              targetB = (Mp + 1)->sB;

	      //-- Extend the current alignment object forward
	      target_reached = extendForward (CurrAp,
					      A [CurrCp->frameA], targetA,
					      B [CurrCp->frameB], targetB, m_o);
            }
          else if ( DO_EXTEND )
            {
              targetA = Alen [CurrCp->frameA];
              targetB = Blen [CurrCp->frameB];

              //-- Target the closest/best match in a future cluster
              TargetCp = getForwardTargetCluster (Clusters, CurrCp,
                                                  targetA, targetB);

	      if ( TargetCp == Clusters.end( ) )
		{
		  m_o |= OPTIMAL_BIT;
		  if ( TO_SEQEND )
		    m_o |= SEQEND_BIT;
		}

	      //-- Extend the current alignment object forward
	      target_reached = extendForward (CurrAp,
					      A [CurrCp->frameA], targetA,
					      B [CurrCp->frameB], targetB, m_o);
            }
        }

      if ( TargetCp == Clusters.end( ) )
        target_reached = false;

      CurrCp->wasFused = true;

      if ( target_reached == false )
        CurrCp = ++ PrevCp;
      else
	CurrCp = TargetCp;
    }

#ifdef _DEBUG_ASSERT
  validateData (Alignments, Clusters, Af, Bf);
#endif

  //-- Output the alignment data to the delta file
  flushAlignments (Alignments, Af, Bf, DeltaFile);

  for ( i = 0; i < 7; i ++ )
    {
      if ( A [i] != NULL )
	free ( A [i] );
      if ( B [i] != NULL )
	free ( B [i] );
    }

  return;
}


void flushAlignments
     (vector<Alignment> & Alignments,
      const FastaRecord * Af, const FastaRecord * Bf,
      FILE * DeltaFile)

     //  Simply output the delta information stored in Alignments to the
     //  given delta file. Free the memory used by Alignments once the
     //  data is successfully output to the file.

{
  vector<Alignment>::iterator Ap;       // alignment pointer
  vector<long int>::iterator Dp;             // delta pointer

  fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len);

  //-- Generate the error counts
  parseDelta (Alignments, Af, Bf);

  for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
    {
      fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
	       transC (Ap->sA, Ap->frameA, Af->len),
	       transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2),
	       transC (Ap->sB, Ap->frameB, Bf->len),
	       transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2),
	       Ap->Errors, Ap->SimErrors, Ap->NonAlphas);

      for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
        fprintf (DeltaFile, "%ld\n", *Dp);
      fprintf (DeltaFile, "0\n");

      Ap->delta.clear( );
    }

  Alignments.clear( );

  return;
}


void flushSyntenys
     (vector<Synteny> & Syntenys, FILE * ClusterFile)

     //  Simply output the synteny/cluster information generated by the mgaps
     //  program. However, now the coordinates reference their appropriate
     //  reference sequence, and the reference sequecne header is added to
     //  the appropriate lines. Free the memory used by Syntenys once the
     //  data is successfully output to the file.

{
  vector<Synteny>::iterator Sp;         // synteny pointer
  vector<Cluster>::iterator Cp;         // cluster pointer
  vector<Match>::iterator Mp;           // match pointer

  if ( ClusterFile )
    {
      for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
        {
          fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id,
                   Sp->AfP->len, Sp->Bf.len);
          for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
            {
              fprintf (ClusterFile, "%2d %2d\n",
                       Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA,
                       Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB);
              for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
                {
                  fprintf (ClusterFile, "%8ld %8ld %6ld",
                           transC (Mp->sA, Cp->frameA, Sp->AfP->len),
                           transC (Mp->sB, Cp->frameB, Sp->Bf.len),
                           Mp->len * 3);

                  if ( Mp != Cp->matches.begin( ) )
                    fprintf (ClusterFile, "%6ld %6ld\n",
                             (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3,
                             (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3);
                  else
                    fprintf (ClusterFile, "%6s %6s\n", "-", "-");
                }
              Cp->matches.clear( );
            }
          Sp->clusters.clear( );
        }
    }
  else
    {
      for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
        {
          for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
            {
              Cp->matches.clear( );
            }
          Sp->clusters.clear( );
        }
    }

  Syntenys.clear( );

  return;
}


vector<Cluster>::iterator getForwardTargetCluster
     (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
      long int & targetA, long int & targetB)

     //  Return the cluster that is most likely to successfully join (in a
     //  forward direction) with the current cluster. The returned cluster
     //  must contain 1 or more matches that are strictly greater than the end
     //  of the current cluster. The targeted cluster must also be on a
     //  diagonal close enough to the current cluster, so that a connection
     //  could possibly be made by the alignment extender. Also, this targeted
     //  cluster must be consistent in frame with the current cluster. Assumes
     //  clusters have been sorted via AscendingClusterSort. Returns targeted
     //  cluster and stores the target coordinates in targetA and targetB. If no
     //  suitable cluster was found, the function will return NULL and target
     //  A and targetB will remain unchanged.

{
  vector<Match>::iterator Mip;               // match iteratrive pointer
  vector<Cluster>::iterator Cp;              // cluster pointer
  vector<Cluster>::iterator Cip;             // cluster iterative pointer
  long int eA, eB;                           // possible target
  long int greater, lesser;                  // gap sizes between two clusters
  long int sA = CurrCp->matches.rbegin( )->sA +
    CurrCp->matches.rbegin( )->len - 1;      // the endA of the current cluster
  long int sB = CurrCp->matches.rbegin( )->sB +
    CurrCp->matches.rbegin( )->len - 1;      // the endB of the current cluster

  //-- End of sequences is the default target, set distance accordingly
  long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB);

  //-- For all clusters greater than the current cluster (on sequence A)
  Cp = Clusters.end( );
  for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ )
    {
      //-- If the cluster is in the same frame
      if ( CurrCp->frameA == Cip->frameA  &&  CurrCp->frameB == Cip->frameB )
        {
          eA = Cip->matches.begin( )->sA;
          eB = Cip->matches.begin( )->sB;

          //-- If the cluster overlaps the current cluster, strip some matches
          if ( ( eA < sA  ||  eB < sB )  &&
              Cip->matches.rbegin( )->sA >= sA  &&
              Cip->matches.rbegin( )->sB >= sB )
            {
              for ( Mip = Cip->matches.begin( );
                    Mip < Cip->matches.end( )  &&  ( eA < sA  ||  eB < sB );
                    Mip ++ )
                {
                  eA = Mip->sA;
                  eB = Mip->sB;
                }
            }

          //-- If the cluster is strictly greater than current cluster
          if ( eA >= sA  &&  eB >= sB )
            {
              if ( eA - sA > eB - sB )
                {
                  greater = eA - sA;
                  lesser = eB - sB;
                }
              else
                {
                  lesser = eA - sA;
                  greater = eB - sB;
                }

              //-- If the cluster is close enough
              if ( greater <= getBreakLen( )  ||
                   (lesser) * GOOD_SCORE [getMatrixType( )] +
                   (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
                {
                  Cp = Cip;
                  targetA = eA;
                  targetB = eB;
                  break;
                }
              else if ( (greater << 1) - lesser < dist )
                {
                  Cp = Cip;
                  targetA = eA;
                  targetB = eB;
                  dist = (greater << 1) - lesser;
                }
            }
        }
    }


  return Cp;
}


vector<Alignment>::iterator getReverseTargetAlignment
     (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp)

     //  Return the alignment that is most likely to successfully join (in a
     //  reverse direction) with the current alignment. The returned alignment
     //  must be strictly less than the current cluster and be on a diagonal
     //  close enough to the current alignment, so that a connection
     //  could possibly be made by the alignment extender. Also, this targeted
     //  alignment must be consistent in frame with the current alignment.
     //  Assumes clusters have been sorted via AscendingClusterSort and
     //  processed in order, so therefore all alignments are in order by their
     //  start A coordinate.

{
  vector<Alignment>::iterator Ap;        // alignment pointer
  vector<Alignment>::iterator Aip;       // alignment iterative pointer
  long int eA, eB;                       // possible targets
  long int greater, lesser;              // gap sizes between the two alignments
  long int sA = CurrAp->sA;              // the startA of the current alignment
  long int sB = CurrAp->sB;              // the startB of the current alignment

  //-- Beginning of sequences is the default target, set distance accordingly
  long int dist = (sA < sB ? sA : sB);

  //-- For all alignments less than the current alignment (on sequence A)
  Ap = Alignments.end( );
  for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- )
    {
      //-- If the alignment is on the same direction
      if ( CurrAp->frameA == Aip->frameA  &&  CurrAp->frameB == Aip->frameB )
        {
          eA = Aip->eA;
          eB = Aip->eB;

          //-- If the alignment is strictly greater than current cluster
          if ( eA <= sA  && eB <= sB )
            {
              if ( sA - eA > sB - eB )
                {
                  greater = sA - eA;
                  lesser = sB - eB;
                }
              else
                {
                  lesser = sA - eA;
                  greater = sB - eB;
                }

              //-- If the cluster is close enough
              if ( greater <= getBreakLen( )  ||
                   (lesser) * GOOD_SCORE [getMatrixType( )] +
                   (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
                {
                  Ap = Aip;
                  break;
                }
              else if ( (greater << 1) - lesser < dist )
                {
                  Ap = Aip;
                  dist = (greater << 1) - lesser;
                }
            }
        }
    }


  return Ap;
}


bool isShadowedCluster
     (vector<Cluster>::iterator CurrCp,
      vector<Alignment> & Alignments, vector<Alignment>::iterator Ap)

     //  Check if the current cluster is shadowed by a previously produced
     //  alignment region. Return true if it is, else false.

{
  vector<Alignment>::iterator Aip;         // alignment pointer

  long int sA = CurrCp->matches.begin( )->sA;
  long int eA = CurrCp->matches.rbegin( )->sA +
                CurrCp->matches.rbegin( )->len - 1;
  long int sB = CurrCp->matches.begin( )->sB;
  long int eB = CurrCp->matches.rbegin( )->sB +
                CurrCp->matches.rbegin( )->len - 1;

  if ( ! Alignments.empty( ) )             // if there are alignments to use
    {
      //-- Look backwards in hope of finding a shadowing alignment
      for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- )
        {
          //-- If in the same frames and shadowing the current cluster, break
          if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB )
            if ( Aip->eA >= eA  &&  Aip->eB >= eB )
              if ( Aip->sA <= sA  &&  Aip->sB <= sB )
                break;
        }

      //-- Return true if the loop was broken, i.e. shadow found
      if ( Aip >= Alignments.begin( ) )
        return true;
    }

  //-- Return false if Alignments was empty or loop was not broken
  return false;
}


void parseAbort
     (const char * s)

     //  Abort the program if there was an error in parsing file 's'

{
  fprintf (stderr,
          "ERROR: Could not parse input from '%s'. \n"
          "Please check the filename and format, or file a bug report\n", s);
  exit (EXIT_FAILURE);
}


void parseDelta
     (vector<Alignment> & Alignments,
      const FastaRecord * Af, const FastaRecord *Bf)

     // Use the delta information to generate the error counts for each
     // alignment, and fill this information into the data type

{
  //  pointers to Af.seq and Bf.seq
  char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
  char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};

  char ch1, ch2;
  long int Delta;
  int Sign;
  long int i, Apos, Bpos;
  long int Remain, Total;
  long int Errors, SimErrors;
  long int NonAlphas;
  vector<Alignment>::iterator Ap;
  vector<long int>::iterator Dp;

  for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
    {
      if ( A [Ap->frameA] == NULL )
	{
	  A [Ap->frameA] = (char *) Safe_malloc
	    ( sizeof(char) * ( (Af->len / 3) + 2 ) );
	  A [Ap->frameA] [0] = '\0';
	  Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA );
	}
      if ( B [Ap->frameB] == NULL )
	{
	  B [Ap->frameB] = (char *) Safe_malloc
	    ( sizeof(char) * ( (Bf->len / 3) + 2) );
	  B [Ap->frameB] [0] = '\0';
	  Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB );
	}

      Apos = Ap->sA;
      Bpos = Ap->sB;

      Errors = 0;
      SimErrors = 0;
      NonAlphas = 0;
      Remain = Ap->eA - Ap->sA + 1;
      Total = Remain;

      //-- For all delta's in this alignment
      for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
	{
	  Delta = *Dp;
	  Sign = Delta > 0 ? 1 : -1;
	  Delta = labs ( Delta );

	  //-- For all the bases before the next indel
	  for ( i = 1; i < Delta; i ++ )
	    {
	      ch1 = A [Ap->frameA] [Apos ++];
	      ch2 = B [Ap->frameB] [Bpos ++];

	      if ( !isalpha (ch1) )
		{
		  ch1 = STOP_CHAR;
		  NonAlphas ++;
		}
	      if ( !isalpha (ch2) )
		{
		  ch2 = STOP_CHAR;
		  NonAlphas ++;
		}

	      ch1 = toupper(ch1);
	      ch2 = toupper(ch2);
	      if ( 1 > MATCH_SCORE
		   [getMatrixType( )]
		   [ch1 - 'A']
		   [ch2 - 'A'] )
		SimErrors ++;
	      if ( ch1 != ch2 )
		Errors ++;
	    }

	  //-- Process the current indel
	  Remain -= i - 1;
	  Errors ++;
	  SimErrors ++;

	  if ( Sign == 1 )
	    {
	      if ( !isalpha (A [Ap->frameA] [Apos ++]) )
		NonAlphas ++;
	      Remain --;
	    }
	  else
	    {
	      if ( !isalpha (B [Ap->frameB] [Bpos ++]) )
		NonAlphas ++;
	      Total ++;
	    }
	}

      //-- For all the bases after the final indel
      for ( i = 0; i < Remain; i ++ )
	{
	  //-- Score character match and update error counters
	  ch1 = A [Ap->frameA] [Apos ++];
	  ch2 = B [Ap->frameB] [Bpos ++];

	  if ( !isalpha (ch1) )
	    {
	      ch1 = STOP_CHAR;
	      NonAlphas ++;
	    }
	  if ( !isalpha (ch2) )
	    {
	      ch2 = STOP_CHAR;
	      NonAlphas ++;
	    }

	  ch1 = toupper(ch1);
	  ch2 = toupper(ch2);
	  if ( 1 > MATCH_SCORE
	       [getMatrixType( )]
	       [ch1 - 'A']
	       [ch2 - 'A'] )
	    SimErrors ++;
	  if ( ch1 != ch2 )
	    Errors ++;
	}

      Ap->Errors = Errors;
      Ap->SimErrors = SimErrors;
      Ap->NonAlphas = NonAlphas;
    }

  for ( i = 1; i <= 6; i ++ )
    {
      if ( A [i] != NULL )
        free ( A [i] );
      A [i] = NULL;

      if ( B [i] != NULL )
        free ( B [i] );
      B [i] = NULL;
    }

  return;
}


void processSyntenys
     (vector<Synteny> & Syntenys, FastaRecord * Af, long int As,
      FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile)

     //  For each syntenic region with clusters, read in the B sequence and
     //  extend the clusters to expand total alignment coverage. Clusters should
     //  still reference the amino acid concatenations, the translation back to
     //  DNA will be taken care of by the extendClusters function.
     //      processSyntenys should ONLY be called once all the clusters for
     //  the contained syntenic regions have been stored in the data structure.
     //  Frees the memory used by the the syntenic regions once the output of
     //  extendClusters and flushSyntenys has been produced.

{
  FastaRecord Bf;                     // the current B sequence
  Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1));

  long int InitSize = INIT_SIZE;      // the initial memory capacity of B
  vector<Synteny>::iterator CurrSp;   // current synteny pointer

  //-- Initialize the B sequence storage
  Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize );

  //-- For all the contained syntenys
  for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ )
    {
      //-- If no clusters, ignore
      if ( CurrSp->clusters.empty( ) )
	continue;

      //-- If a B sequence not seen yet, read it in
      //-- IMPORTANT: The B sequences in the synteny object are assumed to be
      //      ordered as output by mgaps, if they are not in order the program
      //      will fail. (All like tags must be adjacent and in the same order
      //      as the query file)
      if ( CurrSp == Syntenys.begin( )  ||
	   strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 )
	{
	  //-- Read in the B sequence
	  while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) )
	    if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 )
	      break;
	  if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 )
	    {
	      fprintf(stderr,"%s\n", CurrSp->Bf.Id);
	      parseAbort ( "Query File" );
	    }
	  Bf.len = strlen (Bf.seq + 1);
	}

      //-- Extend clusters and create the alignment information
      CurrSp->Bf.len = Bf.len;
      extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile);
    }

  //-- Create the cluster information and clear the Synteny structure
  flushSyntenys (Syntenys, ClusterFile);

  free (Bf.Id);
  free (Bf.seq);

  return;
}


inline long int transC
     (long int Coord, int Frame, long int Len)

     //  Translate an amino acid coordinate to a nucleotide coordinate
     //  relative to the forward strand. The Len parameter should be the
     //  length of the original DNA strand.

{
  assert ( Frame > 0 && Frame < 7 );
  if ( Frame > 3 )
    return revC ( (Coord * 3) - (3 - (Frame - 3)), Len );
  else
    return (Coord * 3) - (3 - (Frame));
}


inline long int refLen
     (long int ntLen)

     //  Return the length of the concatenated amino acid sequence frames,
     //  including the appended 'X' at the end of each sequence frame for the
     //  given DNA input length. (The returned length will be the length of
     //  the concatenated frames for this sequence as output by prepro.cc)

{
  return (ntLen + 1) << 1;
}


inline long int revC
     (long int Coord, long int Len)

     //  Reverse complement the given coordinate for the given length.

{
  assert (Len - Coord + 1 > 0);
  return (Len - Coord + 1);
}


void printHelp
     (const char * s)

     //  Display the program's help information to stderr.

{
  fprintf (stderr,
    "\nUSAGE: %s  [options]  <reference>  <query>  <pfx>  <  <input>\n\n", s);
  fprintf (stderr,
     "-b int   set the alignment break (give-up) length to int (amino acids)\n"
     "-d       output only match clusters rather than extended alignments\n"
     "-e       do not extend alignments outward from clusters\n"
     "-h       display help information\n"
     "-t       force alignment to ends of sequence if within -b distance\n"
     "-x type  set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n"
     "         other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n");
  fprintf (stderr,
          "  Input is the output of the \"mgaps\" program from stdin, and\n"
          "the two original PROmer sequence files passed on the command\n"
          "line. <pfx> is the prefix to be added to the front of the\n"
          "output file <pfx>.delta\n"
          "  <pfx>.delta is the alignment object that catalogs the distance\n"
          "between insertions and deletions. For further information\n"
          "regarding this file, please refer to the documentation under\n"
          "the .delta output description.\n\n");
  return;
}


void printUsage
     (const char * s)

     //  Display the program's usage information to stderr.

{
  fprintf (stderr,
    "\nUSAGE: %s  [options]  <reference>  <query>  <pfx>  <  <input>\n\n", s);
  fprintf (stderr, "Try '%s -h' for more information.\n", s);
  return;
}


void validateData
     (vector<Alignment> Alignments, vector<Cluster> Clusters,
      const FastaRecord * Af, const FastaRecord * Bf)

     //  Self test function to check that the cluster and alignment information
     //  is valid

{
  char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
  char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};

  long int Alen [7], Blen [7];         // the length of the amino acid sequences
  vector<Cluster>::iterator Cp;
  vector<Match>::iterator Mp;
  vector<Alignment>::iterator Ap;
  long int x, y, i;
  char Xc, Yc;

  for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ )
    {
      assert ( Cp->wasFused );

      //-- Initialize the right amino acid sequence for A and B if need be
      if ( A [Cp->frameA] == NULL )
        {
	  A [Cp->frameA] = (char *) Safe_malloc
	    ( sizeof(char) * ( (Af->len / 3) + 2) );
	  A [Cp->frameA] [0] = '\0';
	  Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA],
						  Cp->frameA );
        }
      if ( B [Cp->frameB] == NULL )
        {
	  B [Cp->frameB] = (char *) Safe_malloc
	    ( sizeof(char) * ( (Bf->len / 3) + 2) );
	  B [Cp->frameB] [0] = '\0';
	  Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB],
						  Cp->frameB );
        }

      for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
	{
	  //-- assert for each match in cluster, it is indeed a match
	  x = Mp->sA;
	  y = Mp->sB;
	  for ( i = 0; i < Mp->len; i ++ )
	    assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] );

	  //-- assert for each match in cluster, it is contained in an alignment
	  for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
	    {
	      if ( Ap->sA <= Mp->sA  &&  Ap->sB <= Mp->sB  &&
                   Ap->eA >= Mp->sA + Mp->len - 1  &&
                   Ap->eB >= Mp->sB + Mp->len - 1 )
                break;
            }
          assert ( Ap < Alignments.end( ) );
	}
    }

  //-- assert alignments are optimal (quick check if first and last chars equal)
  for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
    {
      assert ( Ap->sA <= Ap->eA );
      assert ( Ap->sB <= Ap->eB );

      assert ( Ap->sA >= 1 && Ap->sA <= Af->len );
      assert ( Ap->eA >= 1 && Ap->eA <= Af->len );
      assert ( Ap->sB >= 1 && Ap->sB <= Bf->len );
      assert ( Ap->eB >= 1 && Ap->eB <= Bf->len );

      Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ?
		   A[Ap->frameA][Ap->sA] : STOP_CHAR);
      Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ?
		   B[Ap->frameB][Ap->sB] : STOP_CHAR);
      assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );

      Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ?
		   A[Ap->frameA][Ap->eA] : STOP_CHAR);
      Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ?
		   B[Ap->frameB][Ap->eB] : STOP_CHAR);
      assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
    }

  for ( i = 0; i < 7; i ++ )
    {
      if ( A [i] != NULL )
	free ( A [i] );
      if ( B [i] != NULL )
	free ( B [i] );
    }

  return;
}
author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children