Mercurial > repos > rliterman > csp2

diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postpro.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author: jpayne
date: Tue, 18 Mar 2025 17:55:14 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postpro.cc	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,1665 @@
+//------------------------------------------------------------------------------
+//   Programmer: Adam M Phillippy, The Institute for Genomic Research
+//         File: postpro.cc
+//         Date: 08 / 20 / 2002
+//
+//      Purpose: To translate the coordinates referencing the concatenated
+//              reference sequences back to the individual sequences and deal
+//             with any conflict that may have arisen (i.e. a MUM that spans
+//            the boundry between two sequences). Then to extend each cluster
+//           via Smith-Waterman techniques to expand the alignment coverage.
+//          Alignments which encounter each other will be fused to form one
+//         encompasing alignment when appropriate.
+//
+//        Input: Input is the output of the .mgaps program from stdin. On the
+//              command line, the file names of the two original sequence files
+//             should come first, followed by the prefix <pfx> that should be
+//            placed in front of the two output filenames <pfx>.cluster and
+//           <pfx>.delta
+//
+// NOTE: Cluster file is now suppressed by default (see -d option).
+//
+//       Output: Output is to two output files, <pfx>.cluster and <pfx>.delta.
+//              <pfx>.cluster lists MUM clusters as identified by "mgaps".
+//             However, the clusters now reference their corresponding
+//            sequence and are all listed under headers that specify this
+//           sequence. In addition, the coordinates now reference the original
+//          DNA input and not the amino acid translations. The <pfx>.delta file
+//         is the alignment object that contains all the information necessary
+//        to reproduce the alignments generated by the MUM extension process.
+//       The coordinates in this file reference the original DNA input, however
+//      the delta values represent amino acids, so 1 delta = 3 nucleotides.
+//     Please refer to the output file documentation for an in-depth
+//    description of these file formats.
+//
+//        Usage: postpro  <reference>  <query>  <pfx>  <  <input>
+//           Where <reference> and <query> are the original input sequences of
+//          PROmer and <pfx> is the prefix that should be added to the
+//         beginning of the <pfx>.cluster and <pfx>.delta output filenames.
+//
+//------------------------------------------------------------------------------
+
+//-- NOTE: this option will significantly hamper program performance,
+//         mostly the alignment extension performance (sw_align.h)
+//#define _DEBUG_ASSERT       // self testing assert functions
+
+#include "tigrinc.hh"
+#include "sw_align.hh"
+#include "translate.hh"
+#include <vector>
+#include <algorithm>
+using namespace std;
+
+
+
+
+//------------------------------------------------------ Globals -------------//
+bool DO_DELTA = true;
+bool DO_EXTEND = true;
+bool TO_SEQEND = false;
+
+
+//------------------------------------------------------ Type Definitions ----//
+enum LineType
+     //-- The type of input line from <stdin>
+{
+  NO_LINE, HEADER_LINE, MATCH_LINE
+};
+
+
+struct FastaRecord
+     //-- The essential data of a sequence
+{
+  char * Id;                 // the fasta ID header tag
+  long int len;              // the length of the sequence
+  char * seq;                // the sequence data
+};
+
+
+struct Match
+     //-- An exact match between two sequences A and B
+{
+  long int sA, sB, len;      // start coordinate in A, in B and the length
+};
+
+
+struct Cluster
+     //-- An ordered list of matches between two sequences A and B
+{
+  bool wasFused;          // have the cluster matches been extended yet?
+  int frameA;                // the reference sequence frame (1-6)
+  int frameB;                // the query sequence frame
+  vector<Match> matches;     // the ordered set of matches in the cluster
+};
+
+
+struct Synteny
+     //-- An ordered list of clusters between two sequences A and B
+{
+  FastaRecord * AfP;         // a pointer to the reference sequence record
+  FastaRecord Bf;            // the query sequence record (w/o the sequence)
+  vector<Cluster> clusters;  // the ordered set of clusters between A and B
+};
+
+
+struct Alignment
+     //-- An alignment object between two sequences A and B
+{
+  int frameA;                // the reference sequence frame (1-6)
+  int frameB;                // the query sequence frame
+  long int sA, sB, eA, eB;   // the start in A, B and the end in A, B
+  vector<long int> delta;         // the delta values, with NO zero at the end
+  long int deltaApos;        // sum of abs(deltas) - #of negative deltas
+                             //      trust me, it is a very helpful value
+  long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
+};
+
+
+struct AscendingClusterSort
+     //-- For sorting clusters in ascending order of their sA coordinate
+{
+  bool operator() (const Cluster & pA, const Cluster & pB)
+  {
+    return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA );
+  }
+};
+
+
+
+
+
+//------------------------------------------------- Function Declarations ----//
+void addNewAlignment
+     (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
+      vector<Match>::iterator Mp);
+
+bool extendBackward
+     (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
+      vector<Alignment>::iterator TargetAp, const char * A, const char * B);
+
+bool extendForward
+     (vector<Alignment>::iterator Ap, const char * A, long int targetA,
+      const char * B, long int targetB, unsigned int m_o);
+
+void extendClusters
+     (vector<Cluster> & Clusters,
+      const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile);
+
+void flushAlignments
+     (vector<Alignment> & Alignments,
+      const FastaRecord * Af, const FastaRecord * Bf,
+      FILE * DeltaFile);
+
+void flushSyntenys
+     (vector<Synteny> & Syntenys, FILE * ClusterFile);
+
+vector<Cluster>::iterator getForwardTargetCluster
+     (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
+      long int & targetA, long int & targetB);
+
+vector<Alignment>::iterator getReverseTargetAlignment
+     (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp);
+
+bool isShadowedCluster
+     (vector<Cluster>::iterator CurrCp,
+      vector<Alignment> & Alignments, vector<Alignment>::iterator Ap);
+
+void parseAbort
+     (const char *);
+
+void parseDelta
+     (vector<Alignment> & Alignments,
+      const FastaRecord * Af, const FastaRecord *Bf);
+
+void processSyntenys
+     (vector<Synteny> & Syntenys,
+      FastaRecord * Af, long int As,
+      FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile);
+
+inline long int transC
+     (long int Coord, int Frame, long int Len);
+
+inline long int refLen
+     (long int ntLen);
+
+inline long int revC
+     (long int Coord, long int Len);
+
+void printHelp
+     (const char *);
+
+void printUsage
+     (const char *);
+
+void validateData
+     (vector<Alignment> Alignments, vector<Cluster> Clusters,
+      const FastaRecord * Af, const FastaRecord * Bf);
+
+
+
+//-------------------------------------------------- Function Definitions ----//
+int main
+     (int argc, char ** argv)
+{
+  FastaRecord * Af;           // array of all the reference sequences
+
+  vector<Synteny> Syntenys;                  // vector of all sets of clusters
+  vector<Synteny>::reverse_iterator CurrSp;  // current set of clusters
+  vector<Synteny>::reverse_iterator Sp;      // temporary synteny pointer
+
+  Synteny Asyn;               // a single set of clusters
+  Cluster Aclu;               // a single cluster of matches
+  Match Amat;                 // a single match
+
+  LineType PrevLine;          // the current input line
+
+  bool Found;                 // temporary search flag
+
+  char Line [MAX_LINE];       // a single line of input
+  char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE];   // fasta ID headers
+  char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names
+  char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
+
+  int FrameA, FrameB;                // reading frames (1-6)
+  int CurrFrameA, CurrFrameB;
+
+  long int i;                        // temporary counter
+  long int Seqi;                     // current reference sequence index
+  long int InitSize;                 // initial sequence memory space
+  long int As = 0;                   // the size of the reference array
+  long int Ac = 100;                 // the capacity of the reference array
+  long int sA, sB, len;              // current match start in A, B and length
+
+  FILE * RefFile, * QryFile;         // reference and query input files
+  FILE * ClusterFile, * DeltaFile;   // cluster and delta output files
+
+  //-- Set the alignment data type and break length (sw_align.h)
+  setMatrixType ( BLOSUM62 );
+  setBreakLen ( 60 );
+
+  //-- Parse the command line arguments
+  {
+    optarg = NULL;
+    int ch, errflg = 0;
+    while ( !errflg  &&  ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) )
+      switch (ch)
+        {
+        case 'b' :
+          setBreakLen (atoi (optarg));
+          break;
+
+	case 'd' :
+	  DO_DELTA = false;
+	  break;
+
+	case 'e' :
+	  DO_EXTEND = false;
+	  break;
+
+        case 'h' :
+          printHelp (argv[0]);
+          exit (EXIT_SUCCESS);
+          break;
+
+	case 't' :
+	  TO_SEQEND = true;
+	  break;
+
+	case 'x' :
+	  setMatrixType ( atoi (optarg) );
+	  if ( getMatrixType( ) == NUCLEOTIDE )
+	    {
+	      fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n",
+		       getMatrixType( ));
+	      setMatrixType ( BLOSUM62 );
+	    }
+	  break;
+
+        default :
+          errflg ++;
+        }
+    
+    if ( errflg > 0 || argc - optind != 3 )
+      {
+        printUsage (argv[0]);
+        exit (EXIT_FAILURE);
+      }
+  }
+
+  //-- Read and create the I/O file names
+  strcpy (RefFileName, argv[optind ++]);
+  strcpy (QryFileName, argv[optind ++]);
+  strcpy (ClusterFileName, argv[optind ++]);
+  strcpy (DeltaFileName, ClusterFileName);
+  strcat (ClusterFileName, ".cluster");
+  strcat (DeltaFileName, ".delta");
+
+  //-- Open all the files
+  RefFile = File_Open (RefFileName, "r");
+  QryFile = File_Open (QryFileName, "r");
+  if ( DO_DELTA )
+    {
+      ClusterFile = NULL;
+      DeltaFile = File_Open (DeltaFileName, "w");
+    }
+  else
+    {
+      ClusterFile = File_Open (ClusterFileName, "w");
+      DeltaFile = NULL;
+    }
+
+  //-- Print the headers of the output files
+  if ( DO_DELTA )
+    fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
+  else
+    fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
+
+  //-- Generate the array of the reference sequences
+  InitSize = INIT_SIZE;
+  Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac );
+  Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
+  while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) )
+    {
+      Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1));
+      strcpy (Af[As].Id, IdA);
+
+      Af[As].len = strlen (Af[As].seq + 1);
+
+      if ( ++ As >= Ac )
+        {
+          Ac *= 2;
+          Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac );
+        }
+
+      InitSize = INIT_SIZE;
+      Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
+    }
+  fclose (RefFile);
+  if ( As <= 0 )
+    parseAbort ( RefFileName );
+
+
+
+  //-- Process the input from <stdin> line by line
+  PrevLine = NO_LINE;
+  IdA[0] = '\0';
+  IdB[0] = '\0';
+  FrameA = 0;
+  FrameB = 0;
+  CurrFrameA = -1;
+  CurrFrameB = -1;
+  CurrIdB[0] = '\0';
+  while ( fgets(Line, MAX_LINE, stdin) != NULL )
+    {
+
+      //-- If the current line is a fasta HEADER_LINE
+      if ( Line[0] == '>' )
+        {
+          if ( sscanf (Line + 1, "%s", CurrIdB) != 1 )
+            parseAbort ("stdin");
+
+	  sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB);
+	  CurrIdB [strlen(CurrIdB) - 2] = '\0';
+
+          PrevLine = HEADER_LINE;
+        }
+
+
+      //-- If the current line is a cluster HEADER_LINE
+      else if ( Line[0] == '#' )
+        {
+          PrevLine = HEADER_LINE;
+        }
+
+
+      //-- If the current line is a MATCH_LINE
+      else
+        {
+          if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 )
+            parseAbort ("stdin");
+
+          //-- Re-map the reference coordinate back to its original sequence
+	  //   NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length
+	  //   including the appended 'X' on each frame translation
+          for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ )
+	    sA -= refLen (Af[Seqi].len);
+
+	  assert ( Seqi < As );
+
+	  //-- Get the correct frame, translate startA coordinate to frame
+	  for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ )
+	    sA -= (Af[Seqi].len - (i % 3)) / 3 + 1;
+	  CurrFrameA = i + 1;
+	  assert ( CurrFrameA > 0  &&  CurrFrameA < 7 );
+
+	  //-- If the match spans across a frame boundry
+	  if ( CurrFrameA < 1  ||  CurrFrameA > 6  ||
+	       sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 ||
+	       sA <= 0 )
+	    {
+	      fprintf (stderr,
+                 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
+                 "         Reference sequence '>%s', frame %d\n"
+                 "Please file a bug report\n"
+                 "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA);
+              continue;
+	    }
+
+          //-- If the match spans across a sequence boundry
+          if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 )
+            {
+              fprintf (stderr,
+                 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
+                 "         Reference sequence '>%s'\n"
+                 "Please file a bug report\n"
+                 "Attempting to continue.\n", Af[Seqi].Id);
+              continue;
+            }
+
+          //-- Check and update the current synteny region
+          if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 ||
+	       CurrFrameA != FrameA || CurrFrameB != FrameB )
+            {
+              Found = false;
+              if ( strcmp (IdB, CurrIdB) == 0 )
+                {
+                  //-- Has this header been seen before?
+                  for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ )
+                    {
+                      if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 )
+                        {
+                          assert ( Sp->AfP->len == Af[Seqi].len );
+                          assert ( strcmp (Sp->Bf.Id, IdB) == 0 );
+                          CurrSp = Sp;
+                          Found = true;
+                          break;
+                        }
+                    }
+                }
+              else
+                {
+                  //-- New B sequence header, process all the old synteny's
+                  processSyntenys (Syntenys, Af, As,
+                                   QryFile, ClusterFile, DeltaFile);
+
+                }
+              
+              strcpy (IdA, Af[Seqi].Id);
+              strcpy (IdB, CurrIdB);
+	      FrameA = CurrFrameA;
+	      FrameB = CurrFrameB;
+
+              //-- If not seen yet, create a new synteny region
+              if ( ! Found )   
+                {
+                  Asyn.AfP = Af + Seqi;
+                  Asyn.Bf.len = -1;
+		  Asyn.Bf.Id = (char *) Safe_malloc
+		    (sizeof(char) * (strlen(IdB) + 1));
+                  strcpy (Asyn.Bf.Id, IdB);
+
+                  Syntenys.push_back (Asyn);
+                  CurrSp = Syntenys.rbegin( );
+                }
+
+              //-- Add a new cluster to the current synteny
+              if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
+		if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
+		  CurrSp->clusters.pop_back( ); // hack to remove empties
+	      Aclu.frameA = FrameA;
+	      Aclu.frameB = FrameB;
+              Aclu.wasFused = false;
+              CurrSp->clusters.push_back (Aclu);
+            }
+          else if ( PrevLine == HEADER_LINE )
+            {
+              //-- Add a new cluster to the current synteny
+              if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
+		if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
+		  CurrSp->clusters.pop_back( );
+	      Aclu.frameA = FrameA;
+	      Aclu.frameB = FrameB;
+              Aclu.wasFused = false;
+              CurrSp->clusters.push_back (Aclu);
+            }
+
+          //-- Add a new match to the current cluster
+	  //   NOTE: A and B coordinates still reference the appropriate
+	  //   amino acid sequence frame, not the DNA (same with len)
+	  if ( len > 1 )
+	    {
+	      Amat.sA = sA;
+	      Amat.sB = sB;
+	      Amat.len = len;
+	      CurrSp->clusters.rbegin( )->matches.push_back (Amat);
+	    }
+
+          PrevLine = MATCH_LINE;
+        }
+    }
+
+  //-- Process the left-over syntenys
+  if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
+    if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
+      CurrSp->clusters.pop_back( );
+  processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile);
+  fclose (QryFile);
+
+  //-- Free the reference sequences
+  for ( i = 0; i < As; i ++ )
+    {
+      free (Af[i].Id);
+      free (Af[i].seq);
+    }
+  free (Af);
+
+  return EXIT_SUCCESS;
+}
+
+
+
+
+void addNewAlignment
+     (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
+      vector<Match>::iterator Mp)
+
+     //  Create and add a new alignment object based on the current match
+     //  and cluster information pointed to by Cp and Mp.
+
+{
+  Alignment Align;
+
+  //-- Initialize the data
+  Align.sA = Mp->sA;
+  Align.sB = Mp->sB;
+  Align.eA = Mp->sA + Mp->len - 1;
+  Align.eB = Mp->sB + Mp->len - 1;
+  Align.frameA = Cp->frameA;
+  Align.frameB = Cp->frameB;
+  Align.delta.clear( );
+  Align.deltaApos = 0;
+
+  //-- Add the alignment object
+  Alignments.push_back (Align);
+
+  return;
+}
+
+
+
+
+bool extendBackward
+     (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
+      vector<Alignment>::iterator TargetAp, const char * A, const char * B)
+
+     //  Extend an alignment backwards off of the current alignment object.
+     //  The current alignment object must be freshly created and consist
+     //  only of an exact match (i.e. the delta vector MUST be empty).
+     //  If the TargetAp alignment object is reached by the extension, it will
+     //  be merged with CurrAp and CurrAp will be destroyed. If TargetAp is
+     //  NULL the function will extend as far as possible. It is a strange
+     //  and dangerous function because it can delete CurrAp, so edit with
+     //  caution. Returns true if TargetAp was reached and merged, else false
+     //  Designed only as a subroutine for extendClusters, should be used
+     //  nowhere else.
+
+{
+  bool target_reached = false;
+  bool overflow_flag = false;
+  bool double_flag = false;
+
+  vector<long int>::iterator Dp;
+
+  unsigned int m_o;
+  long int targetA, targetB;
+
+  m_o = BACKWARD_SEARCH;
+
+  //-- Set the target coordinates
+  if ( TargetAp != Alignments.end( ) )
+    {
+      targetA = TargetAp->eA;
+      targetB = TargetAp->eB;
+    }
+  else
+    {
+      targetA = 1;
+      targetB = 1;
+      m_o |= OPTIMAL_BIT;
+    }
+
+  //-- If alignment is too long, bring with bounds and set overflow_flag true
+  if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH )
+    {
+      targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1;
+      overflow_flag = true;
+      m_o |= OPTIMAL_BIT;
+    }
+  if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH )
+    {
+      targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1;
+      if ( overflow_flag )
+	double_flag = true;
+      else
+	overflow_flag = true;
+      m_o |= OPTIMAL_BIT;
+    }
+
+
+  if ( TO_SEQEND && !double_flag )
+    m_o |= SEQEND_BIT;
+
+
+  //-- Attempt to reach the target
+  target_reached = alignSearch (A, CurrAp->sA, targetA,
+                                B, CurrAp->sB, targetB, m_o);
+
+  if ( overflow_flag  ||  TargetAp == Alignments.end( ) )
+    target_reached = false;
+
+  if ( target_reached )
+    {
+      //-- assert that CurrAp is new and at the end of the Alignment vector
+      assert ( CurrAp->delta.empty( ) );
+      assert ( CurrAp == Alignments.end( ) - 1 );
+
+      //-- Merge the two alignment objects
+      extendForward (TargetAp, A, CurrAp->sA,
+                               B, CurrAp->sB, FORCED_FORWARD_ALIGN);
+      TargetAp->eA = CurrAp->eA;
+      TargetAp->eB = CurrAp->eB;
+      Alignments.pop_back( );
+    }
+  else
+    {
+      alignTarget (A, targetA, CurrAp->sA,
+                   B, targetB, CurrAp->sB,
+                   CurrAp->delta, FORCED_FORWARD_ALIGN);
+      CurrAp->sA = targetA;
+      CurrAp->sB = targetB;
+
+      //-- Update the deltaApos value for the alignment object
+      for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ )
+        CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
+    }
+
+  return target_reached;
+}
+
+
+
+
+bool extendForward
+     (vector<Alignment>::iterator CurrAp, const char * A, long int targetA,
+      const char * B, long int targetB, unsigned int m_o)
+
+     //  Extend an alignment forwards off the current alignment object until
+     //  target or end of sequence is reached, and merge the delta values of the
+     //  alignment object with the new delta values generated by the extension.
+     //  Return true if the target was reached, else false
+
+{
+  long int ValA;
+  long int ValB;
+  unsigned int Di;
+  bool target_reached;
+  bool overflow_flag = false;
+  bool double_flag = false;
+  vector<long int>::iterator Dp;
+
+  //-- Set Di to the end of the delta vector
+  Di = CurrAp->delta.size( );
+
+  //-- Calculate the distance between the start and end positions
+  ValA = targetA - CurrAp->eA + 1;
+  ValB = targetB - CurrAp->eB + 1;
+
+  //-- If the distance is too long, shrink it and set the overflow_flag
+  if ( ValA > MAX_ALIGNMENT_LENGTH )
+    {
+      targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1;
+      overflow_flag = true;
+      m_o |= OPTIMAL_BIT;
+    }
+  if ( ValB > MAX_ALIGNMENT_LENGTH )
+    {
+      targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1;
+      if ( overflow_flag )
+	double_flag = true;
+      else
+	overflow_flag = true;
+      m_o |= OPTIMAL_BIT;
+    }
+
+  if ( double_flag )
+    m_o &= ~SEQEND_BIT;
+
+  target_reached = alignTarget (A, CurrAp->eA, targetA,
+                                B, CurrAp->eB, targetB,
+                                CurrAp->delta, m_o);
+
+  //-- Notify user if alignment was chopped short
+  if ( target_reached  &&  overflow_flag )
+    target_reached = false;
+
+  //-- Pick up delta where left off (Di) and merge with new deltas
+  if ( Di < CurrAp->delta.size( ) )
+    {
+      //-- Merge the deltas
+      ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1;
+      CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA);
+      if ( CurrAp->delta[Di] == 0  ||  ValA < 0 )
+	{
+	  fprintf(stderr,
+                  "ERROR: failed to merge alignments at position %ld\n"
+                  "       Please file a bug report\n",
+                  CurrAp->eA);
+          exit (EXIT_FAILURE);
+	}
+
+      //-- Update the deltaApos
+      for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ )
+        CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
+    }
+
+  //-- Set the alignment coordinates
+  CurrAp->eA = targetA;
+  CurrAp->eB = targetB;
+
+  return target_reached;
+}
+
+
+
+
+void extendClusters
+     (vector<Cluster> & Clusters,
+      const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile)
+
+     //  Connect all the matches in every cluster between sequences A and B.
+     //  Also, extend alignments off of the front and back of each cluster to
+     //  expand total alignment coverage. When these extensions encounter an
+     //  adjacent cluster (in a consistent frame), fuse the two regions to
+     //  create one single encompassing region. This routine will create
+     //  alignment objects from these extensions and output the resulting
+     //  delta information to the delta output file. Also, all coordintes
+     //  for the clusters and the alignment regions are translated to reference
+     //  the original DNA sequecne.
+
+{
+  //-- Sort the clusters (ascending) by their start coordinate in sequence A
+  sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( ));
+
+  //-- If no delta file is requested
+  if ( ! DO_DELTA )
+    return;
+
+
+  bool target_reached = false;         // reached the adjacent match or cluster
+                                       // the amino acid sequences for A and B
+  char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+  char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+  
+  long int Alen [7], Blen [7];         // the length of the amino acid sequences
+  int i;
+
+  unsigned int m_o;
+  long int targetA, targetB;           // alignment extension targets in A and B
+
+  vector<Match>::iterator Mp;          // match pointer
+
+  vector<Cluster>::iterator PrevCp;    // where the extensions last left off
+  vector<Cluster>::iterator CurrCp;    // the current cluster being extended
+  vector<Cluster>::iterator TargetCp = Clusters.end( );   // the target cluster
+
+  vector<Alignment> Alignments;        // the vector of alignment objects
+  vector<Alignment>::iterator CurrAp = Alignments.begin( );   // current align
+  vector<Alignment>::iterator TargetAp;                       // target align
+
+  //-- Calculate the length of each amino acid frame translation
+  for ( i = 0; i < 6; i ++ )
+    {
+      Alen [i + 1] = (Af->len - (i % 3)) / 3;
+      Blen [i + 1] = (Bf->len - (i % 3)) / 3;
+    }
+
+  //-- Extend each cluster
+  PrevCp = Clusters.begin( );
+  CurrCp = Clusters.begin( );
+  while ( CurrCp < Clusters.end( ) )
+    {
+      if ( DO_EXTEND )
+	{
+	  //-- Ignore if shadowed or already extended
+	  if ( ! target_reached )
+	    if ( CurrCp->wasFused  ||
+		 isShadowedCluster (CurrCp, Alignments, CurrAp) )
+	      {
+		CurrCp->wasFused = true;
+		CurrCp = ++ PrevCp;
+		continue;
+	      }
+	}
+
+      //-- Initialize the right amino acid sequence for A and B if need be
+      if ( A [CurrCp->frameA] == NULL )
+        {
+	  A [CurrCp->frameA] = (char *) Safe_malloc
+	    ( sizeof(char) * ( (Af->len / 3) + 2) );
+	  A [CurrCp->frameA] [0] = '\0';
+	  Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA],
+						  CurrCp->frameA );
+        }
+      if ( B [CurrCp->frameB] == NULL )
+        {
+	  B [CurrCp->frameB] = (char *) Safe_malloc
+	    ( sizeof(char) * ( (Bf->len / 3) + 2) );
+	  B [CurrCp->frameB] [0] = '\0';
+	  Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB],
+						  CurrCp->frameB );
+        }
+
+      //-- Extend each match in the cluster
+      for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ )
+        {
+          //-- Try to extend the current match backwards
+          if ( target_reached )
+            {
+              //-- Merge with the previous match
+              if ( CurrAp->eA != Mp->sA  ||  CurrAp->eB != Mp->sB )
+                {
+		  //-- Strip matches until the targeted match is found
+                  assert (Mp < CurrCp->matches.end( ) - 1);
+                  continue;
+                }
+
+              CurrAp->eA += Mp->len - 1;
+              CurrAp->eB += Mp->len - 1;
+            }
+          else
+            {
+              //-- Create a new alignment object
+              addNewAlignment (Alignments, CurrCp, Mp);
+              CurrAp = Alignments.end( ) - 1;
+
+	      if ( DO_EXTEND  ||  Mp != CurrCp->matches.begin ( ) )
+		{
+		  //-- Target the closest/best alignment object
+		  TargetAp = getReverseTargetAlignment (Alignments, CurrAp);
+		  
+		  //-- Extend the new alignment object backwards
+		  if ( extendBackward (Alignments, CurrAp, TargetAp,
+				       A [CurrCp->frameA], B [CurrCp->frameB]) )
+		    CurrAp = TargetAp;
+		}
+            }
+          
+	  m_o = FORWARD_ALIGN;
+
+          //-- Try to extend the current match forwards
+          if ( Mp < CurrCp->matches.end( ) - 1 )
+            {
+              //-- Target the next match in the cluster
+              targetA = (Mp + 1)->sA;
+              targetB = (Mp + 1)->sB;
+
+	      //-- Extend the current alignment object forward
+	      target_reached = extendForward (CurrAp,
+					      A [CurrCp->frameA], targetA,
+					      B [CurrCp->frameB], targetB, m_o);
+            }
+          else if ( DO_EXTEND )
+            {
+              targetA = Alen [CurrCp->frameA];
+              targetB = Blen [CurrCp->frameB];
+
+              //-- Target the closest/best match in a future cluster
+              TargetCp = getForwardTargetCluster (Clusters, CurrCp,
+                                                  targetA, targetB);
+
+	      if ( TargetCp == Clusters.end( ) )
+		{
+		  m_o |= OPTIMAL_BIT;
+		  if ( TO_SEQEND )
+		    m_o |= SEQEND_BIT;
+		}
+
+	      //-- Extend the current alignment object forward
+	      target_reached = extendForward (CurrAp,
+					      A [CurrCp->frameA], targetA,
+					      B [CurrCp->frameB], targetB, m_o);
+            }
+        }
+
+      if ( TargetCp == Clusters.end( ) )
+        target_reached = false;
+
+      CurrCp->wasFused = true;
+
+      if ( target_reached == false )
+        CurrCp = ++ PrevCp;
+      else
+	CurrCp = TargetCp;
+    }
+
+#ifdef _DEBUG_ASSERT
+  validateData (Alignments, Clusters, Af, Bf);
+#endif
+
+  //-- Output the alignment data to the delta file
+  flushAlignments (Alignments, Af, Bf, DeltaFile);
+
+  for ( i = 0; i < 7; i ++ )
+    {
+      if ( A [i] != NULL )
+	free ( A [i] );
+      if ( B [i] != NULL )
+	free ( B [i] );
+    }
+
+  return;
+}
+
+
+
+
+void flushAlignments
+     (vector<Alignment> & Alignments,
+      const FastaRecord * Af, const FastaRecord * Bf,
+      FILE * DeltaFile)
+
+     //  Simply output the delta information stored in Alignments to the
+     //  given delta file. Free the memory used by Alignments once the
+     //  data is successfully output to the file.
+
+{
+  vector<Alignment>::iterator Ap;       // alignment pointer
+  vector<long int>::iterator Dp;             // delta pointer
+
+  fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len);
+
+  //-- Generate the error counts
+  parseDelta (Alignments, Af, Bf);
+
+  for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
+    {
+      fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
+	       transC (Ap->sA, Ap->frameA, Af->len),
+	       transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2),
+	       transC (Ap->sB, Ap->frameB, Bf->len),
+	       transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2),
+	       Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
+      
+      for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
+        fprintf (DeltaFile, "%ld\n", *Dp);
+      fprintf (DeltaFile, "0\n");
+
+      Ap->delta.clear( );
+    }
+  
+  Alignments.clear( );
+
+  return;
+}
+
+
+
+
+void flushSyntenys
+     (vector<Synteny> & Syntenys, FILE * ClusterFile)
+
+     //  Simply output the synteny/cluster information generated by the mgaps
+     //  program. However, now the coordinates reference their appropriate
+     //  reference sequence, and the reference sequecne header is added to
+     //  the appropriate lines. Free the memory used by Syntenys once the
+     //  data is successfully output to the file.
+
+{
+  vector<Synteny>::iterator Sp;         // synteny pointer 
+  vector<Cluster>::iterator Cp;         // cluster pointer
+  vector<Match>::iterator Mp;           // match pointer
+
+  if ( ClusterFile )
+    {
+      for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
+        {
+          fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id,
+                   Sp->AfP->len, Sp->Bf.len);
+          for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
+            {
+              fprintf (ClusterFile, "%2d %2d\n",
+                       Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA,
+                       Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB);
+              for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
+                {
+                  fprintf (ClusterFile, "%8ld %8ld %6ld",
+                           transC (Mp->sA, Cp->frameA, Sp->AfP->len),
+                           transC (Mp->sB, Cp->frameB, Sp->Bf.len),
+                           Mp->len * 3);
+	      
+                  if ( Mp != Cp->matches.begin( ) )
+                    fprintf (ClusterFile, "%6ld %6ld\n",
+                             (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3,
+                             (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3);
+                  else
+                    fprintf (ClusterFile, "%6s %6s\n", "-", "-");
+                }
+              Cp->matches.clear( );
+            }
+          Sp->clusters.clear( );
+        }
+    }
+  else
+    {
+      for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
+        {
+          for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
+            {
+              Cp->matches.clear( );
+            }
+          Sp->clusters.clear( );
+        }
+    }
+  
+  Syntenys.clear( );
+
+  return;
+}
+
+
+
+
+
+vector<Cluster>::iterator getForwardTargetCluster
+     (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
+      long int & targetA, long int & targetB)
+
+     //  Return the cluster that is most likely to successfully join (in a
+     //  forward direction) with the current cluster. The returned cluster
+     //  must contain 1 or more matches that are strictly greater than the end
+     //  of the current cluster. The targeted cluster must also be on a
+     //  diagonal close enough to the current cluster, so that a connection
+     //  could possibly be made by the alignment extender. Also, this targeted
+     //  cluster must be consistent in frame with the current cluster. Assumes
+     //  clusters have been sorted via AscendingClusterSort. Returns targeted
+     //  cluster and stores the target coordinates in targetA and targetB. If no
+     //  suitable cluster was found, the function will return NULL and target
+     //  A and targetB will remain unchanged.
+
+{
+  vector<Match>::iterator Mip;               // match iteratrive pointer
+  vector<Cluster>::iterator Cp;              // cluster pointer
+  vector<Cluster>::iterator Cip;             // cluster iterative pointer
+  long int eA, eB;                           // possible target
+  long int greater, lesser;                  // gap sizes between two clusters
+  long int sA = CurrCp->matches.rbegin( )->sA +
+    CurrCp->matches.rbegin( )->len - 1;      // the endA of the current cluster 
+  long int sB = CurrCp->matches.rbegin( )->sB +
+    CurrCp->matches.rbegin( )->len - 1;      // the endB of the current cluster
+
+  //-- End of sequences is the default target, set distance accordingly
+  long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB);
+
+  //-- For all clusters greater than the current cluster (on sequence A)
+  Cp = Clusters.end( );
+  for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ )
+    {
+      //-- If the cluster is in the same frame
+      if ( CurrCp->frameA == Cip->frameA  &&  CurrCp->frameB == Cip->frameB )
+        {
+          eA = Cip->matches.begin( )->sA;
+          eB = Cip->matches.begin( )->sB;
+
+          //-- If the cluster overlaps the current cluster, strip some matches
+          if ( ( eA < sA  ||  eB < sB )  &&
+              Cip->matches.rbegin( )->sA >= sA  &&
+              Cip->matches.rbegin( )->sB >= sB )
+            {
+              for ( Mip = Cip->matches.begin( );
+                    Mip < Cip->matches.end( )  &&  ( eA < sA  ||  eB < sB );
+                    Mip ++ )
+                {
+                  eA = Mip->sA;
+                  eB = Mip->sB;
+                }
+            }
+
+          //-- If the cluster is strictly greater than current cluster
+          if ( eA >= sA  &&  eB >= sB )
+            {
+              if ( eA - sA > eB - sB )
+                {
+                  greater = eA - sA;
+                  lesser = eB - sB;
+                }
+              else
+                {
+                  lesser = eA - sA;
+                  greater = eB - sB;
+                }
+
+              //-- If the cluster is close enough
+              if ( greater <= getBreakLen( )  ||
+                   (lesser) * GOOD_SCORE [getMatrixType( )] +
+                   (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
+                {
+                  Cp = Cip;
+                  targetA = eA;
+                  targetB = eB;
+                  break;
+                }
+              else if ( (greater << 1) - lesser < dist )
+                {
+                  Cp = Cip;
+                  targetA = eA;
+                  targetB = eB;
+                  dist = (greater << 1) - lesser;
+                }
+            }
+        }
+    }
+
+
+  return Cp;
+}
+
+
+
+
+
+vector<Alignment>::iterator getReverseTargetAlignment
+     (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp)
+
+     //  Return the alignment that is most likely to successfully join (in a
+     //  reverse direction) with the current alignment. The returned alignment
+     //  must be strictly less than the current cluster and be on a diagonal
+     //  close enough to the current alignment, so that a connection
+     //  could possibly be made by the alignment extender. Also, this targeted
+     //  alignment must be consistent in frame with the current alignment. 
+     //  Assumes clusters have been sorted via AscendingClusterSort and
+     //  processed in order, so therefore all alignments are in order by their
+     //  start A coordinate.
+
+{
+  vector<Alignment>::iterator Ap;        // alignment pointer
+  vector<Alignment>::iterator Aip;       // alignment iterative pointer
+  long int eA, eB;                       // possible targets
+  long int greater, lesser;              // gap sizes between the two alignments
+  long int sA = CurrAp->sA;              // the startA of the current alignment
+  long int sB = CurrAp->sB;              // the startB of the current alignment
+
+  //-- Beginning of sequences is the default target, set distance accordingly
+  long int dist = (sA < sB ? sA : sB);
+
+  //-- For all alignments less than the current alignment (on sequence A)
+  Ap = Alignments.end( );
+  for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- )
+    {
+      //-- If the alignment is on the same direction
+      if ( CurrAp->frameA == Aip->frameA  &&  CurrAp->frameB == Aip->frameB )
+        {
+          eA = Aip->eA;
+          eB = Aip->eB;
+
+          //-- If the alignment is strictly greater than current cluster
+          if ( eA <= sA  && eB <= sB )
+            {
+              if ( sA - eA > sB - eB )
+                {
+                  greater = sA - eA;
+                  lesser = sB - eB;
+                }
+              else
+                {
+                  lesser = sA - eA;
+                  greater = sB - eB;
+                }
+
+              //-- If the cluster is close enough
+              if ( greater <= getBreakLen( )  ||
+                   (lesser) * GOOD_SCORE [getMatrixType( )] +
+                   (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
+                {
+                  Ap = Aip;
+                  break;
+                }
+              else if ( (greater << 1) - lesser < dist )
+                {
+                  Ap = Aip;
+                  dist = (greater << 1) - lesser;
+                }
+            }
+        }
+    }
+
+
+  return Ap;
+}
+
+
+
+
+
+bool isShadowedCluster
+     (vector<Cluster>::iterator CurrCp,
+      vector<Alignment> & Alignments, vector<Alignment>::iterator Ap)
+
+     //  Check if the current cluster is shadowed by a previously produced
+     //  alignment region. Return true if it is, else false.
+
+{
+  vector<Alignment>::iterator Aip;         // alignment pointer
+
+  long int sA = CurrCp->matches.begin( )->sA;
+  long int eA = CurrCp->matches.rbegin( )->sA +
+                CurrCp->matches.rbegin( )->len - 1;
+  long int sB = CurrCp->matches.begin( )->sB;
+  long int eB = CurrCp->matches.rbegin( )->sB +
+                CurrCp->matches.rbegin( )->len - 1;
+
+  if ( ! Alignments.empty( ) )             // if there are alignments to use
+    {
+      //-- Look backwards in hope of finding a shadowing alignment
+      for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- )
+        {
+          //-- If in the same frames and shadowing the current cluster, break
+          if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB )
+            if ( Aip->eA >= eA  &&  Aip->eB >= eB )
+              if ( Aip->sA <= sA  &&  Aip->sB <= sB )
+                break;
+        }
+      
+      //-- Return true if the loop was broken, i.e. shadow found
+      if ( Aip >= Alignments.begin( ) )
+        return true;
+    }
+
+  //-- Return false if Alignments was empty or loop was not broken
+  return false;
+}
+
+
+
+
+
+
+void parseAbort
+     (const char * s)
+
+     //  Abort the program if there was an error in parsing file 's'
+
+{
+  fprintf (stderr,
+          "ERROR: Could not parse input from '%s'. \n"
+          "Please check the filename and format, or file a bug report\n", s);
+  exit (EXIT_FAILURE);
+}
+
+
+
+
+void parseDelta
+     (vector<Alignment> & Alignments,
+      const FastaRecord * Af, const FastaRecord *Bf)
+
+     // Use the delta information to generate the error counts for each
+     // alignment, and fill this information into the data type
+
+{
+  //  pointers to Af.seq and Bf.seq
+  char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+  char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+
+  char ch1, ch2;
+  long int Delta;
+  int Sign;
+  long int i, Apos, Bpos;
+  long int Remain, Total;
+  long int Errors, SimErrors;
+  long int NonAlphas;
+  vector<Alignment>::iterator Ap;
+  vector<long int>::iterator Dp;
+
+  for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
+    {      
+      if ( A [Ap->frameA] == NULL )
+	{
+	  A [Ap->frameA] = (char *) Safe_malloc
+	    ( sizeof(char) * ( (Af->len / 3) + 2 ) );
+	  A [Ap->frameA] [0] = '\0';
+	  Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA );
+	}
+      if ( B [Ap->frameB] == NULL )
+	{
+	  B [Ap->frameB] = (char *) Safe_malloc
+	    ( sizeof(char) * ( (Bf->len / 3) + 2) );
+	  B [Ap->frameB] [0] = '\0';
+	  Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB );
+	}
+      
+      Apos = Ap->sA;
+      Bpos = Ap->sB;
+
+      Errors = 0;
+      SimErrors = 0;
+      NonAlphas = 0;
+      Remain = Ap->eA - Ap->sA + 1;
+      Total = Remain;
+
+      //-- For all delta's in this alignment
+      for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
+	{
+	  Delta = *Dp;
+	  Sign = Delta > 0 ? 1 : -1;
+	  Delta = labs ( Delta );
+
+	  //-- For all the bases before the next indel
+	  for ( i = 1; i < Delta; i ++ )
+	    {
+	      ch1 = A [Ap->frameA] [Apos ++];
+	      ch2 = B [Ap->frameB] [Bpos ++];
+
+	      if ( !isalpha (ch1) )
+		{
+		  ch1 = STOP_CHAR;
+		  NonAlphas ++;
+		}
+	      if ( !isalpha (ch2) )
+		{
+		  ch2 = STOP_CHAR;
+		  NonAlphas ++;
+		}
+	      
+	      ch1 = toupper(ch1);
+	      ch2 = toupper(ch2);
+	      if ( 1 > MATCH_SCORE
+		   [getMatrixType( )]
+		   [ch1 - 'A']
+		   [ch2 - 'A'] )
+		SimErrors ++;
+	      if ( ch1 != ch2 )
+		Errors ++;      
+	    }
+
+	  //-- Process the current indel
+	  Remain -= i - 1;
+	  Errors ++;
+	  SimErrors ++;
+	  
+	  if ( Sign == 1 )
+	    {
+	      if ( !isalpha (A [Ap->frameA] [Apos ++]) )
+		NonAlphas ++;
+	      Remain --;
+	    }
+	  else
+	    {
+	      if ( !isalpha (B [Ap->frameB] [Bpos ++]) )
+		NonAlphas ++;
+	      Total ++;
+	    }
+	}
+      
+      //-- For all the bases after the final indel
+      for ( i = 0; i < Remain; i ++ )
+	{
+	  //-- Score character match and update error counters
+	  ch1 = A [Ap->frameA] [Apos ++];
+	  ch2 = B [Ap->frameB] [Bpos ++];
+	  
+	  if ( !isalpha (ch1) )
+	    {
+	      ch1 = STOP_CHAR;
+	      NonAlphas ++;
+	    }
+	  if ( !isalpha (ch2) )
+	    {
+	      ch2 = STOP_CHAR;
+	      NonAlphas ++;
+	    }
+	  
+	  ch1 = toupper(ch1);
+	  ch2 = toupper(ch2);
+	  if ( 1 > MATCH_SCORE
+	       [getMatrixType( )]
+	       [ch1 - 'A']
+	       [ch2 - 'A'] )
+	    SimErrors ++;
+	  if ( ch1 != ch2 )
+	    Errors ++;
+	}
+
+      Ap->Errors = Errors;
+      Ap->SimErrors = SimErrors;
+      Ap->NonAlphas = NonAlphas;
+    }
+
+  for ( i = 1; i <= 6; i ++ )
+    {
+      if ( A [i] != NULL )
+        free ( A [i] );
+      A [i] = NULL;
+
+      if ( B [i] != NULL )
+        free ( B [i] );
+      B [i] = NULL;
+    }
+
+  return;
+}
+
+
+
+
+void processSyntenys
+     (vector<Synteny> & Syntenys, FastaRecord * Af, long int As,
+      FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile)
+
+     //  For each syntenic region with clusters, read in the B sequence and
+     //  extend the clusters to expand total alignment coverage. Clusters should
+     //  still reference the amino acid concatenations, the translation back to
+     //  DNA will be taken care of by the extendClusters function.
+     //      processSyntenys should ONLY be called once all the clusters for
+     //  the contained syntenic regions have been stored in the data structure.
+     //  Frees the memory used by the the syntenic regions once the output of 
+     //  extendClusters and flushSyntenys has been produced.
+
+{
+  FastaRecord Bf;                     // the current B sequence
+  Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1));
+
+  long int InitSize = INIT_SIZE;      // the initial memory capacity of B
+  vector<Synteny>::iterator CurrSp;   // current synteny pointer
+
+  //-- Initialize the B sequence storage
+  Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
+
+  //-- For all the contained syntenys
+  for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ )
+    {
+      //-- If no clusters, ignore
+      if ( CurrSp->clusters.empty( ) )
+	continue;
+
+      //-- If a B sequence not seen yet, read it in
+      //-- IMPORTANT: The B sequences in the synteny object are assumed to be
+      //      ordered as output by mgaps, if they are not in order the program
+      //      will fail. (All like tags must be adjacent and in the same order
+      //      as the query file)
+      if ( CurrSp == Syntenys.begin( )  ||
+	   strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 )
+	{
+	  //-- Read in the B sequence
+	  while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) )
+	    if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 )
+	      break;
+	  if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 )
+	    {
+	      fprintf(stderr,"%s\n", CurrSp->Bf.Id);
+	      parseAbort ( "Query File" );
+	    }
+	  Bf.len = strlen (Bf.seq + 1);
+	}
+
+      //-- Extend clusters and create the alignment information
+      CurrSp->Bf.len = Bf.len;
+      extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile);
+    }
+
+  //-- Create the cluster information and clear the Synteny structure
+  flushSyntenys (Syntenys, ClusterFile);
+
+  free (Bf.Id);
+  free (Bf.seq);
+
+  return;
+}
+
+
+
+inline long int transC
+     (long int Coord, int Frame, long int Len)
+
+     //  Translate an amino acid coordinate to a nucleotide coordinate
+     //  relative to the forward strand. The Len parameter should be the
+     //  length of the original DNA strand.
+
+{
+  assert ( Frame > 0 && Frame < 7 );
+  if ( Frame > 3 )
+    return revC ( (Coord * 3) - (3 - (Frame - 3)), Len );
+  else
+    return (Coord * 3) - (3 - (Frame));
+}
+
+
+
+
+inline long int refLen
+     (long int ntLen)
+
+     //  Return the length of the concatenated amino acid sequence frames,
+     //  including the appended 'X' at the end of each sequence frame for the
+     //  given DNA input length. (The returned length will be the length of
+     //  the concatenated frames for this sequence as output by prepro.cc)
+
+{
+  return (ntLen + 1) << 1;
+}
+
+
+
+
+inline long int revC
+     (long int Coord, long int Len)
+
+     //  Reverse complement the given coordinate for the given length.
+
+{
+  assert (Len - Coord + 1 > 0);
+  return (Len - Coord + 1);
+}
+
+
+
+
+void printHelp
+     (const char * s)
+
+     //  Display the program's help information to stderr.
+
+{
+  fprintf (stderr,
+    "\nUSAGE: %s  [options]  <reference>  <query>  <pfx>  <  <input>\n\n", s);
+  fprintf (stderr,
+     "-b int   set the alignment break (give-up) length to int (amino acids)\n"
+     "-d       output only match clusters rather than extended alignments\n"
+     "-e       do not extend alignments outward from clusters\n"
+     "-h       display help information\n"
+     "-t       force alignment to ends of sequence if within -b distance\n"
+     "-x type  set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n"
+     "         other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n");
+  fprintf (stderr,
+          "  Input is the output of the \"mgaps\" program from stdin, and\n"
+          "the two original PROmer sequence files passed on the command\n"
+          "line. <pfx> is the prefix to be added to the front of the\n"
+          "output file <pfx>.delta\n"
+          "  <pfx>.delta is the alignment object that catalogs the distance\n"
+          "between insertions and deletions. For further information\n"
+          "regarding this file, please refer to the documentation under\n"
+          "the .delta output description.\n\n");
+  return;
+}
+
+
+
+
+void printUsage
+     (const char * s)
+
+     //  Display the program's usage information to stderr.
+
+{
+  fprintf (stderr,
+    "\nUSAGE: %s  [options]  <reference>  <query>  <pfx>  <  <input>\n\n", s);
+  fprintf (stderr, "Try '%s -h' for more information.\n", s);
+  return;
+}
+
+
+
+
+void validateData
+     (vector<Alignment> Alignments, vector<Cluster> Clusters,
+      const FastaRecord * Af, const FastaRecord * Bf)
+
+     //  Self test function to check that the cluster and alignment information
+     //  is valid
+
+{
+  char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+  char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+  
+  long int Alen [7], Blen [7];         // the length of the amino acid sequences
+  vector<Cluster>::iterator Cp;
+  vector<Match>::iterator Mp;
+  vector<Alignment>::iterator Ap;
+  long int x, y, i;
+  char Xc, Yc;
+
+  for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ )
+    {
+      assert ( Cp->wasFused );
+
+      //-- Initialize the right amino acid sequence for A and B if need be
+      if ( A [Cp->frameA] == NULL )
+        {
+	  A [Cp->frameA] = (char *) Safe_malloc
+	    ( sizeof(char) * ( (Af->len / 3) + 2) );
+	  A [Cp->frameA] [0] = '\0';
+	  Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA],
+						  Cp->frameA );
+        }
+      if ( B [Cp->frameB] == NULL )
+        {
+	  B [Cp->frameB] = (char *) Safe_malloc
+	    ( sizeof(char) * ( (Bf->len / 3) + 2) );
+	  B [Cp->frameB] [0] = '\0';
+	  Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB],
+						  Cp->frameB );
+        }
+
+      for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
+	{
+	  //-- assert for each match in cluster, it is indeed a match
+	  x = Mp->sA;
+	  y = Mp->sB;
+	  for ( i = 0; i < Mp->len; i ++ )
+	    assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] );
+
+	  //-- assert for each match in cluster, it is contained in an alignment
+	  for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
+	    {
+	      if ( Ap->sA <= Mp->sA  &&  Ap->sB <= Mp->sB  &&
+                   Ap->eA >= Mp->sA + Mp->len - 1  &&
+                   Ap->eB >= Mp->sB + Mp->len - 1 )
+                break;
+            }
+          assert ( Ap < Alignments.end( ) );
+	}
+    }
+
+  //-- assert alignments are optimal (quick check if first and last chars equal)
+  for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
+    {  
+      assert ( Ap->sA <= Ap->eA );
+      assert ( Ap->sB <= Ap->eB );
+      
+      assert ( Ap->sA >= 1 && Ap->sA <= Af->len );
+      assert ( Ap->eA >= 1 && Ap->eA <= Af->len );
+      assert ( Ap->sB >= 1 && Ap->sB <= Bf->len );
+      assert ( Ap->eB >= 1 && Ap->eB <= Bf->len );
+      
+      Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ?
+		   A[Ap->frameA][Ap->sA] : STOP_CHAR);
+      Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ?
+		   B[Ap->frameB][Ap->sB] : STOP_CHAR);
+      assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
+      
+      Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ?
+		   A[Ap->frameA][Ap->eA] : STOP_CHAR);
+      Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ?
+		   B[Ap->frameB][Ap->eB] : STOP_CHAR);
+      assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
+    }
+
+  for ( i = 0; i < 7; i ++ )
+    {
+      if ( A [i] != NULL )
+	free ( A [i] );
+      if ( B [i] != NULL )
+	free ( B [i] );
+    }
+
+  return;
+}
author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children