Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postpro.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postpro.cc Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,1665 @@ +//------------------------------------------------------------------------------ +// Programmer: Adam M Phillippy, The Institute for Genomic Research +// File: postpro.cc +// Date: 08 / 20 / 2002 +// +// Purpose: To translate the coordinates referencing the concatenated +// reference sequences back to the individual sequences and deal +// with any conflict that may have arisen (i.e. a MUM that spans +// the boundry between two sequences). Then to extend each cluster +// via Smith-Waterman techniques to expand the alignment coverage. +// Alignments which encounter each other will be fused to form one +// encompasing alignment when appropriate. +// +// Input: Input is the output of the .mgaps program from stdin. On the +// command line, the file names of the two original sequence files +// should come first, followed by the prefix <pfx> that should be +// placed in front of the two output filenames <pfx>.cluster and +// <pfx>.delta +// +// NOTE: Cluster file is now suppressed by default (see -d option). +// +// Output: Output is to two output files, <pfx>.cluster and <pfx>.delta. +// <pfx>.cluster lists MUM clusters as identified by "mgaps". +// However, the clusters now reference their corresponding +// sequence and are all listed under headers that specify this +// sequence. In addition, the coordinates now reference the original +// DNA input and not the amino acid translations. The <pfx>.delta file +// is the alignment object that contains all the information necessary +// to reproduce the alignments generated by the MUM extension process. +// The coordinates in this file reference the original DNA input, however +// the delta values represent amino acids, so 1 delta = 3 nucleotides. +// Please refer to the output file documentation for an in-depth +// description of these file formats. +// +// Usage: postpro <reference> <query> <pfx> < <input> +// Where <reference> and <query> are the original input sequences of +// PROmer and <pfx> is the prefix that should be added to the +// beginning of the <pfx>.cluster and <pfx>.delta output filenames. +// +//------------------------------------------------------------------------------ + +//-- NOTE: this option will significantly hamper program performance, +// mostly the alignment extension performance (sw_align.h) +//#define _DEBUG_ASSERT // self testing assert functions + +#include "tigrinc.hh" +#include "sw_align.hh" +#include "translate.hh" +#include <vector> +#include <algorithm> +using namespace std; + + + + +//------------------------------------------------------ Globals -------------// +bool DO_DELTA = true; +bool DO_EXTEND = true; +bool TO_SEQEND = false; + + +//------------------------------------------------------ Type Definitions ----// +enum LineType + //-- The type of input line from <stdin> +{ + NO_LINE, HEADER_LINE, MATCH_LINE +}; + + +struct FastaRecord + //-- The essential data of a sequence +{ + char * Id; // the fasta ID header tag + long int len; // the length of the sequence + char * seq; // the sequence data +}; + + +struct Match + //-- An exact match between two sequences A and B +{ + long int sA, sB, len; // start coordinate in A, in B and the length +}; + + +struct Cluster + //-- An ordered list of matches between two sequences A and B +{ + bool wasFused; // have the cluster matches been extended yet? + int frameA; // the reference sequence frame (1-6) + int frameB; // the query sequence frame + vector<Match> matches; // the ordered set of matches in the cluster +}; + + +struct Synteny + //-- An ordered list of clusters between two sequences A and B +{ + FastaRecord * AfP; // a pointer to the reference sequence record + FastaRecord Bf; // the query sequence record (w/o the sequence) + vector<Cluster> clusters; // the ordered set of clusters between A and B +}; + + +struct Alignment + //-- An alignment object between two sequences A and B +{ + int frameA; // the reference sequence frame (1-6) + int frameB; // the query sequence frame + long int sA, sB, eA, eB; // the start in A, B and the end in A, B + vector<long int> delta; // the delta values, with NO zero at the end + long int deltaApos; // sum of abs(deltas) - #of negative deltas + // trust me, it is a very helpful value + long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas +}; + + +struct AscendingClusterSort + //-- For sorting clusters in ascending order of their sA coordinate +{ + bool operator() (const Cluster & pA, const Cluster & pB) + { + return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA ); + } +}; + + + + + +//------------------------------------------------- Function Declarations ----// +void addNewAlignment + (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, + vector<Match>::iterator Mp); + +bool extendBackward + (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, + vector<Alignment>::iterator TargetAp, const char * A, const char * B); + +bool extendForward + (vector<Alignment>::iterator Ap, const char * A, long int targetA, + const char * B, long int targetB, unsigned int m_o); + +void extendClusters + (vector<Cluster> & Clusters, + const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile); + +void flushAlignments + (vector<Alignment> & Alignments, + const FastaRecord * Af, const FastaRecord * Bf, + FILE * DeltaFile); + +void flushSyntenys + (vector<Synteny> & Syntenys, FILE * ClusterFile); + +vector<Cluster>::iterator getForwardTargetCluster + (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, + long int & targetA, long int & targetB); + +vector<Alignment>::iterator getReverseTargetAlignment + (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp); + +bool isShadowedCluster + (vector<Cluster>::iterator CurrCp, + vector<Alignment> & Alignments, vector<Alignment>::iterator Ap); + +void parseAbort + (const char *); + +void parseDelta + (vector<Alignment> & Alignments, + const FastaRecord * Af, const FastaRecord *Bf); + +void processSyntenys + (vector<Synteny> & Syntenys, + FastaRecord * Af, long int As, + FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile); + +inline long int transC + (long int Coord, int Frame, long int Len); + +inline long int refLen + (long int ntLen); + +inline long int revC + (long int Coord, long int Len); + +void printHelp + (const char *); + +void printUsage + (const char *); + +void validateData + (vector<Alignment> Alignments, vector<Cluster> Clusters, + const FastaRecord * Af, const FastaRecord * Bf); + + + +//-------------------------------------------------- Function Definitions ----// +int main + (int argc, char ** argv) +{ + FastaRecord * Af; // array of all the reference sequences + + vector<Synteny> Syntenys; // vector of all sets of clusters + vector<Synteny>::reverse_iterator CurrSp; // current set of clusters + vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer + + Synteny Asyn; // a single set of clusters + Cluster Aclu; // a single cluster of matches + Match Amat; // a single match + + LineType PrevLine; // the current input line + + bool Found; // temporary search flag + + char Line [MAX_LINE]; // a single line of input + char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers + char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names + char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; + + int FrameA, FrameB; // reading frames (1-6) + int CurrFrameA, CurrFrameB; + + long int i; // temporary counter + long int Seqi; // current reference sequence index + long int InitSize; // initial sequence memory space + long int As = 0; // the size of the reference array + long int Ac = 100; // the capacity of the reference array + long int sA, sB, len; // current match start in A, B and length + + FILE * RefFile, * QryFile; // reference and query input files + FILE * ClusterFile, * DeltaFile; // cluster and delta output files + + //-- Set the alignment data type and break length (sw_align.h) + setMatrixType ( BLOSUM62 ); + setBreakLen ( 60 ); + + //-- Parse the command line arguments + { + optarg = NULL; + int ch, errflg = 0; + while ( !errflg && ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) ) + switch (ch) + { + case 'b' : + setBreakLen (atoi (optarg)); + break; + + case 'd' : + DO_DELTA = false; + break; + + case 'e' : + DO_EXTEND = false; + break; + + case 'h' : + printHelp (argv[0]); + exit (EXIT_SUCCESS); + break; + + case 't' : + TO_SEQEND = true; + break; + + case 'x' : + setMatrixType ( atoi (optarg) ); + if ( getMatrixType( ) == NUCLEOTIDE ) + { + fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n", + getMatrixType( )); + setMatrixType ( BLOSUM62 ); + } + break; + + default : + errflg ++; + } + + if ( errflg > 0 || argc - optind != 3 ) + { + printUsage (argv[0]); + exit (EXIT_FAILURE); + } + } + + //-- Read and create the I/O file names + strcpy (RefFileName, argv[optind ++]); + strcpy (QryFileName, argv[optind ++]); + strcpy (ClusterFileName, argv[optind ++]); + strcpy (DeltaFileName, ClusterFileName); + strcat (ClusterFileName, ".cluster"); + strcat (DeltaFileName, ".delta"); + + //-- Open all the files + RefFile = File_Open (RefFileName, "r"); + QryFile = File_Open (QryFileName, "r"); + if ( DO_DELTA ) + { + ClusterFile = NULL; + DeltaFile = File_Open (DeltaFileName, "w"); + } + else + { + ClusterFile = File_Open (ClusterFileName, "w"); + DeltaFile = NULL; + } + + //-- Print the headers of the output files + if ( DO_DELTA ) + fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName); + else + fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName); + + //-- Generate the array of the reference sequences + InitSize = INIT_SIZE; + Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac ); + Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); + while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) ) + { + Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1)); + strcpy (Af[As].Id, IdA); + + Af[As].len = strlen (Af[As].seq + 1); + + if ( ++ As >= Ac ) + { + Ac *= 2; + Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac ); + } + + InitSize = INIT_SIZE; + Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); + } + fclose (RefFile); + if ( As <= 0 ) + parseAbort ( RefFileName ); + + + + //-- Process the input from <stdin> line by line + PrevLine = NO_LINE; + IdA[0] = '\0'; + IdB[0] = '\0'; + FrameA = 0; + FrameB = 0; + CurrFrameA = -1; + CurrFrameB = -1; + CurrIdB[0] = '\0'; + while ( fgets(Line, MAX_LINE, stdin) != NULL ) + { + + //-- If the current line is a fasta HEADER_LINE + if ( Line[0] == '>' ) + { + if ( sscanf (Line + 1, "%s", CurrIdB) != 1 ) + parseAbort ("stdin"); + + sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB); + CurrIdB [strlen(CurrIdB) - 2] = '\0'; + + PrevLine = HEADER_LINE; + } + + + //-- If the current line is a cluster HEADER_LINE + else if ( Line[0] == '#' ) + { + PrevLine = HEADER_LINE; + } + + + //-- If the current line is a MATCH_LINE + else + { + if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 ) + parseAbort ("stdin"); + + //-- Re-map the reference coordinate back to its original sequence + // NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length + // including the appended 'X' on each frame translation + for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ ) + sA -= refLen (Af[Seqi].len); + + assert ( Seqi < As ); + + //-- Get the correct frame, translate startA coordinate to frame + for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ ) + sA -= (Af[Seqi].len - (i % 3)) / 3 + 1; + CurrFrameA = i + 1; + assert ( CurrFrameA > 0 && CurrFrameA < 7 ); + + //-- If the match spans across a frame boundry + if ( CurrFrameA < 1 || CurrFrameA > 6 || + sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 || + sA <= 0 ) + { + fprintf (stderr, + "\nWARNING: A MUM was found extending beyond the boundry of:\n" + " Reference sequence '>%s', frame %d\n" + "Please file a bug report\n" + "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA); + continue; + } + + //-- If the match spans across a sequence boundry + if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 ) + { + fprintf (stderr, + "\nWARNING: A MUM was found extending beyond the boundry of:\n" + " Reference sequence '>%s'\n" + "Please file a bug report\n" + "Attempting to continue.\n", Af[Seqi].Id); + continue; + } + + //-- Check and update the current synteny region + if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 || + CurrFrameA != FrameA || CurrFrameB != FrameB ) + { + Found = false; + if ( strcmp (IdB, CurrIdB) == 0 ) + { + //-- Has this header been seen before? + for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ ) + { + if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 ) + { + assert ( Sp->AfP->len == Af[Seqi].len ); + assert ( strcmp (Sp->Bf.Id, IdB) == 0 ); + CurrSp = Sp; + Found = true; + break; + } + } + } + else + { + //-- New B sequence header, process all the old synteny's + processSyntenys (Syntenys, Af, As, + QryFile, ClusterFile, DeltaFile); + + } + + strcpy (IdA, Af[Seqi].Id); + strcpy (IdB, CurrIdB); + FrameA = CurrFrameA; + FrameB = CurrFrameB; + + //-- If not seen yet, create a new synteny region + if ( ! Found ) + { + Asyn.AfP = Af + Seqi; + Asyn.Bf.len = -1; + Asyn.Bf.Id = (char *) Safe_malloc + (sizeof(char) * (strlen(IdB) + 1)); + strcpy (Asyn.Bf.Id, IdB); + + Syntenys.push_back (Asyn); + CurrSp = Syntenys.rbegin( ); + } + + //-- Add a new cluster to the current synteny + if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) + if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) + CurrSp->clusters.pop_back( ); // hack to remove empties + Aclu.frameA = FrameA; + Aclu.frameB = FrameB; + Aclu.wasFused = false; + CurrSp->clusters.push_back (Aclu); + } + else if ( PrevLine == HEADER_LINE ) + { + //-- Add a new cluster to the current synteny + if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) + if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) + CurrSp->clusters.pop_back( ); + Aclu.frameA = FrameA; + Aclu.frameB = FrameB; + Aclu.wasFused = false; + CurrSp->clusters.push_back (Aclu); + } + + //-- Add a new match to the current cluster + // NOTE: A and B coordinates still reference the appropriate + // amino acid sequence frame, not the DNA (same with len) + if ( len > 1 ) + { + Amat.sA = sA; + Amat.sB = sB; + Amat.len = len; + CurrSp->clusters.rbegin( )->matches.push_back (Amat); + } + + PrevLine = MATCH_LINE; + } + } + + //-- Process the left-over syntenys + if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) + if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) + CurrSp->clusters.pop_back( ); + processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); + fclose (QryFile); + + //-- Free the reference sequences + for ( i = 0; i < As; i ++ ) + { + free (Af[i].Id); + free (Af[i].seq); + } + free (Af); + + return EXIT_SUCCESS; +} + + + + +void addNewAlignment + (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, + vector<Match>::iterator Mp) + + // Create and add a new alignment object based on the current match + // and cluster information pointed to by Cp and Mp. + +{ + Alignment Align; + + //-- Initialize the data + Align.sA = Mp->sA; + Align.sB = Mp->sB; + Align.eA = Mp->sA + Mp->len - 1; + Align.eB = Mp->sB + Mp->len - 1; + Align.frameA = Cp->frameA; + Align.frameB = Cp->frameB; + Align.delta.clear( ); + Align.deltaApos = 0; + + //-- Add the alignment object + Alignments.push_back (Align); + + return; +} + + + + +bool extendBackward + (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, + vector<Alignment>::iterator TargetAp, const char * A, const char * B) + + // Extend an alignment backwards off of the current alignment object. + // The current alignment object must be freshly created and consist + // only of an exact match (i.e. the delta vector MUST be empty). + // If the TargetAp alignment object is reached by the extension, it will + // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is + // NULL the function will extend as far as possible. It is a strange + // and dangerous function because it can delete CurrAp, so edit with + // caution. Returns true if TargetAp was reached and merged, else false + // Designed only as a subroutine for extendClusters, should be used + // nowhere else. + +{ + bool target_reached = false; + bool overflow_flag = false; + bool double_flag = false; + + vector<long int>::iterator Dp; + + unsigned int m_o; + long int targetA, targetB; + + m_o = BACKWARD_SEARCH; + + //-- Set the target coordinates + if ( TargetAp != Alignments.end( ) ) + { + targetA = TargetAp->eA; + targetB = TargetAp->eB; + } + else + { + targetA = 1; + targetB = 1; + m_o |= OPTIMAL_BIT; + } + + //-- If alignment is too long, bring with bounds and set overflow_flag true + if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH ) + { + targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1; + overflow_flag = true; + m_o |= OPTIMAL_BIT; + } + if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH ) + { + targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1; + if ( overflow_flag ) + double_flag = true; + else + overflow_flag = true; + m_o |= OPTIMAL_BIT; + } + + + if ( TO_SEQEND && !double_flag ) + m_o |= SEQEND_BIT; + + + //-- Attempt to reach the target + target_reached = alignSearch (A, CurrAp->sA, targetA, + B, CurrAp->sB, targetB, m_o); + + if ( overflow_flag || TargetAp == Alignments.end( ) ) + target_reached = false; + + if ( target_reached ) + { + //-- assert that CurrAp is new and at the end of the Alignment vector + assert ( CurrAp->delta.empty( ) ); + assert ( CurrAp == Alignments.end( ) - 1 ); + + //-- Merge the two alignment objects + extendForward (TargetAp, A, CurrAp->sA, + B, CurrAp->sB, FORCED_FORWARD_ALIGN); + TargetAp->eA = CurrAp->eA; + TargetAp->eB = CurrAp->eB; + Alignments.pop_back( ); + } + else + { + alignTarget (A, targetA, CurrAp->sA, + B, targetB, CurrAp->sB, + CurrAp->delta, FORCED_FORWARD_ALIGN); + CurrAp->sA = targetA; + CurrAp->sB = targetB; + + //-- Update the deltaApos value for the alignment object + for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ ) + CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; + } + + return target_reached; +} + + + + +bool extendForward + (vector<Alignment>::iterator CurrAp, const char * A, long int targetA, + const char * B, long int targetB, unsigned int m_o) + + // Extend an alignment forwards off the current alignment object until + // target or end of sequence is reached, and merge the delta values of the + // alignment object with the new delta values generated by the extension. + // Return true if the target was reached, else false + +{ + long int ValA; + long int ValB; + unsigned int Di; + bool target_reached; + bool overflow_flag = false; + bool double_flag = false; + vector<long int>::iterator Dp; + + //-- Set Di to the end of the delta vector + Di = CurrAp->delta.size( ); + + //-- Calculate the distance between the start and end positions + ValA = targetA - CurrAp->eA + 1; + ValB = targetB - CurrAp->eB + 1; + + //-- If the distance is too long, shrink it and set the overflow_flag + if ( ValA > MAX_ALIGNMENT_LENGTH ) + { + targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1; + overflow_flag = true; + m_o |= OPTIMAL_BIT; + } + if ( ValB > MAX_ALIGNMENT_LENGTH ) + { + targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1; + if ( overflow_flag ) + double_flag = true; + else + overflow_flag = true; + m_o |= OPTIMAL_BIT; + } + + if ( double_flag ) + m_o &= ~SEQEND_BIT; + + target_reached = alignTarget (A, CurrAp->eA, targetA, + B, CurrAp->eB, targetB, + CurrAp->delta, m_o); + + //-- Notify user if alignment was chopped short + if ( target_reached && overflow_flag ) + target_reached = false; + + //-- Pick up delta where left off (Di) and merge with new deltas + if ( Di < CurrAp->delta.size( ) ) + { + //-- Merge the deltas + ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1; + CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA); + if ( CurrAp->delta[Di] == 0 || ValA < 0 ) + { + fprintf(stderr, + "ERROR: failed to merge alignments at position %ld\n" + " Please file a bug report\n", + CurrAp->eA); + exit (EXIT_FAILURE); + } + + //-- Update the deltaApos + for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ ) + CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; + } + + //-- Set the alignment coordinates + CurrAp->eA = targetA; + CurrAp->eB = targetB; + + return target_reached; +} + + + + +void extendClusters + (vector<Cluster> & Clusters, + const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) + + // Connect all the matches in every cluster between sequences A and B. + // Also, extend alignments off of the front and back of each cluster to + // expand total alignment coverage. When these extensions encounter an + // adjacent cluster (in a consistent frame), fuse the two regions to + // create one single encompassing region. This routine will create + // alignment objects from these extensions and output the resulting + // delta information to the delta output file. Also, all coordintes + // for the clusters and the alignment regions are translated to reference + // the original DNA sequecne. + +{ + //-- Sort the clusters (ascending) by their start coordinate in sequence A + sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( )); + + //-- If no delta file is requested + if ( ! DO_DELTA ) + return; + + + bool target_reached = false; // reached the adjacent match or cluster + // the amino acid sequences for A and B + char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; + char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; + + long int Alen [7], Blen [7]; // the length of the amino acid sequences + int i; + + unsigned int m_o; + long int targetA, targetB; // alignment extension targets in A and B + + vector<Match>::iterator Mp; // match pointer + + vector<Cluster>::iterator PrevCp; // where the extensions last left off + vector<Cluster>::iterator CurrCp; // the current cluster being extended + vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster + + vector<Alignment> Alignments; // the vector of alignment objects + vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align + vector<Alignment>::iterator TargetAp; // target align + + //-- Calculate the length of each amino acid frame translation + for ( i = 0; i < 6; i ++ ) + { + Alen [i + 1] = (Af->len - (i % 3)) / 3; + Blen [i + 1] = (Bf->len - (i % 3)) / 3; + } + + //-- Extend each cluster + PrevCp = Clusters.begin( ); + CurrCp = Clusters.begin( ); + while ( CurrCp < Clusters.end( ) ) + { + if ( DO_EXTEND ) + { + //-- Ignore if shadowed or already extended + if ( ! target_reached ) + if ( CurrCp->wasFused || + isShadowedCluster (CurrCp, Alignments, CurrAp) ) + { + CurrCp->wasFused = true; + CurrCp = ++ PrevCp; + continue; + } + } + + //-- Initialize the right amino acid sequence for A and B if need be + if ( A [CurrCp->frameA] == NULL ) + { + A [CurrCp->frameA] = (char *) Safe_malloc + ( sizeof(char) * ( (Af->len / 3) + 2) ); + A [CurrCp->frameA] [0] = '\0'; + Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA], + CurrCp->frameA ); + } + if ( B [CurrCp->frameB] == NULL ) + { + B [CurrCp->frameB] = (char *) Safe_malloc + ( sizeof(char) * ( (Bf->len / 3) + 2) ); + B [CurrCp->frameB] [0] = '\0'; + Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB], + CurrCp->frameB ); + } + + //-- Extend each match in the cluster + for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ ) + { + //-- Try to extend the current match backwards + if ( target_reached ) + { + //-- Merge with the previous match + if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB ) + { + //-- Strip matches until the targeted match is found + assert (Mp < CurrCp->matches.end( ) - 1); + continue; + } + + CurrAp->eA += Mp->len - 1; + CurrAp->eB += Mp->len - 1; + } + else + { + //-- Create a new alignment object + addNewAlignment (Alignments, CurrCp, Mp); + CurrAp = Alignments.end( ) - 1; + + if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) ) + { + //-- Target the closest/best alignment object + TargetAp = getReverseTargetAlignment (Alignments, CurrAp); + + //-- Extend the new alignment object backwards + if ( extendBackward (Alignments, CurrAp, TargetAp, + A [CurrCp->frameA], B [CurrCp->frameB]) ) + CurrAp = TargetAp; + } + } + + m_o = FORWARD_ALIGN; + + //-- Try to extend the current match forwards + if ( Mp < CurrCp->matches.end( ) - 1 ) + { + //-- Target the next match in the cluster + targetA = (Mp + 1)->sA; + targetB = (Mp + 1)->sB; + + //-- Extend the current alignment object forward + target_reached = extendForward (CurrAp, + A [CurrCp->frameA], targetA, + B [CurrCp->frameB], targetB, m_o); + } + else if ( DO_EXTEND ) + { + targetA = Alen [CurrCp->frameA]; + targetB = Blen [CurrCp->frameB]; + + //-- Target the closest/best match in a future cluster + TargetCp = getForwardTargetCluster (Clusters, CurrCp, + targetA, targetB); + + if ( TargetCp == Clusters.end( ) ) + { + m_o |= OPTIMAL_BIT; + if ( TO_SEQEND ) + m_o |= SEQEND_BIT; + } + + //-- Extend the current alignment object forward + target_reached = extendForward (CurrAp, + A [CurrCp->frameA], targetA, + B [CurrCp->frameB], targetB, m_o); + } + } + + if ( TargetCp == Clusters.end( ) ) + target_reached = false; + + CurrCp->wasFused = true; + + if ( target_reached == false ) + CurrCp = ++ PrevCp; + else + CurrCp = TargetCp; + } + +#ifdef _DEBUG_ASSERT + validateData (Alignments, Clusters, Af, Bf); +#endif + + //-- Output the alignment data to the delta file + flushAlignments (Alignments, Af, Bf, DeltaFile); + + for ( i = 0; i < 7; i ++ ) + { + if ( A [i] != NULL ) + free ( A [i] ); + if ( B [i] != NULL ) + free ( B [i] ); + } + + return; +} + + + + +void flushAlignments + (vector<Alignment> & Alignments, + const FastaRecord * Af, const FastaRecord * Bf, + FILE * DeltaFile) + + // Simply output the delta information stored in Alignments to the + // given delta file. Free the memory used by Alignments once the + // data is successfully output to the file. + +{ + vector<Alignment>::iterator Ap; // alignment pointer + vector<long int>::iterator Dp; // delta pointer + + fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len); + + //-- Generate the error counts + parseDelta (Alignments, Af, Bf); + + for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) + { + fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", + transC (Ap->sA, Ap->frameA, Af->len), + transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2), + transC (Ap->sB, Ap->frameB, Bf->len), + transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2), + Ap->Errors, Ap->SimErrors, Ap->NonAlphas); + + for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) + fprintf (DeltaFile, "%ld\n", *Dp); + fprintf (DeltaFile, "0\n"); + + Ap->delta.clear( ); + } + + Alignments.clear( ); + + return; +} + + + + +void flushSyntenys + (vector<Synteny> & Syntenys, FILE * ClusterFile) + + // Simply output the synteny/cluster information generated by the mgaps + // program. However, now the coordinates reference their appropriate + // reference sequence, and the reference sequecne header is added to + // the appropriate lines. Free the memory used by Syntenys once the + // data is successfully output to the file. + +{ + vector<Synteny>::iterator Sp; // synteny pointer + vector<Cluster>::iterator Cp; // cluster pointer + vector<Match>::iterator Mp; // match pointer + + if ( ClusterFile ) + { + for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) + { + fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id, + Sp->AfP->len, Sp->Bf.len); + for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) + { + fprintf (ClusterFile, "%2d %2d\n", + Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA, + Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB); + for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) + { + fprintf (ClusterFile, "%8ld %8ld %6ld", + transC (Mp->sA, Cp->frameA, Sp->AfP->len), + transC (Mp->sB, Cp->frameB, Sp->Bf.len), + Mp->len * 3); + + if ( Mp != Cp->matches.begin( ) ) + fprintf (ClusterFile, "%6ld %6ld\n", + (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3, + (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3); + else + fprintf (ClusterFile, "%6s %6s\n", "-", "-"); + } + Cp->matches.clear( ); + } + Sp->clusters.clear( ); + } + } + else + { + for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) + { + for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) + { + Cp->matches.clear( ); + } + Sp->clusters.clear( ); + } + } + + Syntenys.clear( ); + + return; +} + + + + + +vector<Cluster>::iterator getForwardTargetCluster + (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, + long int & targetA, long int & targetB) + + // Return the cluster that is most likely to successfully join (in a + // forward direction) with the current cluster. The returned cluster + // must contain 1 or more matches that are strictly greater than the end + // of the current cluster. The targeted cluster must also be on a + // diagonal close enough to the current cluster, so that a connection + // could possibly be made by the alignment extender. Also, this targeted + // cluster must be consistent in frame with the current cluster. Assumes + // clusters have been sorted via AscendingClusterSort. Returns targeted + // cluster and stores the target coordinates in targetA and targetB. If no + // suitable cluster was found, the function will return NULL and target + // A and targetB will remain unchanged. + +{ + vector<Match>::iterator Mip; // match iteratrive pointer + vector<Cluster>::iterator Cp; // cluster pointer + vector<Cluster>::iterator Cip; // cluster iterative pointer + long int eA, eB; // possible target + long int greater, lesser; // gap sizes between two clusters + long int sA = CurrCp->matches.rbegin( )->sA + + CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster + long int sB = CurrCp->matches.rbegin( )->sB + + CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster + + //-- End of sequences is the default target, set distance accordingly + long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB); + + //-- For all clusters greater than the current cluster (on sequence A) + Cp = Clusters.end( ); + for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ ) + { + //-- If the cluster is in the same frame + if ( CurrCp->frameA == Cip->frameA && CurrCp->frameB == Cip->frameB ) + { + eA = Cip->matches.begin( )->sA; + eB = Cip->matches.begin( )->sB; + + //-- If the cluster overlaps the current cluster, strip some matches + if ( ( eA < sA || eB < sB ) && + Cip->matches.rbegin( )->sA >= sA && + Cip->matches.rbegin( )->sB >= sB ) + { + for ( Mip = Cip->matches.begin( ); + Mip < Cip->matches.end( ) && ( eA < sA || eB < sB ); + Mip ++ ) + { + eA = Mip->sA; + eB = Mip->sB; + } + } + + //-- If the cluster is strictly greater than current cluster + if ( eA >= sA && eB >= sB ) + { + if ( eA - sA > eB - sB ) + { + greater = eA - sA; + lesser = eB - sB; + } + else + { + lesser = eA - sA; + greater = eB - sB; + } + + //-- If the cluster is close enough + if ( greater <= getBreakLen( ) || + (lesser) * GOOD_SCORE [getMatrixType( )] + + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 ) + { + Cp = Cip; + targetA = eA; + targetB = eB; + break; + } + else if ( (greater << 1) - lesser < dist ) + { + Cp = Cip; + targetA = eA; + targetB = eB; + dist = (greater << 1) - lesser; + } + } + } + } + + + return Cp; +} + + + + + +vector<Alignment>::iterator getReverseTargetAlignment + (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp) + + // Return the alignment that is most likely to successfully join (in a + // reverse direction) with the current alignment. The returned alignment + // must be strictly less than the current cluster and be on a diagonal + // close enough to the current alignment, so that a connection + // could possibly be made by the alignment extender. Also, this targeted + // alignment must be consistent in frame with the current alignment. + // Assumes clusters have been sorted via AscendingClusterSort and + // processed in order, so therefore all alignments are in order by their + // start A coordinate. + +{ + vector<Alignment>::iterator Ap; // alignment pointer + vector<Alignment>::iterator Aip; // alignment iterative pointer + long int eA, eB; // possible targets + long int greater, lesser; // gap sizes between the two alignments + long int sA = CurrAp->sA; // the startA of the current alignment + long int sB = CurrAp->sB; // the startB of the current alignment + + //-- Beginning of sequences is the default target, set distance accordingly + long int dist = (sA < sB ? sA : sB); + + //-- For all alignments less than the current alignment (on sequence A) + Ap = Alignments.end( ); + for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- ) + { + //-- If the alignment is on the same direction + if ( CurrAp->frameA == Aip->frameA && CurrAp->frameB == Aip->frameB ) + { + eA = Aip->eA; + eB = Aip->eB; + + //-- If the alignment is strictly greater than current cluster + if ( eA <= sA && eB <= sB ) + { + if ( sA - eA > sB - eB ) + { + greater = sA - eA; + lesser = sB - eB; + } + else + { + lesser = sA - eA; + greater = sB - eB; + } + + //-- If the cluster is close enough + if ( greater <= getBreakLen( ) || + (lesser) * GOOD_SCORE [getMatrixType( )] + + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 ) + { + Ap = Aip; + break; + } + else if ( (greater << 1) - lesser < dist ) + { + Ap = Aip; + dist = (greater << 1) - lesser; + } + } + } + } + + + return Ap; +} + + + + + +bool isShadowedCluster + (vector<Cluster>::iterator CurrCp, + vector<Alignment> & Alignments, vector<Alignment>::iterator Ap) + + // Check if the current cluster is shadowed by a previously produced + // alignment region. Return true if it is, else false. + +{ + vector<Alignment>::iterator Aip; // alignment pointer + + long int sA = CurrCp->matches.begin( )->sA; + long int eA = CurrCp->matches.rbegin( )->sA + + CurrCp->matches.rbegin( )->len - 1; + long int sB = CurrCp->matches.begin( )->sB; + long int eB = CurrCp->matches.rbegin( )->sB + + CurrCp->matches.rbegin( )->len - 1; + + if ( ! Alignments.empty( ) ) // if there are alignments to use + { + //-- Look backwards in hope of finding a shadowing alignment + for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- ) + { + //-- If in the same frames and shadowing the current cluster, break + if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB ) + if ( Aip->eA >= eA && Aip->eB >= eB ) + if ( Aip->sA <= sA && Aip->sB <= sB ) + break; + } + + //-- Return true if the loop was broken, i.e. shadow found + if ( Aip >= Alignments.begin( ) ) + return true; + } + + //-- Return false if Alignments was empty or loop was not broken + return false; +} + + + + + + +void parseAbort + (const char * s) + + // Abort the program if there was an error in parsing file 's' + +{ + fprintf (stderr, + "ERROR: Could not parse input from '%s'. \n" + "Please check the filename and format, or file a bug report\n", s); + exit (EXIT_FAILURE); +} + + + + +void parseDelta + (vector<Alignment> & Alignments, + const FastaRecord * Af, const FastaRecord *Bf) + + // Use the delta information to generate the error counts for each + // alignment, and fill this information into the data type + +{ + // pointers to Af.seq and Bf.seq + char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; + char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; + + char ch1, ch2; + long int Delta; + int Sign; + long int i, Apos, Bpos; + long int Remain, Total; + long int Errors, SimErrors; + long int NonAlphas; + vector<Alignment>::iterator Ap; + vector<long int>::iterator Dp; + + for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) + { + if ( A [Ap->frameA] == NULL ) + { + A [Ap->frameA] = (char *) Safe_malloc + ( sizeof(char) * ( (Af->len / 3) + 2 ) ); + A [Ap->frameA] [0] = '\0'; + Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA ); + } + if ( B [Ap->frameB] == NULL ) + { + B [Ap->frameB] = (char *) Safe_malloc + ( sizeof(char) * ( (Bf->len / 3) + 2) ); + B [Ap->frameB] [0] = '\0'; + Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB ); + } + + Apos = Ap->sA; + Bpos = Ap->sB; + + Errors = 0; + SimErrors = 0; + NonAlphas = 0; + Remain = Ap->eA - Ap->sA + 1; + Total = Remain; + + //-- For all delta's in this alignment + for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) + { + Delta = *Dp; + Sign = Delta > 0 ? 1 : -1; + Delta = labs ( Delta ); + + //-- For all the bases before the next indel + for ( i = 1; i < Delta; i ++ ) + { + ch1 = A [Ap->frameA] [Apos ++]; + ch2 = B [Ap->frameB] [Bpos ++]; + + if ( !isalpha (ch1) ) + { + ch1 = STOP_CHAR; + NonAlphas ++; + } + if ( !isalpha (ch2) ) + { + ch2 = STOP_CHAR; + NonAlphas ++; + } + + ch1 = toupper(ch1); + ch2 = toupper(ch2); + if ( 1 > MATCH_SCORE + [getMatrixType( )] + [ch1 - 'A'] + [ch2 - 'A'] ) + SimErrors ++; + if ( ch1 != ch2 ) + Errors ++; + } + + //-- Process the current indel + Remain -= i - 1; + Errors ++; + SimErrors ++; + + if ( Sign == 1 ) + { + if ( !isalpha (A [Ap->frameA] [Apos ++]) ) + NonAlphas ++; + Remain --; + } + else + { + if ( !isalpha (B [Ap->frameB] [Bpos ++]) ) + NonAlphas ++; + Total ++; + } + } + + //-- For all the bases after the final indel + for ( i = 0; i < Remain; i ++ ) + { + //-- Score character match and update error counters + ch1 = A [Ap->frameA] [Apos ++]; + ch2 = B [Ap->frameB] [Bpos ++]; + + if ( !isalpha (ch1) ) + { + ch1 = STOP_CHAR; + NonAlphas ++; + } + if ( !isalpha (ch2) ) + { + ch2 = STOP_CHAR; + NonAlphas ++; + } + + ch1 = toupper(ch1); + ch2 = toupper(ch2); + if ( 1 > MATCH_SCORE + [getMatrixType( )] + [ch1 - 'A'] + [ch2 - 'A'] ) + SimErrors ++; + if ( ch1 != ch2 ) + Errors ++; + } + + Ap->Errors = Errors; + Ap->SimErrors = SimErrors; + Ap->NonAlphas = NonAlphas; + } + + for ( i = 1; i <= 6; i ++ ) + { + if ( A [i] != NULL ) + free ( A [i] ); + A [i] = NULL; + + if ( B [i] != NULL ) + free ( B [i] ); + B [i] = NULL; + } + + return; +} + + + + +void processSyntenys + (vector<Synteny> & Syntenys, FastaRecord * Af, long int As, + FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile) + + // For each syntenic region with clusters, read in the B sequence and + // extend the clusters to expand total alignment coverage. Clusters should + // still reference the amino acid concatenations, the translation back to + // DNA will be taken care of by the extendClusters function. + // processSyntenys should ONLY be called once all the clusters for + // the contained syntenic regions have been stored in the data structure. + // Frees the memory used by the the syntenic regions once the output of + // extendClusters and flushSyntenys has been produced. + +{ + FastaRecord Bf; // the current B sequence + Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1)); + + long int InitSize = INIT_SIZE; // the initial memory capacity of B + vector<Synteny>::iterator CurrSp; // current synteny pointer + + //-- Initialize the B sequence storage + Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); + + //-- For all the contained syntenys + for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ ) + { + //-- If no clusters, ignore + if ( CurrSp->clusters.empty( ) ) + continue; + + //-- If a B sequence not seen yet, read it in + //-- IMPORTANT: The B sequences in the synteny object are assumed to be + // ordered as output by mgaps, if they are not in order the program + // will fail. (All like tags must be adjacent and in the same order + // as the query file) + if ( CurrSp == Syntenys.begin( ) || + strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 ) + { + //-- Read in the B sequence + while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) ) + if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 ) + break; + if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 ) + { + fprintf(stderr,"%s\n", CurrSp->Bf.Id); + parseAbort ( "Query File" ); + } + Bf.len = strlen (Bf.seq + 1); + } + + //-- Extend clusters and create the alignment information + CurrSp->Bf.len = Bf.len; + extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile); + } + + //-- Create the cluster information and clear the Synteny structure + flushSyntenys (Syntenys, ClusterFile); + + free (Bf.Id); + free (Bf.seq); + + return; +} + + + +inline long int transC + (long int Coord, int Frame, long int Len) + + // Translate an amino acid coordinate to a nucleotide coordinate + // relative to the forward strand. The Len parameter should be the + // length of the original DNA strand. + +{ + assert ( Frame > 0 && Frame < 7 ); + if ( Frame > 3 ) + return revC ( (Coord * 3) - (3 - (Frame - 3)), Len ); + else + return (Coord * 3) - (3 - (Frame)); +} + + + + +inline long int refLen + (long int ntLen) + + // Return the length of the concatenated amino acid sequence frames, + // including the appended 'X' at the end of each sequence frame for the + // given DNA input length. (The returned length will be the length of + // the concatenated frames for this sequence as output by prepro.cc) + +{ + return (ntLen + 1) << 1; +} + + + + +inline long int revC + (long int Coord, long int Len) + + // Reverse complement the given coordinate for the given length. + +{ + assert (Len - Coord + 1 > 0); + return (Len - Coord + 1); +} + + + + +void printHelp + (const char * s) + + // Display the program's help information to stderr. + +{ + fprintf (stderr, + "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); + fprintf (stderr, + "-b int set the alignment break (give-up) length to int (amino acids)\n" + "-d output only match clusters rather than extended alignments\n" + "-e do not extend alignments outward from clusters\n" + "-h display help information\n" + "-t force alignment to ends of sequence if within -b distance\n" + "-x type set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n" + " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n"); + fprintf (stderr, + " Input is the output of the \"mgaps\" program from stdin, and\n" + "the two original PROmer sequence files passed on the command\n" + "line. <pfx> is the prefix to be added to the front of the\n" + "output file <pfx>.delta\n" + " <pfx>.delta is the alignment object that catalogs the distance\n" + "between insertions and deletions. For further information\n" + "regarding this file, please refer to the documentation under\n" + "the .delta output description.\n\n"); + return; +} + + + + +void printUsage + (const char * s) + + // Display the program's usage information to stderr. + +{ + fprintf (stderr, + "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); + fprintf (stderr, "Try '%s -h' for more information.\n", s); + return; +} + + + + +void validateData + (vector<Alignment> Alignments, vector<Cluster> Clusters, + const FastaRecord * Af, const FastaRecord * Bf) + + // Self test function to check that the cluster and alignment information + // is valid + +{ + char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; + char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; + + long int Alen [7], Blen [7]; // the length of the amino acid sequences + vector<Cluster>::iterator Cp; + vector<Match>::iterator Mp; + vector<Alignment>::iterator Ap; + long int x, y, i; + char Xc, Yc; + + for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ ) + { + assert ( Cp->wasFused ); + + //-- Initialize the right amino acid sequence for A and B if need be + if ( A [Cp->frameA] == NULL ) + { + A [Cp->frameA] = (char *) Safe_malloc + ( sizeof(char) * ( (Af->len / 3) + 2) ); + A [Cp->frameA] [0] = '\0'; + Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA], + Cp->frameA ); + } + if ( B [Cp->frameB] == NULL ) + { + B [Cp->frameB] = (char *) Safe_malloc + ( sizeof(char) * ( (Bf->len / 3) + 2) ); + B [Cp->frameB] [0] = '\0'; + Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB], + Cp->frameB ); + } + + for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) + { + //-- assert for each match in cluster, it is indeed a match + x = Mp->sA; + y = Mp->sB; + for ( i = 0; i < Mp->len; i ++ ) + assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] ); + + //-- assert for each match in cluster, it is contained in an alignment + for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) + { + if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB && + Ap->eA >= Mp->sA + Mp->len - 1 && + Ap->eB >= Mp->sB + Mp->len - 1 ) + break; + } + assert ( Ap < Alignments.end( ) ); + } + } + + //-- assert alignments are optimal (quick check if first and last chars equal) + for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) + { + assert ( Ap->sA <= Ap->eA ); + assert ( Ap->sB <= Ap->eB ); + + assert ( Ap->sA >= 1 && Ap->sA <= Af->len ); + assert ( Ap->eA >= 1 && Ap->eA <= Af->len ); + assert ( Ap->sB >= 1 && Ap->sB <= Bf->len ); + assert ( Ap->eB >= 1 && Ap->eB <= Bf->len ); + + Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ? + A[Ap->frameA][Ap->sA] : STOP_CHAR); + Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ? + B[Ap->frameB][Ap->sB] : STOP_CHAR); + assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] ); + + Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ? + A[Ap->frameA][Ap->eA] : STOP_CHAR); + Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ? + B[Ap->frameB][Ap->eB] : STOP_CHAR); + assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] ); + } + + for ( i = 0; i < 7; i ++ ) + { + if ( A [i] != NULL ) + free ( A [i] ); + if ( B [i] != NULL ) + free ( B [i] ); + } + + return; +}