Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postnuc.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postnuc.cc Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,1558 @@ +//------------------------------------------------------------------------------ +// Programmer: Adam M Phillippy, The Institute for Genomic Research +// File: postnuc.cc +// Date: 07 / 16 / 2002 +// +// Revision: 08 / 01 / 2002 +// Added MUM extension functionality +// +// Purpose: To translate the coordinates referencing the concatenated +// reference sequences back to the individual sequences and deal +// with any conflict that may have arisen (i.e. a MUM that spans +// the boundry between two sequences). Then to extend each cluster +// via Smith-Waterman techniques to expand the alignment coverage. +// Alignments which encounter each other will be fused to form one +// encompasing alignment when appropriate. +// +// Input: Input is the output of the .mgaps program from stdin. On the +// command line, the file names of the two original sequence files +// should come first, followed by the prefix <pfx> that should be +// placed in front of the two output filenames <pfx>.cluster and +// <pfx>.delta +// +// NOTE: Cluster file is now suppressed by default (see -d option). +// +// Output: Output is to two output files, <pfx>.cluster and <pfx>.delta. +// <pfx>.cluster lists MUM clusters as identified by "mgaps". +// However, the clusters now reference their corresponding +// sequence and are all listed under headers that specify this +// sequence. The <pfx>.delta file is the alignment object that +// contains all the information necessary to reproduce the alignments +// generated by the MUM extension process. Please refer to the +// output file documentation for an in-depth description of these +// file formats. +// +// Usage: postnuc <reference> <query> <pfx> < <input> +// Where <reference> and <query> are the original input sequences of +// NUCmer and <pfx> is the prefix that should be added to the +// beginning of the <pfx>.cluster and <pfx>.delta output filenames. +// +//------------------------------------------------------------------------------ + +//-- NOTE: this option will significantly hamper program performance, +// mostly the alignment extension performance (sw_align.h) +//#define _DEBUG_ASSERT // self testing assert functions + +#include "tigrinc.hh" +#include "sw_align.hh" +#include <vector> +#include <algorithm> +using namespace std; + + +//------------------------------------------------------ Globals -------------// +const signed char FORWARD_CHAR = 1; +const signed char REVERSE_CHAR = -1; + + +bool DO_DELTA = true; +bool DO_EXTEND = true; +bool TO_SEQEND = false; +bool DO_SHADOWS = false; + + +//------------------------------------------------------ Type Definitions ----// +enum LineType + //-- The type of input line from <stdin> +{ + NO_LINE, HEADER_LINE, MATCH_LINE +}; + + +struct FastaRecord + //-- The essential data of a sequence +{ + char * Id; // the fasta ID header tag + long int len; // the length of the sequence + char * seq; // the sequence data +}; + + +struct Match + //-- An exact match between two sequences A and B +{ + long int sA, sB, len; // start coordinate in A, in B and the length +}; + + +struct Cluster + //-- An ordered list of matches between two sequences A and B +{ + bool wasFused; // have the cluster matches been fused yet? + signed char dirB; // the query sequence direction + // FORWARD_CHAR or REVERSE_CHAR + vector<Match> matches; // the ordered set of matches in the cluster +}; + + +struct Synteny + //-- An ordered list of clusters between two sequences A and B +{ + FastaRecord * AfP; // a pointer to the reference sequence record + FastaRecord Bf; // the query sequence record (w/o the sequence) + vector<Cluster> clusters; // the ordered set of clusters between A and B +}; + + +struct Alignment + //-- An alignment object between two sequences A and B +{ + signed char dirB; // the query sequence direction + long int sA, sB, eA, eB; // the start in A, B and the end in A, B + vector<long int> delta; // the delta values, with NO zero at the end + long int deltaApos; // sum of abs(deltas) - #of negative deltas + // trust me, it is a very helpful value + long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas +}; + + +struct AscendingClusterSort + //-- For sorting clusters in ascending order of their sA coordinate +{ + bool operator() (const Cluster & pA, const Cluster & pB) + { + return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA ); + } +}; + + + + +//------------------------------------------------- Function Declarations ----// +void addNewAlignment + (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, + vector<Match>::iterator Mp); + +bool extendBackward + (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, + vector<Alignment>::iterator TargetAp, const char * A, const char * B); + +bool extendForward + (vector<Alignment>::iterator Ap, const char * A, long int targetA, + const char * B, long int targetB, unsigned int m_o); + +void extendClusters + (vector<Cluster> & Clusters, + const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile); + +void flushAlignments + (vector<Alignment> & Alignments, + const FastaRecord * Af, const FastaRecord * Bf, + FILE * DeltaFile); + +void flushSyntenys + (vector<Synteny> & Syntenys, FILE * ClusterFile); + +vector<Cluster>::iterator getForwardTargetCluster + (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, + long int & targetA, long int & targetB); + +vector<Alignment>::iterator getReverseTargetAlignment + (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp); + +bool isShadowedCluster + (vector<Cluster>::iterator CurrCp, + vector<Alignment> & Alignments, vector<Alignment>::iterator Ap); + +void parseAbort + (const char *); + +void parseDelta + (vector<Alignment> & Alignments, + const FastaRecord * Af, const FastaRecord *Bf); + +void processSyntenys + (vector<Synteny> & Syntenys, + FastaRecord * Af, long int As, + FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile); + +inline long int revC + (long int Coord, long int Len); + +void printHelp + (const char *); + +void printUsage + (const char *); + +void validateData + (vector<Alignment> Alignments, vector<Cluster> Clusters, + const FastaRecord * Af, const FastaRecord * Bf); + + + + +//-------------------------------------------------- Function Definitions ----// +int main + (int argc, char ** argv) +{ + FastaRecord * Af; // array of all the reference sequences + + vector<Synteny> Syntenys; // vector of all sets of clusters + vector<Synteny>::reverse_iterator CurrSp; // current set of clusters + vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer + + Synteny Asyn; // a single set of clusters + Cluster Aclu; // a single cluster of matches + Match Amat; // a single match + + LineType PrevLine; // the current input line + + bool Found; // temporary search flag + + char Line [MAX_LINE]; // a single line of input + char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers + char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names + char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; + + signed char DirB = FORWARD_CHAR; // the current query strand direction + + long int i; // temporary counter + long int Seqi; // current reference sequence index + long int InitSize; // initial sequence memory space + long int As = 0; // the size of the reference array + long int Ac = 100; // the capacity of the reference array + long int sA, sB, len; // current match start in A, B and length + + FILE * RefFile, * QryFile; // reference and query input files + FILE * ClusterFile, * DeltaFile; // cluster and delta output files + + //-- Set the alignment data type and break length (sw_align.h) + setMatrixType ( NUCLEOTIDE ); + setBreakLen ( 200 ); + setBanding ( 0 ); + + //-- Parse the command line arguments + { + optarg = NULL; + int ch, errflg = 0; + while ( !errflg && ((ch = getopt (argc, argv, "dehB:b:st")) != EOF) ) + switch (ch) + { + case 'b' : + setBreakLen( atoi (optarg) ); + break; + + case 'B' : + setBanding( atoi (optarg) ); + break; + + case 'd' : + DO_DELTA = false; + break; + + case 'e' : + DO_EXTEND = false; + break; + + case 'h' : + printHelp (argv[0]); + exit (EXIT_SUCCESS); + break; + + case 's' : + DO_SHADOWS = true; + break; + + case 't' : + TO_SEQEND = true; + break; + + default : + errflg ++; + } + + if ( errflg > 0 || argc - optind != 3 ) + { + printUsage (argv[0]); + exit (EXIT_FAILURE); + } + } + + //-- Read and create the I/O file names + strcpy (RefFileName, argv[optind ++]); + strcpy (QryFileName, argv[optind ++]); + strcpy (ClusterFileName, argv[optind ++]); + strcpy (DeltaFileName, ClusterFileName); + strcat (ClusterFileName, ".cluster"); + strcat (DeltaFileName, ".delta"); + + //-- Open all the files + RefFile = File_Open (RefFileName, "r"); + QryFile = File_Open (QryFileName, "r"); + if ( DO_DELTA ) + { + ClusterFile = NULL; + DeltaFile = File_Open (DeltaFileName, "w"); + } + else + { + ClusterFile = File_Open (ClusterFileName, "w"); + DeltaFile = NULL; + } + + //-- Print the headers of the output files + if ( DO_DELTA ) + fprintf (DeltaFile, "%s %s\nNUCMER\n", RefFileName, QryFileName); + else + fprintf (ClusterFile, "%s %s\nNUCMER\n", RefFileName, QryFileName); + + //-- Generate the array of the reference sequences + InitSize = INIT_SIZE; + Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac ); + Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); + while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) ) + { + Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1)); + strcpy (Af[As].Id, IdA); + + Af[As].len = strlen (Af[As].seq + 1); + + if ( ++ As >= Ac ) + { + Ac *= 2; + Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac ); + } + + InitSize = INIT_SIZE; + Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); + } + fclose (RefFile); + if ( As <= 0 ) + parseAbort ( RefFileName ); + + + + //-- Process the input from <stdin> line by line + PrevLine = NO_LINE; + IdA[0] = '\0'; + IdB[0] = '\0'; + CurrIdB[0] = '\0'; + while ( fgets(Line, MAX_LINE, stdin) != NULL ) + { + + //-- If the current line is a fasta HEADER_LINE + if ( Line[0] == '>' ) + { + if ( sscanf (Line + 1, "%s", CurrIdB) != 1 ) + parseAbort ("stdin"); + DirB = strstr (Line," Reverse") == NULL ? FORWARD_CHAR : REVERSE_CHAR; + + PrevLine = HEADER_LINE; + } + + + //-- If the current line is a cluster HEADER_LINE + else if ( Line[0] == '#' ) + { + PrevLine = HEADER_LINE; + } + + + //-- If the current line is a MATCH_LINE + else + { + if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 ) + parseAbort ("stdin"); + + //-- Re-map the reference coordinate back to its original sequence + for ( Seqi = 0; sA > Af[Seqi].len; Seqi ++ ) + sA -= Af[Seqi].len + 1; // extra +1 for the x at the end of each seq + if ( Seqi >= As ) + { + fprintf (stderr, + "ERROR: A MUM was found with a start coordinate greater than\n" + " the sequence length, a serious error has occured.\n" + " Please file a bug report\n"); + exit (EXIT_FAILURE); + } + + //-- If the match spans across a sequence boundry + if ( sA + len - 1 > Af[Seqi].len || sA <= 0) + { + fprintf (stderr, + "WARNING: A MUM was found extending beyond the boundry of:\n" + " Reference sequence '>%s'\n\n" + "Please check that the '-n' option is activated on 'mummer2'\n" + "and try again, or file a bug report\n" + "Attempting to continue.\n", Af[Seqi].Id); + continue; + } + + //-- Check and update the current synteny region + if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 ) + { + Found = false; + if ( strcmp (IdB, CurrIdB) == 0 ) + { + //-- Has this header been seen before? + for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ ) + { + if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 ) + { + if ( Sp->AfP->len != Af[Seqi].len ) + { + fprintf (stderr, + "ERROR: The reference file may contain" + " sequences with non-unique\n" + " header Ids, please check your input" + " files and try again\n"); + exit (EXIT_FAILURE); + } + assert ( strcmp (Sp->Bf.Id, IdB) == 0 ); + CurrSp = Sp; + Found = true; + break; + } + } + } + else + { + //-- New B sequence header, process all the old synteny's + processSyntenys (Syntenys, Af, As, + QryFile, ClusterFile, DeltaFile); + } + + strcpy (IdA, Af[Seqi].Id); + strcpy (IdB, CurrIdB); + + //-- If not seen yet, create a new synteny region + if ( ! Found ) + { + Asyn.AfP = Af + Seqi; + Asyn.Bf.len = -1; + Asyn.Bf.Id = (char *) Safe_malloc + (sizeof(char) * (strlen(IdB) + 1)); + strcpy (Asyn.Bf.Id, IdB); + + Syntenys.push_back (Asyn); + CurrSp = Syntenys.rbegin( ); + } + + //-- Add a new cluster to the current synteny + if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) + if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) + CurrSp->clusters.pop_back( ); // hack to remove empties + Aclu.wasFused = false; + Aclu.dirB = DirB; + CurrSp->clusters.push_back (Aclu); + } + else if ( PrevLine == HEADER_LINE ) + { + //-- Add a new cluster to the current synteny + if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) + if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) + CurrSp->clusters.pop_back( ); + Aclu.wasFused = false; + Aclu.dirB = DirB; + CurrSp->clusters.push_back (Aclu); + } + + //-- Add a new match to the current cluster + if ( len > 1 ) + { + Amat.sA = sA; + Amat.sB = sB; + Amat.len = len; + CurrSp->clusters.rbegin( )->matches.push_back (Amat); + } + + PrevLine = MATCH_LINE; + } + } + + //-- Process the left-over syntenys + if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) + if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) + CurrSp->clusters.pop_back( ); + + processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); + fclose (QryFile); + + //-- Free the reference sequences + for ( i = 0; i < As; i ++ ) + { + free (Af[i].Id); + free (Af[i].seq); + } + free (Af); + + return EXIT_SUCCESS; +} + + + + +void addNewAlignment + (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, + vector<Match>::iterator Mp) + + // Create and add a new alignment object based on the current match + // and cluster information pointed to by Cp and Mp. + +{ + Alignment Align; + + //-- Initialize the data + Align.sA = Mp->sA; + Align.sB = Mp->sB; + Align.eA = Mp->sA + Mp->len - 1; + Align.eB = Mp->sB + Mp->len - 1; + Align.dirB = Cp->dirB; + Align.delta.clear( ); + Align.deltaApos = 0; + + //-- Add the alignment object + Alignments.push_back (Align); + + return; +} + + + + +bool extendBackward + (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, + vector<Alignment>::iterator TargetAp, const char * A, const char * B) + + // Extend an alignment backwards off of the current alignment object. + // The current alignment object must be freshly created and consist + // only of an exact match (i.e. the delta vector MUST be empty). + // If the TargetAp alignment object is reached by the extension, it will + // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is + // NULL the function will extend as far as possible. It is a strange + // and dangerous function because it can delete CurrAp, so edit with + // caution. Returns true if TargetAp was reached and merged, else false + // Designed only as a subroutine for extendClusters, should be used + // nowhere else. + +{ + bool target_reached = false; + bool overflow_flag = false; + bool double_flag = false; + + vector<long int>::iterator Dp; + + unsigned int m_o; + long int targetA, targetB; + + m_o = BACKWARD_SEARCH; + + //-- Set the target coordinates + if ( TargetAp != Alignments.end( ) ) + { + targetA = TargetAp->eA; + targetB = TargetAp->eB; + } + else + { + targetA = 1; + targetB = 1; + m_o |= OPTIMAL_BIT; + } + + //-- If alignment is too long, bring with bounds and set overflow_flag true + if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH ) + { + targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1; + overflow_flag = true; + m_o |= OPTIMAL_BIT; + } + if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH ) + { + targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1; + if ( overflow_flag ) + double_flag = true; + else + overflow_flag = true; + m_o |= OPTIMAL_BIT; + } + + + if ( TO_SEQEND && !double_flag ) + m_o |= SEQEND_BIT; + + + //-- Attempt to reach the target + target_reached = alignSearch (A, CurrAp->sA, targetA, + B, CurrAp->sB, targetB, m_o); + + if ( overflow_flag || TargetAp == Alignments.end( ) ) + target_reached = false; + + if ( target_reached ) + { + //-- Merge the two alignment objects + extendForward (TargetAp, A, CurrAp->sA, + B, CurrAp->sB, FORCED_FORWARD_ALIGN); + TargetAp->eA = CurrAp->eA; + TargetAp->eB = CurrAp->eB; + Alignments.pop_back( ); + } + else + { + alignTarget (A, targetA, CurrAp->sA, + B, targetB, CurrAp->sB, + CurrAp->delta, FORCED_FORWARD_ALIGN); + CurrAp->sA = targetA; + CurrAp->sB = targetB; + + //-- Update the deltaApos value for the alignment object + for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ ) + CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; + } + + return target_reached; +} + + + + +bool extendForward + (vector<Alignment>::iterator CurrAp, const char * A, long int targetA, + const char * B, long int targetB, unsigned int m_o) + + // Extend an alignment forwards off the current alignment object until + // target or end of sequence is reached, and merge the delta values of the + // alignment object with the new delta values generated by the extension. + // Return true if the target was reached, else false + +{ + long int ValA; + long int ValB; + unsigned int Di; + bool target_reached; + bool overflow_flag = false; + bool double_flag = false; + vector<long int>::iterator Dp; + + //-- Set Di to the end of the delta vector + Di = CurrAp->delta.size( ); + + //-- Calculate the distance between the start and end positions + ValA = targetA - CurrAp->eA + 1; + ValB = targetB - CurrAp->eB + 1; + + //-- If the distance is too long, shrink it and set the overflow_flag + if ( ValA > MAX_ALIGNMENT_LENGTH ) + { + targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1; + overflow_flag = true; + m_o |= OPTIMAL_BIT; + } + if ( ValB > MAX_ALIGNMENT_LENGTH ) + { + targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1; + if ( overflow_flag ) + double_flag = true; + else + overflow_flag = true; + m_o |= OPTIMAL_BIT; + } + + if ( double_flag ) + m_o &= ~SEQEND_BIT; + + target_reached = alignTarget (A, CurrAp->eA, targetA, + B, CurrAp->eB, targetB, + CurrAp->delta, m_o); + + //-- Notify user if alignment was chopped short + if ( target_reached && overflow_flag ) + target_reached = false; + + //-- Pick up delta where left off (Di) and merge with new deltas + if ( Di < CurrAp->delta.size( ) ) + { + //-- Merge the deltas + ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1; + CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA); + if ( CurrAp->delta[Di] == 0 || ValA < 0 ) + { + fprintf(stderr, + "ERROR: failed to merge alignments at position %ld\n" + " Please file a bug report\n", + CurrAp->eA); + exit (EXIT_FAILURE); + } + + //-- Update the deltaApos + for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ ) + CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; + } + + //-- Set the alignment coordinates + CurrAp->eA = targetA; + CurrAp->eB = targetB; + + return target_reached; +} + + + + +void extendClusters + (vector<Cluster> & Clusters, + const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) + + // Connect all the matches in every cluster between sequences A and B. + // Also, extend alignments off of the front and back of each cluster to + // expand total alignment coverage. When these extensions encounter an + // adjacent cluster, fuse the two regions to create one single + // encompassing region. This routine will create alignment objects from + // these extensions and output the resulting delta information to the + // delta output file. + +{ + //-- Sort the clusters (ascending) by their start coordinate in sequence A + sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( )); + + + //-- If no delta file is requested + if ( ! DO_DELTA ) + return; + + + bool target_reached = false; // reached the adjacent match or cluster + + char * A, * B; // the sequences A and B + char * Brev = NULL; // the reverse complement of B + + unsigned int m_o; + long int targetA, targetB; // alignment extension targets in A and B + + vector<Match>::iterator Mp; // match pointer + + vector<Cluster>::iterator PrevCp; // where the extensions last left off + vector<Cluster>::iterator CurrCp; // the current cluster being extended + vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster + + vector<Alignment> Alignments; // the vector of alignment objects + vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align + vector<Alignment>::iterator TargetAp; // target align + + + //-- Extend each cluster + A = Af->seq; + PrevCp = Clusters.begin( ); + CurrCp = Clusters.begin( ); + while ( CurrCp < Clusters.end( ) ) + { + if ( DO_EXTEND ) + { + //-- Ignore if shadowed or already extended + if ( ! target_reached ) + if ( CurrCp->wasFused + || + (!DO_SHADOWS && + isShadowedCluster (CurrCp, Alignments, CurrAp)) ) + { + CurrCp->wasFused = true; + CurrCp = ++ PrevCp; + continue; + } + } + + //-- Pick the right directional sequence for B + if ( CurrCp->dirB == FORWARD_CHAR ) + B = Bf->seq; + else if ( Brev != NULL ) + B = Brev; + else + { + Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); + strcpy ( Brev + 1, Bf->seq + 1 ); + Brev[0] = '\0'; + Reverse_Complement (Brev, 1, Bf->len); + B = Brev; + } + + //-- Extend each match in the cluster + for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ ) + { + //-- Try to extend the current match backwards + if ( target_reached ) + { + //-- Merge with the previous match + if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB ) + { + if ( Mp >= CurrCp->matches.end( ) - 1 ) + { + fprintf (stderr, + "ERROR: Target match does not exist, please\n" + " file a bug report\n"); + exit (EXIT_FAILURE); + } + continue; + } + CurrAp->eA += Mp->len - 1; + CurrAp->eB += Mp->len - 1; + } + else + { + //-- Create a new alignment object + addNewAlignment (Alignments, CurrCp, Mp); + CurrAp = Alignments.end( ) - 1; + + if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) ) + { + //-- Target the closest/best alignment object + TargetAp = getReverseTargetAlignment (Alignments, CurrAp); + + //-- Extend the new alignment object backwards + if ( extendBackward (Alignments, CurrAp, TargetAp, A, B) ) + CurrAp = TargetAp; + } + } + + m_o = FORWARD_ALIGN; + + //-- Try to extend the current match forwards + if ( Mp < CurrCp->matches.end( ) - 1 ) + { + //-- Target the next match in the cluster + targetA = (Mp + 1)->sA; + targetB = (Mp + 1)->sB; + + //-- Extend the current alignment object forward + target_reached = extendForward (CurrAp, A, targetA, + B, targetB, m_o); + } + else if ( DO_EXTEND ) + { + targetA = Af->len; + targetB = Bf->len; + + //-- Target the closest/best match in a future cluster + TargetCp = getForwardTargetCluster (Clusters, CurrCp, + targetA, targetB); + if ( TargetCp == Clusters.end( ) ) + { + m_o |= OPTIMAL_BIT; + if ( TO_SEQEND ) + m_o |= SEQEND_BIT; + } + + //-- Extend the current alignment object forward + target_reached = extendForward (CurrAp, A, targetA, + B, targetB, m_o); + } + } + if ( TargetCp == Clusters.end( ) ) + target_reached = false; + + CurrCp->wasFused = true; + + if ( target_reached == false ) + CurrCp = ++ PrevCp; + else + CurrCp = TargetCp; + } + +#ifdef _DEBUG_ASSERT + validateData (Alignments, Clusters, Af, Bf); +#endif + + //-- Output the alignment data to the delta file + flushAlignments (Alignments, Af, Bf, DeltaFile); + + if ( Brev != NULL ) + free (Brev); + + return; +} + + + + +void flushAlignments + (vector<Alignment> & Alignments, + const FastaRecord * Af, const FastaRecord * Bf, + FILE * DeltaFile) + + // Simply output the delta information stored in Alignments to the + // given delta file. Free the memory used by Alignments once the + // data is successfully output to the file. + +{ + vector<Alignment>::iterator Ap; // alignment pointer + vector<long int>::iterator Dp; // delta pointer + + fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len); + + //-- Generate the error counts + parseDelta (Alignments, Af, Bf); + + for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) + { + if ( Ap->dirB == FORWARD_CHAR ) + fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", + Ap->sA, Ap->eA, + Ap->sB, Ap->eB, + Ap->Errors, Ap->SimErrors, Ap->NonAlphas); + else + fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", + Ap->sA, Ap->eA, + revC (Ap->sB, Bf->len), revC (Ap->eB, Bf->len), + Ap->Errors, Ap->SimErrors, Ap->NonAlphas); + + for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) + fprintf (DeltaFile, "%ld\n", *Dp); + fprintf (DeltaFile, "0\n"); + + Ap->delta.clear( ); + } + + Alignments.clear( ); + + return; +} + + + + +void flushSyntenys + (vector<Synteny> & Syntenys, FILE * ClusterFile) + + // Simply output the synteny/cluster information generated by the mgaps + // program. However, now the coordinates reference their appropriate + // reference sequence, and the reference sequecne header is added to + // the appropriate lines. Free the memory used by Syntenys once the + // data is successfully output to the file. + +{ + vector<Synteny>::iterator Sp; // synteny pointer + vector<Cluster>::iterator Cp; // cluster pointer + vector<Match>::iterator Mp; // match pointer + + if ( ClusterFile ) + { + for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) + { + fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id, + Sp->AfP->len, Sp->Bf.len); + for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) + { + fprintf (ClusterFile, "%2d %2d\n", FORWARD_CHAR, Cp->dirB); + for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) + { + if ( Cp->dirB == FORWARD_CHAR ) + fprintf (ClusterFile, "%8ld %8ld %6ld", + Mp->sA, Mp->sB, Mp->len); + else + fprintf (ClusterFile, "%8ld %8ld %6ld", + Mp->sA, revC (Mp->sB, Sp->Bf.len), Mp->len); + if ( Mp != Cp->matches.begin( ) ) + fprintf (ClusterFile, "%6ld %6ld\n", + Mp->sA - (Mp - 1)->sA - (Mp - 1)->len, + Mp->sB - (Mp - 1)->sB - (Mp - 1)->len); + else + fprintf (ClusterFile, "%6s %6s\n", "-", "-"); + } + Cp->matches.clear( ); + } + Sp->clusters.clear( ); + } + } + else + { + for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) + { + for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) + { + Cp->matches.clear( ); + } + Sp->clusters.clear( ); + } + } + + Syntenys.clear( ); + + return; +} + + + + +vector<Cluster>::iterator getForwardTargetCluster + (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, + long int & targetA, long int & targetB) + + // Return the cluster that is most likely to successfully join (in a + // forward direction) with the current cluster. The returned cluster + // must contain 1 or more matches that are strictly greater than the end + // of the current cluster. The targeted cluster must also be on a + // diagonal close enough to the current cluster, so that a connection + // could possibly be made by the alignment extender. Assumes clusters + // have been sorted via AscendingClusterSort. Returns targeted cluster + // and stores the target coordinates in targetA and targetB. If no + // suitable cluster was found, the function will return NULL and target + // A and targetB will remain unchanged. + +{ + vector<Match>::iterator Mip; // match iteratrive pointer + vector<Cluster>::iterator Cp; // cluster pointer + vector<Cluster>::iterator Cip; // cluster iterative pointer + long int eA, eB; // possible target + long int greater, lesser; // gap sizes between two clusters + long int sA = CurrCp->matches.rbegin( )->sA + + CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster + long int sB = CurrCp->matches.rbegin( )->sB + + CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster + + //-- End of sequences is the default target, set distance accordingly + long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB); + + //-- For all clusters greater than the current cluster (on sequence A) + Cp = Clusters.end( ); + for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ ) + { + //-- If the cluster is on the same direction + if ( CurrCp->dirB == Cip->dirB ) + { + eA = Cip->matches.begin( )->sA; + eB = Cip->matches.begin( )->sB; + + //-- If the cluster overlaps the current cluster, strip some matches + if ( ( eA < sA || eB < sB ) && + Cip->matches.rbegin( )->sA >= sA && + Cip->matches.rbegin( )->sB >= sB ) + { + for ( Mip = Cip->matches.begin( ); + Mip < Cip->matches.end( ) && ( eA < sA || eB < sB ); + Mip ++ ) + { + eA = Mip->sA; + eB = Mip->sB; + } + } + + //-- If the cluster is strictly greater than current cluster + if ( eA >= sA && eB >= sB ) + { + if ( eA - sA > eB - sB ) + { + greater = eA - sA; + lesser = eB - sB; + } + else + { + lesser = eA - sA; + greater = eB - sB; + } + + //-- If the cluster is close enough + if ( greater < getBreakLen( ) || + (lesser) * GOOD_SCORE [getMatrixType( )] + + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 ) + { + Cp = Cip; + targetA = eA; + targetB = eB; + break; + } + else if ( (greater << 1) - lesser < dist ) + { + Cp = Cip; + targetA = eA; + targetB = eB; + dist = (greater << 1) - lesser; + } + } + } + } + + + return Cp; +} + + + + +vector<Alignment>::iterator getReverseTargetAlignment + (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp) + + // Return the alignment that is most likely to successfully join (in a + // reverse direction) with the current alignment. The returned alignment + // must be strictly less than the current cluster and be on a diagonal + // close enough to the current alignment, so that a connection + // could possibly be made by the alignment extender. Assumes clusters + // have been sorted via AscendingClusterSort and processed in order, so + // therefore all alignments are in order by their start A coordinate. + +{ + vector<Alignment>::iterator Ap; // alignment pointer + vector<Alignment>::iterator Aip; // alignment iterative pointer + long int eA, eB; // possible targets + long int greater, lesser; // gap sizes between the two alignments + long int sA = CurrAp->sA; // the startA of the current alignment + long int sB = CurrAp->sB; // the startB of the current alignment + + //-- Beginning of sequences is the default target, set distance accordingly + long int dist = (sA < sB ? sA : sB); + + //-- For all alignments less than the current alignment (on sequence A) + Ap = Alignments.end( ); + for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- ) + { + //-- If the alignment is on the same direction + if ( CurrAp->dirB == Aip->dirB ) + { + eA = Aip->eA; + eB = Aip->eB; + + //-- If the alignment is strictly less than current cluster + if ( eA <= sA && eB <= sB ) + { + if ( sA - eA > sB - eB ) + { + greater = sA - eA; + lesser = sB - eB; + } + else + { + lesser = sA - eA; + greater = sB - eB; + } + + //-- If the cluster is close enough + if ( greater < getBreakLen( ) || + (lesser) * GOOD_SCORE [getMatrixType( )] + + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 ) + { + Ap = Aip; + break; + } + else if ( (greater << 1) - lesser < dist ) + { + Ap = Aip; + dist = (greater << 1) - lesser; + } + } + } + } + + + return Ap; +} + + + + +bool isShadowedCluster + (vector<Cluster>::iterator CurrCp, + vector<Alignment> & Alignments, vector<Alignment>::iterator Ap) + + // Check if the current cluster is shadowed by a previously produced + // alignment region. Return true if it is, else false. + +{ + vector<Alignment>::iterator Aip; // alignment pointer + + long int sA = CurrCp->matches.begin( )->sA; + long int eA = CurrCp->matches.rbegin( )->sA + + CurrCp->matches.rbegin( )->len - 1; + long int sB = CurrCp->matches.begin( )->sB; + long int eB = CurrCp->matches.rbegin( )->sB + + CurrCp->matches.rbegin( )->len - 1; + + if ( ! Alignments.empty( ) ) // if there are alignments to use + { + //-- Look backwards in hope of finding a shadowing alignment + for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- ) + { + //-- If in the same direction and shadowing the current cluster, break + if ( Aip->dirB == CurrCp->dirB ) + if ( Aip->eA >= eA && Aip->eB >= eB ) + if ( Aip->sA <= sA && Aip->sB <= sB ) + break; + } + + //-- Return true if the loop was broken, i.e. shadow found + if ( Aip >= Alignments.begin( ) ) + return true; + } + + //-- Return false if Alignments was empty or loop was not broken + return false; +} + + + + +void parseAbort + (const char * s) + + // Abort the program if there was an error in parsing file 's' + +{ + fprintf (stderr, + "ERROR: Could not parse input from '%s'. \n" + "Please check the filename and format, or file a bug report\n", s); + exit (EXIT_FAILURE); +} + + + + +void parseDelta + (vector<Alignment> & Alignments, + const FastaRecord * Af, const FastaRecord *Bf) + + // Use the delta information to generate the error counts for each + // alignment, and fill this information into the data type + +{ + char * A, * B, * Brev = NULL; + char ch1, ch2; + long int Delta; + int Sign; + long int i, Apos, Bpos; + long int Remain, Total; + long int Errors, SimErrors; + long int NonAlphas; + vector<Alignment>::iterator Ap; + vector<long int>::iterator Dp; + + for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) + { + A = Af->seq; + B = Bf->seq; + + if ( Ap->dirB == REVERSE_CHAR ) + { + if ( Brev == NULL ) + { + Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); + strcpy ( Brev + 1, Bf->seq + 1 ); + Brev[0] = '\0'; + Reverse_Complement (Brev, 1, Bf->len); + B = Brev; + } + B = Brev; + } + + Apos = Ap->sA; + Bpos = Ap->sB; + + Errors = 0; + SimErrors = 0; + NonAlphas = 0; + Remain = Ap->eA - Ap->sA + 1; + Total = Remain; + + //-- For all delta's in this alignment + for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) + { + Delta = *Dp; + Sign = Delta > 0 ? 1 : -1; + Delta = labs ( Delta ); + + //-- For all the bases before the next indel + for ( i = 1; i < Delta; i ++ ) + { + ch1 = A [Apos ++]; + ch2 = B [Bpos ++]; + + if ( !isalpha (ch1) ) + { + ch1 = STOP_CHAR; + NonAlphas ++; + } + if ( !isalpha (ch2) ) + { + ch2 = STOP_CHAR; + NonAlphas ++; + } + + ch1 = toupper(ch1); + ch2 = toupper(ch2); + if ( 1 > MATCH_SCORE + [getMatrixType( )] + [ch1 - 'A'] + [ch2 - 'A'] ) + SimErrors ++; + if ( ch1 != ch2 ) + Errors ++; + } + + //-- Process the current indel + Remain -= i - 1; + Errors ++; + SimErrors ++; + + if ( Sign == 1 ) + { + Apos ++; + Remain --; + } + else + { + Bpos ++; + Total ++; + } + } + + //-- For all the bases after the final indel + for ( i = 0; i < Remain; i ++ ) + { + //-- Score character match and update error counters + ch1 = A [Apos ++]; + ch2 = B [Bpos ++]; + + if ( !isalpha (ch1) ) + { + ch1 = STOP_CHAR; + NonAlphas ++; + } + if ( !isalpha (ch2) ) + { + ch2 = STOP_CHAR; + NonAlphas ++; + } + + ch1 = toupper(ch1); + ch2 = toupper(ch2); + if ( 1 > MATCH_SCORE + [getMatrixType( )] + [ch1 - 'A'] + [ch2 - 'A'] ) + SimErrors ++; + if ( ch1 != ch2 ) + Errors ++; + } + + Ap->Errors = Errors; + Ap->SimErrors = SimErrors; + Ap->NonAlphas = NonAlphas; + } + + if ( Brev != NULL ) + free ( Brev ); + + return; +} + + + + +void processSyntenys + (vector<Synteny> & Syntenys, FastaRecord * Af, long int As, + FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile) + + // For each syntenic region with clusters, read in the B sequence and + // extend the clusters to expand total alignment coverage. Only should + // be called once all the clusters for the contained syntenic regions have + // been stored in the data structure. Frees the memory used by the + // the syntenic regions once the output of extendClusters and + // flushSyntenys has been produced. + +{ + FastaRecord Bf; // the current B sequence + Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1)); + + long int InitSize = INIT_SIZE; // the initial memory capacity of B + vector<Synteny>::iterator CurrSp; // current synteny pointer + + //-- Initialize the B sequence storage + Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); + + //-- For all the contained syntenys + for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ ) + { + //-- If no clusters, ignore + if ( CurrSp->clusters.empty( ) ) + continue; + + //-- If a B sequence not seen yet, read it in + //-- IMPORTANT: The B sequences in the synteny object are assumed to be + // ordered as output by mgaps, if they are not in order the program + // will fail. (All like tags must be adjacent and in the same order + // as the query file) + if ( CurrSp == Syntenys.begin( ) || + strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 ) + { + //-- Read in the B sequence + while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) ) + if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 ) + break; + if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 ) + parseAbort ( "Query File" ); + Bf.len = strlen (Bf.seq + 1); + } + + //-- Extend clusters and create the alignment information + CurrSp->Bf.len = Bf.len; + extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile); + } + + //-- Create the cluster information + flushSyntenys (Syntenys, ClusterFile); + + free (Bf.Id); + free (Bf.seq); + + return; +} + + + + +inline long int revC + (long int Coord, long int Len) + + // Reverse complement the given coordinate for the given length. + +{ + assert (Len - Coord + 1 > 0); + return (Len - Coord + 1); +} + + + + +void printHelp + (const char * s) + + // Display the program's help information to stderr. + +{ + fprintf (stderr, + "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); + fprintf (stderr, + "-b int set the alignment break (give-up) length to int\n" + "-B int set the diagonal banding for extension to int\n" + "-d output only match clusters rather than extended alignments\n" + "-e do not extend alignments outward from clusters\n" + "-h display help information\n" + "-s don't remove shadowed alignments, useful for aligning a\n" + " sequence to itself to identify repeats\n" + "-t force alignment to ends of sequence if within -b distance\n\n"); + fprintf (stderr, + " Input is the output of the \"mgaps\" program from stdin, and\n" + "the two original NUCmer sequence files passed on the command\n" + "line. <pfx> is the prefix to be added to the front of the\n" + "output file <pfx>.delta\n" + " <pfx>.delta is the alignment object that catalogs the distance\n" + "between insertions and deletions. For further information\n" + "regarding this file, please refer to the documentation under\n" + "the .delta output description.\n\n"); + return; +} + + + + +void printUsage + (const char * s) + + // Display the program's usage information to stderr. + +{ + fprintf (stderr, + "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); + fprintf (stderr, "Try '%s -h' for more information.\n", s); + return; +} + + + + +void validateData + (vector<Alignment> Alignments, vector<Cluster> Clusters, + const FastaRecord * Af, const FastaRecord * Bf) + + // Self test function to check that the cluster and alignment information + // is valid + +{ + char * A, * B, * Brev = NULL; + vector<Cluster>::iterator Cp; + vector<Match>::iterator Mp; + vector<Alignment>::iterator Ap; + long int x, y, i; + char Xc, Yc; + + A = Af->seq; + + for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ ) + { + assert ( Cp->wasFused ); + + //-- Pick the right directional sequence for B + if ( Cp->dirB == FORWARD_CHAR ) + B = Bf->seq; + else if ( Brev != NULL ) + B = Brev; + else + { + Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); + strcpy ( Brev + 1, Bf->seq + 1 ); + Brev[0] = '\0'; + Reverse_Complement (Brev, 1, Bf->len); + B = Brev; + } + + for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) + { + //-- assert for each match in cluster, it is indeed a match + x = Mp->sA; + y = Mp->sB; + for ( i = 0; i < Mp->len; i ++ ) + assert ( A[x ++] == B[y ++] ); + + //-- assert for each match in cluster, it is contained in an alignment + for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) + { + if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB && + Ap->eA >= Mp->sA + Mp->len - 1 && + Ap->eB >= Mp->sB + Mp->len - 1 ) + break; + } + assert ( Ap < Alignments.end( ) ); + } + } + + //-- assert alignments are optimal (quick check if first and last chars equal) + for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) + { + if ( Ap->dirB == REVERSE_CHAR ) + { + assert (Brev != NULL); + B = Brev; + } + else + B = Bf->seq; + + assert ( Ap->sA <= Ap->eA ); + assert ( Ap->sB <= Ap->eB ); + + assert ( Ap->sA >= 1 && Ap->sA <= Af->len ); + assert ( Ap->eA >= 1 && Ap->eA <= Af->len ); + assert ( Ap->sB >= 1 && Ap->sB <= Bf->len ); + assert ( Ap->eB >= 1 && Ap->eB <= Bf->len ); + + Xc = toupper(isalpha(A[Ap->sA]) ? A[Ap->sA] : STOP_CHAR); + Yc = toupper(isalpha(B[Ap->sB]) ? B[Ap->sB] : STOP_CHAR); + assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] ); + + Xc = toupper(isalpha(A[Ap->eA]) ? A[Ap->eA] : STOP_CHAR); + Yc = toupper(isalpha(B[Ap->eB]) ? B[Ap->eB] : STOP_CHAR); + assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] ); + } + + if ( Brev != NULL ) + free (Brev); + + return; +}