Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postpro.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line source
//------------------------------------------------------------------------------ // Programmer: Adam M Phillippy, The Institute for Genomic Research // File: postpro.cc // Date: 08 / 20 / 2002 // // Purpose: To translate the coordinates referencing the concatenated // reference sequences back to the individual sequences and deal // with any conflict that may have arisen (i.e. a MUM that spans // the boundry between two sequences). Then to extend each cluster // via Smith-Waterman techniques to expand the alignment coverage. // Alignments which encounter each other will be fused to form one // encompasing alignment when appropriate. // // Input: Input is the output of the .mgaps program from stdin. On the // command line, the file names of the two original sequence files // should come first, followed by the prefix <pfx> that should be // placed in front of the two output filenames <pfx>.cluster and // <pfx>.delta // // NOTE: Cluster file is now suppressed by default (see -d option). // // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta. // <pfx>.cluster lists MUM clusters as identified by "mgaps". // However, the clusters now reference their corresponding // sequence and are all listed under headers that specify this // sequence. In addition, the coordinates now reference the original // DNA input and not the amino acid translations. The <pfx>.delta file // is the alignment object that contains all the information necessary // to reproduce the alignments generated by the MUM extension process. // The coordinates in this file reference the original DNA input, however // the delta values represent amino acids, so 1 delta = 3 nucleotides. // Please refer to the output file documentation for an in-depth // description of these file formats. // // Usage: postpro <reference> <query> <pfx> < <input> // Where <reference> and <query> are the original input sequences of // PROmer and <pfx> is the prefix that should be added to the // beginning of the <pfx>.cluster and <pfx>.delta output filenames. // //------------------------------------------------------------------------------ //-- NOTE: this option will significantly hamper program performance, // mostly the alignment extension performance (sw_align.h) //#define _DEBUG_ASSERT // self testing assert functions #include "tigrinc.hh" #include "sw_align.hh" #include "translate.hh" #include <vector> #include <algorithm> using namespace std; //------------------------------------------------------ Globals -------------// bool DO_DELTA = true; bool DO_EXTEND = true; bool TO_SEQEND = false; //------------------------------------------------------ Type Definitions ----// enum LineType //-- The type of input line from <stdin> { NO_LINE, HEADER_LINE, MATCH_LINE }; struct FastaRecord //-- The essential data of a sequence { char * Id; // the fasta ID header tag long int len; // the length of the sequence char * seq; // the sequence data }; struct Match //-- An exact match between two sequences A and B { long int sA, sB, len; // start coordinate in A, in B and the length }; struct Cluster //-- An ordered list of matches between two sequences A and B { bool wasFused; // have the cluster matches been extended yet? int frameA; // the reference sequence frame (1-6) int frameB; // the query sequence frame vector<Match> matches; // the ordered set of matches in the cluster }; struct Synteny //-- An ordered list of clusters between two sequences A and B { FastaRecord * AfP; // a pointer to the reference sequence record FastaRecord Bf; // the query sequence record (w/o the sequence) vector<Cluster> clusters; // the ordered set of clusters between A and B }; struct Alignment //-- An alignment object between two sequences A and B { int frameA; // the reference sequence frame (1-6) int frameB; // the query sequence frame long int sA, sB, eA, eB; // the start in A, B and the end in A, B vector<long int> delta; // the delta values, with NO zero at the end long int deltaApos; // sum of abs(deltas) - #of negative deltas // trust me, it is a very helpful value long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas }; struct AscendingClusterSort //-- For sorting clusters in ascending order of their sA coordinate { bool operator() (const Cluster & pA, const Cluster & pB) { return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA ); } }; //------------------------------------------------- Function Declarations ----// void addNewAlignment (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, vector<Match>::iterator Mp); bool extendBackward (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, vector<Alignment>::iterator TargetAp, const char * A, const char * B); bool extendForward (vector<Alignment>::iterator Ap, const char * A, long int targetA, const char * B, long int targetB, unsigned int m_o); void extendClusters (vector<Cluster> & Clusters, const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile); void flushAlignments (vector<Alignment> & Alignments, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile); void flushSyntenys (vector<Synteny> & Syntenys, FILE * ClusterFile); vector<Cluster>::iterator getForwardTargetCluster (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, long int & targetA, long int & targetB); vector<Alignment>::iterator getReverseTargetAlignment (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp); bool isShadowedCluster (vector<Cluster>::iterator CurrCp, vector<Alignment> & Alignments, vector<Alignment>::iterator Ap); void parseAbort (const char *); void parseDelta (vector<Alignment> & Alignments, const FastaRecord * Af, const FastaRecord *Bf); void processSyntenys (vector<Synteny> & Syntenys, FastaRecord * Af, long int As, FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile); inline long int transC (long int Coord, int Frame, long int Len); inline long int refLen (long int ntLen); inline long int revC (long int Coord, long int Len); void printHelp (const char *); void printUsage (const char *); void validateData (vector<Alignment> Alignments, vector<Cluster> Clusters, const FastaRecord * Af, const FastaRecord * Bf); //-------------------------------------------------- Function Definitions ----// int main (int argc, char ** argv) { FastaRecord * Af; // array of all the reference sequences vector<Synteny> Syntenys; // vector of all sets of clusters vector<Synteny>::reverse_iterator CurrSp; // current set of clusters vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer Synteny Asyn; // a single set of clusters Cluster Aclu; // a single cluster of matches Match Amat; // a single match LineType PrevLine; // the current input line bool Found; // temporary search flag char Line [MAX_LINE]; // a single line of input char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; int FrameA, FrameB; // reading frames (1-6) int CurrFrameA, CurrFrameB; long int i; // temporary counter long int Seqi; // current reference sequence index long int InitSize; // initial sequence memory space long int As = 0; // the size of the reference array long int Ac = 100; // the capacity of the reference array long int sA, sB, len; // current match start in A, B and length FILE * RefFile, * QryFile; // reference and query input files FILE * ClusterFile, * DeltaFile; // cluster and delta output files //-- Set the alignment data type and break length (sw_align.h) setMatrixType ( BLOSUM62 ); setBreakLen ( 60 ); //-- Parse the command line arguments { optarg = NULL; int ch, errflg = 0; while ( !errflg && ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) ) switch (ch) { case 'b' : setBreakLen (atoi (optarg)); break; case 'd' : DO_DELTA = false; break; case 'e' : DO_EXTEND = false; break; case 'h' : printHelp (argv[0]); exit (EXIT_SUCCESS); break; case 't' : TO_SEQEND = true; break; case 'x' : setMatrixType ( atoi (optarg) ); if ( getMatrixType( ) == NUCLEOTIDE ) { fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n", getMatrixType( )); setMatrixType ( BLOSUM62 ); } break; default : errflg ++; } if ( errflg > 0 || argc - optind != 3 ) { printUsage (argv[0]); exit (EXIT_FAILURE); } } //-- Read and create the I/O file names strcpy (RefFileName, argv[optind ++]); strcpy (QryFileName, argv[optind ++]); strcpy (ClusterFileName, argv[optind ++]); strcpy (DeltaFileName, ClusterFileName); strcat (ClusterFileName, ".cluster"); strcat (DeltaFileName, ".delta"); //-- Open all the files RefFile = File_Open (RefFileName, "r"); QryFile = File_Open (QryFileName, "r"); if ( DO_DELTA ) { ClusterFile = NULL; DeltaFile = File_Open (DeltaFileName, "w"); } else { ClusterFile = File_Open (ClusterFileName, "w"); DeltaFile = NULL; } //-- Print the headers of the output files if ( DO_DELTA ) fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName); else fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName); //-- Generate the array of the reference sequences InitSize = INIT_SIZE; Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac ); Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) ) { Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1)); strcpy (Af[As].Id, IdA); Af[As].len = strlen (Af[As].seq + 1); if ( ++ As >= Ac ) { Ac *= 2; Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac ); } InitSize = INIT_SIZE; Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); } fclose (RefFile); if ( As <= 0 ) parseAbort ( RefFileName ); //-- Process the input from <stdin> line by line PrevLine = NO_LINE; IdA[0] = '\0'; IdB[0] = '\0'; FrameA = 0; FrameB = 0; CurrFrameA = -1; CurrFrameB = -1; CurrIdB[0] = '\0'; while ( fgets(Line, MAX_LINE, stdin) != NULL ) { //-- If the current line is a fasta HEADER_LINE if ( Line[0] == '>' ) { if ( sscanf (Line + 1, "%s", CurrIdB) != 1 ) parseAbort ("stdin"); sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB); CurrIdB [strlen(CurrIdB) - 2] = '\0'; PrevLine = HEADER_LINE; } //-- If the current line is a cluster HEADER_LINE else if ( Line[0] == '#' ) { PrevLine = HEADER_LINE; } //-- If the current line is a MATCH_LINE else { if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 ) parseAbort ("stdin"); //-- Re-map the reference coordinate back to its original sequence // NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length // including the appended 'X' on each frame translation for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ ) sA -= refLen (Af[Seqi].len); assert ( Seqi < As ); //-- Get the correct frame, translate startA coordinate to frame for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ ) sA -= (Af[Seqi].len - (i % 3)) / 3 + 1; CurrFrameA = i + 1; assert ( CurrFrameA > 0 && CurrFrameA < 7 ); //-- If the match spans across a frame boundry if ( CurrFrameA < 1 || CurrFrameA > 6 || sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 || sA <= 0 ) { fprintf (stderr, "\nWARNING: A MUM was found extending beyond the boundry of:\n" " Reference sequence '>%s', frame %d\n" "Please file a bug report\n" "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA); continue; } //-- If the match spans across a sequence boundry if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 ) { fprintf (stderr, "\nWARNING: A MUM was found extending beyond the boundry of:\n" " Reference sequence '>%s'\n" "Please file a bug report\n" "Attempting to continue.\n", Af[Seqi].Id); continue; } //-- Check and update the current synteny region if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 || CurrFrameA != FrameA || CurrFrameB != FrameB ) { Found = false; if ( strcmp (IdB, CurrIdB) == 0 ) { //-- Has this header been seen before? for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ ) { if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 ) { assert ( Sp->AfP->len == Af[Seqi].len ); assert ( strcmp (Sp->Bf.Id, IdB) == 0 ); CurrSp = Sp; Found = true; break; } } } else { //-- New B sequence header, process all the old synteny's processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); } strcpy (IdA, Af[Seqi].Id); strcpy (IdB, CurrIdB); FrameA = CurrFrameA; FrameB = CurrFrameB; //-- If not seen yet, create a new synteny region if ( ! Found ) { Asyn.AfP = Af + Seqi; Asyn.Bf.len = -1; Asyn.Bf.Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdB) + 1)); strcpy (Asyn.Bf.Id, IdB); Syntenys.push_back (Asyn); CurrSp = Syntenys.rbegin( ); } //-- Add a new cluster to the current synteny if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); // hack to remove empties Aclu.frameA = FrameA; Aclu.frameB = FrameB; Aclu.wasFused = false; CurrSp->clusters.push_back (Aclu); } else if ( PrevLine == HEADER_LINE ) { //-- Add a new cluster to the current synteny if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); Aclu.frameA = FrameA; Aclu.frameB = FrameB; Aclu.wasFused = false; CurrSp->clusters.push_back (Aclu); } //-- Add a new match to the current cluster // NOTE: A and B coordinates still reference the appropriate // amino acid sequence frame, not the DNA (same with len) if ( len > 1 ) { Amat.sA = sA; Amat.sB = sB; Amat.len = len; CurrSp->clusters.rbegin( )->matches.push_back (Amat); } PrevLine = MATCH_LINE; } } //-- Process the left-over syntenys if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); fclose (QryFile); //-- Free the reference sequences for ( i = 0; i < As; i ++ ) { free (Af[i].Id); free (Af[i].seq); } free (Af); return EXIT_SUCCESS; } void addNewAlignment (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, vector<Match>::iterator Mp) // Create and add a new alignment object based on the current match // and cluster information pointed to by Cp and Mp. { Alignment Align; //-- Initialize the data Align.sA = Mp->sA; Align.sB = Mp->sB; Align.eA = Mp->sA + Mp->len - 1; Align.eB = Mp->sB + Mp->len - 1; Align.frameA = Cp->frameA; Align.frameB = Cp->frameB; Align.delta.clear( ); Align.deltaApos = 0; //-- Add the alignment object Alignments.push_back (Align); return; } bool extendBackward (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, vector<Alignment>::iterator TargetAp, const char * A, const char * B) // Extend an alignment backwards off of the current alignment object. // The current alignment object must be freshly created and consist // only of an exact match (i.e. the delta vector MUST be empty). // If the TargetAp alignment object is reached by the extension, it will // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is // NULL the function will extend as far as possible. It is a strange // and dangerous function because it can delete CurrAp, so edit with // caution. Returns true if TargetAp was reached and merged, else false // Designed only as a subroutine for extendClusters, should be used // nowhere else. { bool target_reached = false; bool overflow_flag = false; bool double_flag = false; vector<long int>::iterator Dp; unsigned int m_o; long int targetA, targetB; m_o = BACKWARD_SEARCH; //-- Set the target coordinates if ( TargetAp != Alignments.end( ) ) { targetA = TargetAp->eA; targetB = TargetAp->eB; } else { targetA = 1; targetB = 1; m_o |= OPTIMAL_BIT; } //-- If alignment is too long, bring with bounds and set overflow_flag true if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH ) { targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1; overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH ) { targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1; if ( overflow_flag ) double_flag = true; else overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( TO_SEQEND && !double_flag ) m_o |= SEQEND_BIT; //-- Attempt to reach the target target_reached = alignSearch (A, CurrAp->sA, targetA, B, CurrAp->sB, targetB, m_o); if ( overflow_flag || TargetAp == Alignments.end( ) ) target_reached = false; if ( target_reached ) { //-- assert that CurrAp is new and at the end of the Alignment vector assert ( CurrAp->delta.empty( ) ); assert ( CurrAp == Alignments.end( ) - 1 ); //-- Merge the two alignment objects extendForward (TargetAp, A, CurrAp->sA, B, CurrAp->sB, FORCED_FORWARD_ALIGN); TargetAp->eA = CurrAp->eA; TargetAp->eB = CurrAp->eB; Alignments.pop_back( ); } else { alignTarget (A, targetA, CurrAp->sA, B, targetB, CurrAp->sB, CurrAp->delta, FORCED_FORWARD_ALIGN); CurrAp->sA = targetA; CurrAp->sB = targetB; //-- Update the deltaApos value for the alignment object for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ ) CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; } return target_reached; } bool extendForward (vector<Alignment>::iterator CurrAp, const char * A, long int targetA, const char * B, long int targetB, unsigned int m_o) // Extend an alignment forwards off the current alignment object until // target or end of sequence is reached, and merge the delta values of the // alignment object with the new delta values generated by the extension. // Return true if the target was reached, else false { long int ValA; long int ValB; unsigned int Di; bool target_reached; bool overflow_flag = false; bool double_flag = false; vector<long int>::iterator Dp; //-- Set Di to the end of the delta vector Di = CurrAp->delta.size( ); //-- Calculate the distance between the start and end positions ValA = targetA - CurrAp->eA + 1; ValB = targetB - CurrAp->eB + 1; //-- If the distance is too long, shrink it and set the overflow_flag if ( ValA > MAX_ALIGNMENT_LENGTH ) { targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1; overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( ValB > MAX_ALIGNMENT_LENGTH ) { targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1; if ( overflow_flag ) double_flag = true; else overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( double_flag ) m_o &= ~SEQEND_BIT; target_reached = alignTarget (A, CurrAp->eA, targetA, B, CurrAp->eB, targetB, CurrAp->delta, m_o); //-- Notify user if alignment was chopped short if ( target_reached && overflow_flag ) target_reached = false; //-- Pick up delta where left off (Di) and merge with new deltas if ( Di < CurrAp->delta.size( ) ) { //-- Merge the deltas ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1; CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA); if ( CurrAp->delta[Di] == 0 || ValA < 0 ) { fprintf(stderr, "ERROR: failed to merge alignments at position %ld\n" " Please file a bug report\n", CurrAp->eA); exit (EXIT_FAILURE); } //-- Update the deltaApos for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ ) CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; } //-- Set the alignment coordinates CurrAp->eA = targetA; CurrAp->eB = targetB; return target_reached; } void extendClusters (vector<Cluster> & Clusters, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) // Connect all the matches in every cluster between sequences A and B. // Also, extend alignments off of the front and back of each cluster to // expand total alignment coverage. When these extensions encounter an // adjacent cluster (in a consistent frame), fuse the two regions to // create one single encompassing region. This routine will create // alignment objects from these extensions and output the resulting // delta information to the delta output file. Also, all coordintes // for the clusters and the alignment regions are translated to reference // the original DNA sequecne. { //-- Sort the clusters (ascending) by their start coordinate in sequence A sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( )); //-- If no delta file is requested if ( ! DO_DELTA ) return; bool target_reached = false; // reached the adjacent match or cluster // the amino acid sequences for A and B char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; long int Alen [7], Blen [7]; // the length of the amino acid sequences int i; unsigned int m_o; long int targetA, targetB; // alignment extension targets in A and B vector<Match>::iterator Mp; // match pointer vector<Cluster>::iterator PrevCp; // where the extensions last left off vector<Cluster>::iterator CurrCp; // the current cluster being extended vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster vector<Alignment> Alignments; // the vector of alignment objects vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align vector<Alignment>::iterator TargetAp; // target align //-- Calculate the length of each amino acid frame translation for ( i = 0; i < 6; i ++ ) { Alen [i + 1] = (Af->len - (i % 3)) / 3; Blen [i + 1] = (Bf->len - (i % 3)) / 3; } //-- Extend each cluster PrevCp = Clusters.begin( ); CurrCp = Clusters.begin( ); while ( CurrCp < Clusters.end( ) ) { if ( DO_EXTEND ) { //-- Ignore if shadowed or already extended if ( ! target_reached ) if ( CurrCp->wasFused || isShadowedCluster (CurrCp, Alignments, CurrAp) ) { CurrCp->wasFused = true; CurrCp = ++ PrevCp; continue; } } //-- Initialize the right amino acid sequence for A and B if need be if ( A [CurrCp->frameA] == NULL ) { A [CurrCp->frameA] = (char *) Safe_malloc ( sizeof(char) * ( (Af->len / 3) + 2) ); A [CurrCp->frameA] [0] = '\0'; Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA], CurrCp->frameA ); } if ( B [CurrCp->frameB] == NULL ) { B [CurrCp->frameB] = (char *) Safe_malloc ( sizeof(char) * ( (Bf->len / 3) + 2) ); B [CurrCp->frameB] [0] = '\0'; Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB], CurrCp->frameB ); } //-- Extend each match in the cluster for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ ) { //-- Try to extend the current match backwards if ( target_reached ) { //-- Merge with the previous match if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB ) { //-- Strip matches until the targeted match is found assert (Mp < CurrCp->matches.end( ) - 1); continue; } CurrAp->eA += Mp->len - 1; CurrAp->eB += Mp->len - 1; } else { //-- Create a new alignment object addNewAlignment (Alignments, CurrCp, Mp); CurrAp = Alignments.end( ) - 1; if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) ) { //-- Target the closest/best alignment object TargetAp = getReverseTargetAlignment (Alignments, CurrAp); //-- Extend the new alignment object backwards if ( extendBackward (Alignments, CurrAp, TargetAp, A [CurrCp->frameA], B [CurrCp->frameB]) ) CurrAp = TargetAp; } } m_o = FORWARD_ALIGN; //-- Try to extend the current match forwards if ( Mp < CurrCp->matches.end( ) - 1 ) { //-- Target the next match in the cluster targetA = (Mp + 1)->sA; targetB = (Mp + 1)->sB; //-- Extend the current alignment object forward target_reached = extendForward (CurrAp, A [CurrCp->frameA], targetA, B [CurrCp->frameB], targetB, m_o); } else if ( DO_EXTEND ) { targetA = Alen [CurrCp->frameA]; targetB = Blen [CurrCp->frameB]; //-- Target the closest/best match in a future cluster TargetCp = getForwardTargetCluster (Clusters, CurrCp, targetA, targetB); if ( TargetCp == Clusters.end( ) ) { m_o |= OPTIMAL_BIT; if ( TO_SEQEND ) m_o |= SEQEND_BIT; } //-- Extend the current alignment object forward target_reached = extendForward (CurrAp, A [CurrCp->frameA], targetA, B [CurrCp->frameB], targetB, m_o); } } if ( TargetCp == Clusters.end( ) ) target_reached = false; CurrCp->wasFused = true; if ( target_reached == false ) CurrCp = ++ PrevCp; else CurrCp = TargetCp; } #ifdef _DEBUG_ASSERT validateData (Alignments, Clusters, Af, Bf); #endif //-- Output the alignment data to the delta file flushAlignments (Alignments, Af, Bf, DeltaFile); for ( i = 0; i < 7; i ++ ) { if ( A [i] != NULL ) free ( A [i] ); if ( B [i] != NULL ) free ( B [i] ); } return; } void flushAlignments (vector<Alignment> & Alignments, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) // Simply output the delta information stored in Alignments to the // given delta file. Free the memory used by Alignments once the // data is successfully output to the file. { vector<Alignment>::iterator Ap; // alignment pointer vector<long int>::iterator Dp; // delta pointer fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len); //-- Generate the error counts parseDelta (Alignments, Af, Bf); for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", transC (Ap->sA, Ap->frameA, Af->len), transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2), transC (Ap->sB, Ap->frameB, Bf->len), transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2), Ap->Errors, Ap->SimErrors, Ap->NonAlphas); for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) fprintf (DeltaFile, "%ld\n", *Dp); fprintf (DeltaFile, "0\n"); Ap->delta.clear( ); } Alignments.clear( ); return; } void flushSyntenys (vector<Synteny> & Syntenys, FILE * ClusterFile) // Simply output the synteny/cluster information generated by the mgaps // program. However, now the coordinates reference their appropriate // reference sequence, and the reference sequecne header is added to // the appropriate lines. Free the memory used by Syntenys once the // data is successfully output to the file. { vector<Synteny>::iterator Sp; // synteny pointer vector<Cluster>::iterator Cp; // cluster pointer vector<Match>::iterator Mp; // match pointer if ( ClusterFile ) { for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) { fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id, Sp->AfP->len, Sp->Bf.len); for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) { fprintf (ClusterFile, "%2d %2d\n", Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA, Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB); for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) { fprintf (ClusterFile, "%8ld %8ld %6ld", transC (Mp->sA, Cp->frameA, Sp->AfP->len), transC (Mp->sB, Cp->frameB, Sp->Bf.len), Mp->len * 3); if ( Mp != Cp->matches.begin( ) ) fprintf (ClusterFile, "%6ld %6ld\n", (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3, (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3); else fprintf (ClusterFile, "%6s %6s\n", "-", "-"); } Cp->matches.clear( ); } Sp->clusters.clear( ); } } else { for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) { for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) { Cp->matches.clear( ); } Sp->clusters.clear( ); } } Syntenys.clear( ); return; } vector<Cluster>::iterator getForwardTargetCluster (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, long int & targetA, long int & targetB) // Return the cluster that is most likely to successfully join (in a // forward direction) with the current cluster. The returned cluster // must contain 1 or more matches that are strictly greater than the end // of the current cluster. The targeted cluster must also be on a // diagonal close enough to the current cluster, so that a connection // could possibly be made by the alignment extender. Also, this targeted // cluster must be consistent in frame with the current cluster. Assumes // clusters have been sorted via AscendingClusterSort. Returns targeted // cluster and stores the target coordinates in targetA and targetB. If no // suitable cluster was found, the function will return NULL and target // A and targetB will remain unchanged. { vector<Match>::iterator Mip; // match iteratrive pointer vector<Cluster>::iterator Cp; // cluster pointer vector<Cluster>::iterator Cip; // cluster iterative pointer long int eA, eB; // possible target long int greater, lesser; // gap sizes between two clusters long int sA = CurrCp->matches.rbegin( )->sA + CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster long int sB = CurrCp->matches.rbegin( )->sB + CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster //-- End of sequences is the default target, set distance accordingly long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB); //-- For all clusters greater than the current cluster (on sequence A) Cp = Clusters.end( ); for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ ) { //-- If the cluster is in the same frame if ( CurrCp->frameA == Cip->frameA && CurrCp->frameB == Cip->frameB ) { eA = Cip->matches.begin( )->sA; eB = Cip->matches.begin( )->sB; //-- If the cluster overlaps the current cluster, strip some matches if ( ( eA < sA || eB < sB ) && Cip->matches.rbegin( )->sA >= sA && Cip->matches.rbegin( )->sB >= sB ) { for ( Mip = Cip->matches.begin( ); Mip < Cip->matches.end( ) && ( eA < sA || eB < sB ); Mip ++ ) { eA = Mip->sA; eB = Mip->sB; } } //-- If the cluster is strictly greater than current cluster if ( eA >= sA && eB >= sB ) { if ( eA - sA > eB - sB ) { greater = eA - sA; lesser = eB - sB; } else { lesser = eA - sA; greater = eB - sB; } //-- If the cluster is close enough if ( greater <= getBreakLen( ) || (lesser) * GOOD_SCORE [getMatrixType( )] + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 ) { Cp = Cip; targetA = eA; targetB = eB; break; } else if ( (greater << 1) - lesser < dist ) { Cp = Cip; targetA = eA; targetB = eB; dist = (greater << 1) - lesser; } } } } return Cp; } vector<Alignment>::iterator getReverseTargetAlignment (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp) // Return the alignment that is most likely to successfully join (in a // reverse direction) with the current alignment. The returned alignment // must be strictly less than the current cluster and be on a diagonal // close enough to the current alignment, so that a connection // could possibly be made by the alignment extender. Also, this targeted // alignment must be consistent in frame with the current alignment. // Assumes clusters have been sorted via AscendingClusterSort and // processed in order, so therefore all alignments are in order by their // start A coordinate. { vector<Alignment>::iterator Ap; // alignment pointer vector<Alignment>::iterator Aip; // alignment iterative pointer long int eA, eB; // possible targets long int greater, lesser; // gap sizes between the two alignments long int sA = CurrAp->sA; // the startA of the current alignment long int sB = CurrAp->sB; // the startB of the current alignment //-- Beginning of sequences is the default target, set distance accordingly long int dist = (sA < sB ? sA : sB); //-- For all alignments less than the current alignment (on sequence A) Ap = Alignments.end( ); for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- ) { //-- If the alignment is on the same direction if ( CurrAp->frameA == Aip->frameA && CurrAp->frameB == Aip->frameB ) { eA = Aip->eA; eB = Aip->eB; //-- If the alignment is strictly greater than current cluster if ( eA <= sA && eB <= sB ) { if ( sA - eA > sB - eB ) { greater = sA - eA; lesser = sB - eB; } else { lesser = sA - eA; greater = sB - eB; } //-- If the cluster is close enough if ( greater <= getBreakLen( ) || (lesser) * GOOD_SCORE [getMatrixType( )] + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 ) { Ap = Aip; break; } else if ( (greater << 1) - lesser < dist ) { Ap = Aip; dist = (greater << 1) - lesser; } } } } return Ap; } bool isShadowedCluster (vector<Cluster>::iterator CurrCp, vector<Alignment> & Alignments, vector<Alignment>::iterator Ap) // Check if the current cluster is shadowed by a previously produced // alignment region. Return true if it is, else false. { vector<Alignment>::iterator Aip; // alignment pointer long int sA = CurrCp->matches.begin( )->sA; long int eA = CurrCp->matches.rbegin( )->sA + CurrCp->matches.rbegin( )->len - 1; long int sB = CurrCp->matches.begin( )->sB; long int eB = CurrCp->matches.rbegin( )->sB + CurrCp->matches.rbegin( )->len - 1; if ( ! Alignments.empty( ) ) // if there are alignments to use { //-- Look backwards in hope of finding a shadowing alignment for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- ) { //-- If in the same frames and shadowing the current cluster, break if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB ) if ( Aip->eA >= eA && Aip->eB >= eB ) if ( Aip->sA <= sA && Aip->sB <= sB ) break; } //-- Return true if the loop was broken, i.e. shadow found if ( Aip >= Alignments.begin( ) ) return true; } //-- Return false if Alignments was empty or loop was not broken return false; } void parseAbort (const char * s) // Abort the program if there was an error in parsing file 's' { fprintf (stderr, "ERROR: Could not parse input from '%s'. \n" "Please check the filename and format, or file a bug report\n", s); exit (EXIT_FAILURE); } void parseDelta (vector<Alignment> & Alignments, const FastaRecord * Af, const FastaRecord *Bf) // Use the delta information to generate the error counts for each // alignment, and fill this information into the data type { // pointers to Af.seq and Bf.seq char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; char ch1, ch2; long int Delta; int Sign; long int i, Apos, Bpos; long int Remain, Total; long int Errors, SimErrors; long int NonAlphas; vector<Alignment>::iterator Ap; vector<long int>::iterator Dp; for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { if ( A [Ap->frameA] == NULL ) { A [Ap->frameA] = (char *) Safe_malloc ( sizeof(char) * ( (Af->len / 3) + 2 ) ); A [Ap->frameA] [0] = '\0'; Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA ); } if ( B [Ap->frameB] == NULL ) { B [Ap->frameB] = (char *) Safe_malloc ( sizeof(char) * ( (Bf->len / 3) + 2) ); B [Ap->frameB] [0] = '\0'; Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB ); } Apos = Ap->sA; Bpos = Ap->sB; Errors = 0; SimErrors = 0; NonAlphas = 0; Remain = Ap->eA - Ap->sA + 1; Total = Remain; //-- For all delta's in this alignment for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) { Delta = *Dp; Sign = Delta > 0 ? 1 : -1; Delta = labs ( Delta ); //-- For all the bases before the next indel for ( i = 1; i < Delta; i ++ ) { ch1 = A [Ap->frameA] [Apos ++]; ch2 = B [Ap->frameB] [Bpos ++]; if ( !isalpha (ch1) ) { ch1 = STOP_CHAR; NonAlphas ++; } if ( !isalpha (ch2) ) { ch2 = STOP_CHAR; NonAlphas ++; } ch1 = toupper(ch1); ch2 = toupper(ch2); if ( 1 > MATCH_SCORE [getMatrixType( )] [ch1 - 'A'] [ch2 - 'A'] ) SimErrors ++; if ( ch1 != ch2 ) Errors ++; } //-- Process the current indel Remain -= i - 1; Errors ++; SimErrors ++; if ( Sign == 1 ) { if ( !isalpha (A [Ap->frameA] [Apos ++]) ) NonAlphas ++; Remain --; } else { if ( !isalpha (B [Ap->frameB] [Bpos ++]) ) NonAlphas ++; Total ++; } } //-- For all the bases after the final indel for ( i = 0; i < Remain; i ++ ) { //-- Score character match and update error counters ch1 = A [Ap->frameA] [Apos ++]; ch2 = B [Ap->frameB] [Bpos ++]; if ( !isalpha (ch1) ) { ch1 = STOP_CHAR; NonAlphas ++; } if ( !isalpha (ch2) ) { ch2 = STOP_CHAR; NonAlphas ++; } ch1 = toupper(ch1); ch2 = toupper(ch2); if ( 1 > MATCH_SCORE [getMatrixType( )] [ch1 - 'A'] [ch2 - 'A'] ) SimErrors ++; if ( ch1 != ch2 ) Errors ++; } Ap->Errors = Errors; Ap->SimErrors = SimErrors; Ap->NonAlphas = NonAlphas; } for ( i = 1; i <= 6; i ++ ) { if ( A [i] != NULL ) free ( A [i] ); A [i] = NULL; if ( B [i] != NULL ) free ( B [i] ); B [i] = NULL; } return; } void processSyntenys (vector<Synteny> & Syntenys, FastaRecord * Af, long int As, FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile) // For each syntenic region with clusters, read in the B sequence and // extend the clusters to expand total alignment coverage. Clusters should // still reference the amino acid concatenations, the translation back to // DNA will be taken care of by the extendClusters function. // processSyntenys should ONLY be called once all the clusters for // the contained syntenic regions have been stored in the data structure. // Frees the memory used by the the syntenic regions once the output of // extendClusters and flushSyntenys has been produced. { FastaRecord Bf; // the current B sequence Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1)); long int InitSize = INIT_SIZE; // the initial memory capacity of B vector<Synteny>::iterator CurrSp; // current synteny pointer //-- Initialize the B sequence storage Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); //-- For all the contained syntenys for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ ) { //-- If no clusters, ignore if ( CurrSp->clusters.empty( ) ) continue; //-- If a B sequence not seen yet, read it in //-- IMPORTANT: The B sequences in the synteny object are assumed to be // ordered as output by mgaps, if they are not in order the program // will fail. (All like tags must be adjacent and in the same order // as the query file) if ( CurrSp == Syntenys.begin( ) || strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 ) { //-- Read in the B sequence while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) ) if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 ) break; if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 ) { fprintf(stderr,"%s\n", CurrSp->Bf.Id); parseAbort ( "Query File" ); } Bf.len = strlen (Bf.seq + 1); } //-- Extend clusters and create the alignment information CurrSp->Bf.len = Bf.len; extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile); } //-- Create the cluster information and clear the Synteny structure flushSyntenys (Syntenys, ClusterFile); free (Bf.Id); free (Bf.seq); return; } inline long int transC (long int Coord, int Frame, long int Len) // Translate an amino acid coordinate to a nucleotide coordinate // relative to the forward strand. The Len parameter should be the // length of the original DNA strand. { assert ( Frame > 0 && Frame < 7 ); if ( Frame > 3 ) return revC ( (Coord * 3) - (3 - (Frame - 3)), Len ); else return (Coord * 3) - (3 - (Frame)); } inline long int refLen (long int ntLen) // Return the length of the concatenated amino acid sequence frames, // including the appended 'X' at the end of each sequence frame for the // given DNA input length. (The returned length will be the length of // the concatenated frames for this sequence as output by prepro.cc) { return (ntLen + 1) << 1; } inline long int revC (long int Coord, long int Len) // Reverse complement the given coordinate for the given length. { assert (Len - Coord + 1 > 0); return (Len - Coord + 1); } void printHelp (const char * s) // Display the program's help information to stderr. { fprintf (stderr, "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); fprintf (stderr, "-b int set the alignment break (give-up) length to int (amino acids)\n" "-d output only match clusters rather than extended alignments\n" "-e do not extend alignments outward from clusters\n" "-h display help information\n" "-t force alignment to ends of sequence if within -b distance\n" "-x type set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n" " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n"); fprintf (stderr, " Input is the output of the \"mgaps\" program from stdin, and\n" "the two original PROmer sequence files passed on the command\n" "line. <pfx> is the prefix to be added to the front of the\n" "output file <pfx>.delta\n" " <pfx>.delta is the alignment object that catalogs the distance\n" "between insertions and deletions. For further information\n" "regarding this file, please refer to the documentation under\n" "the .delta output description.\n\n"); return; } void printUsage (const char * s) // Display the program's usage information to stderr. { fprintf (stderr, "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); fprintf (stderr, "Try '%s -h' for more information.\n", s); return; } void validateData (vector<Alignment> Alignments, vector<Cluster> Clusters, const FastaRecord * Af, const FastaRecord * Bf) // Self test function to check that the cluster and alignment information // is valid { char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; long int Alen [7], Blen [7]; // the length of the amino acid sequences vector<Cluster>::iterator Cp; vector<Match>::iterator Mp; vector<Alignment>::iterator Ap; long int x, y, i; char Xc, Yc; for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ ) { assert ( Cp->wasFused ); //-- Initialize the right amino acid sequence for A and B if need be if ( A [Cp->frameA] == NULL ) { A [Cp->frameA] = (char *) Safe_malloc ( sizeof(char) * ( (Af->len / 3) + 2) ); A [Cp->frameA] [0] = '\0'; Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA], Cp->frameA ); } if ( B [Cp->frameB] == NULL ) { B [Cp->frameB] = (char *) Safe_malloc ( sizeof(char) * ( (Bf->len / 3) + 2) ); B [Cp->frameB] [0] = '\0'; Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB], Cp->frameB ); } for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) { //-- assert for each match in cluster, it is indeed a match x = Mp->sA; y = Mp->sB; for ( i = 0; i < Mp->len; i ++ ) assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] ); //-- assert for each match in cluster, it is contained in an alignment for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB && Ap->eA >= Mp->sA + Mp->len - 1 && Ap->eB >= Mp->sB + Mp->len - 1 ) break; } assert ( Ap < Alignments.end( ) ); } } //-- assert alignments are optimal (quick check if first and last chars equal) for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { assert ( Ap->sA <= Ap->eA ); assert ( Ap->sB <= Ap->eB ); assert ( Ap->sA >= 1 && Ap->sA <= Af->len ); assert ( Ap->eA >= 1 && Ap->eA <= Af->len ); assert ( Ap->sB >= 1 && Ap->sB <= Bf->len ); assert ( Ap->eB >= 1 && Ap->eB <= Bf->len ); Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ? A[Ap->frameA][Ap->sA] : STOP_CHAR); Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ? B[Ap->frameB][Ap->sB] : STOP_CHAR); assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] ); Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ? A[Ap->frameA][Ap->eA] : STOP_CHAR); Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ? B[Ap->frameB][Ap->eB] : STOP_CHAR); assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] ); } for ( i = 0; i < 7; i ++ ) { if ( A [i] != NULL ) free ( A [i] ); if ( B [i] != NULL ) free ( B [i] ); } return; }