Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-aligns.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-aligns.cc Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,635 @@ +//------------------------------------------------------------------------------ +// Programmer: Adam M Phillippy, The Institute for Genomic Research +// File: show-aligns.cc +// Date: 10 / 18 / 2002 +// +// Usage: show-aligns [options] <deltafile> +// Try 'show-aligns -h' for more information +// +// Description: For use in conjunction with the MUMmer package. +// "show-aligns" displays human readable information from the +// .delta output of the "nucmer" and "promer" programs. Outputs +// pairwise alignments to stdout. Works for both nucleotide and +// amino-acid alignments. +// +//------------------------------------------------------------------------------ + +#include "delta.hh" +#include "tigrinc.hh" +#include "translate.hh" +#include "sw_alignscore.hh" +#include <vector> +#include <algorithm> +using namespace std; + +//------------------------------------------------------------- Constants ----// + +const char NUCMER_MISMATCH_CHAR = '^'; +const char NUCMER_MATCH_CHAR = ' '; +const char PROMER_SIM_CHAR = '+'; +const char PROMER_MISMATCH_CHAR = ' '; + +//-- Note: if coord exceeds LINE_PREFIX_LEN - 1 digits, +// increase these accordingly +#define LINE_PREFIX_LEN 11 +#define PREFIX_FORMAT "%-10ld " + +#define DEFAULT_SCREEN_WIDTH 60 +int Screen_Width = DEFAULT_SCREEN_WIDTH; + + + +//------------------------------------------------------ Type Definitions ----// +struct AlignStats + //-- Alignment statistics data structure +{ + long int sQ, eQ, sR, eR; // start and end in Query and Reference + // relative to the directional strand + vector<long int> Delta; // delta information +}; + + + +struct sR_Sort +//-- For sorting alignments by their sR coordinate +{ + bool operator( ) (const AlignStats & pA, const AlignStats & pB) + { + //-- sort sR + if ( pA.sR < pB.sR ) + return true; + else + return false; + } +}; + + + +struct sQ_Sort +//-- For sorting alignments by their sQ coordinate +{ + bool operator( ) (const AlignStats & pA, const AlignStats & pB) + { + //-- sort sQ + if ( pA.sQ < pB.sQ ) + return true; + else + return false; + } +}; + + + + +//------------------------------------------------------ Global Variables ----// +bool isSortByQuery = false; // -q option +bool isSortByReference = false; // -r option + +int DATA_TYPE = NUCMER_DATA; +int MATRIX_TYPE = BLOSUM62; + +char InputFileName [MAX_LINE]; +char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; + + + +//------------------------------------------------- Function Declarations ----// +long int toFwd + (long int coord, long int len, int frame); + +void parseDelta + (vector<AlignStats> & Aligns, char * IdR, char * IdQ); + +void printAlignments + (vector<AlignStats> Aligns, char * R, char * Q); + +void printHelp + (const char * s); + +void printUsage + (const char * s); + +long int revC + (long int coord, long int len); + + + +//-------------------------------------------------- Function Definitions ----// +int main + (int argc, char ** argv) +{ + long int i; + + FILE * RefFile = NULL; + FILE * QryFile = NULL; + + vector<AlignStats> Aligns; + + char * R, * Q; + + long int InitSize = INIT_SIZE; + char Id [MAX_LINE], IdR [MAX_LINE], IdQ [MAX_LINE]; + + //-- Parse the command line arguments + { + int ch, errflg = 0; + optarg = NULL; + + while ( !errflg && ((ch = getopt + (argc, argv, "hqrw:x:")) != EOF) ) + switch (ch) + { + case 'h' : + printHelp (argv[0]); + exit (EXIT_SUCCESS); + break; + + case 'q' : + isSortByQuery = true; + break; + + case 'r' : + isSortByReference = true; + break; + + case 'w' : + Screen_Width = atoi (optarg); + if ( Screen_Width <= LINE_PREFIX_LEN ) + { + fprintf(stderr, + "WARNING: invalid screen width %d, using default\n", + DEFAULT_SCREEN_WIDTH); + Screen_Width = DEFAULT_SCREEN_WIDTH; + } + break; + + case 'x' : + MATRIX_TYPE = atoi (optarg); + if ( MATRIX_TYPE < 1 || MATRIX_TYPE > 3 ) + { + fprintf(stderr, + "WARNING: invalid matrix type %d, using default\n", + MATRIX_TYPE); + MATRIX_TYPE = BLOSUM62; + } + break; + + default : + errflg ++; + } + + if ( errflg > 0 || argc - optind != 3 ) + { + printUsage (argv[0]); + exit (EXIT_FAILURE); + } + + if ( isSortByQuery && isSortByReference ) + fprintf (stderr, + "WARNING: both -r and -q were passed, -q ignored\n"); + } + + strcpy (InputFileName, argv[optind ++]); + strcpy (IdR, argv[optind ++]); + strcpy (IdQ, argv[optind ++]); + + //-- Read in the alignment data + parseDelta (Aligns, IdR, IdQ); + + //-- Find, and read in the reference sequence + RefFile = File_Open (RefFileName, "r"); + InitSize = INIT_SIZE; + R = (char *) Safe_malloc ( sizeof(char) * InitSize ); + while ( Read_String (RefFile, R, InitSize, Id, FALSE) ) + if ( strcmp (Id, IdR) == 0 ) + break; + fclose (RefFile); + if ( strcmp (Id, IdR) != 0 ) + { + fprintf(stderr,"ERROR: Could not find %s in the reference file\n", IdR); + exit (EXIT_FAILURE); + } + + + //-- Find, and read in the query sequence + QryFile = File_Open (QryFileName, "r"); + InitSize = INIT_SIZE; + Q = (char *) Safe_malloc ( sizeof(char) * InitSize ); + while ( Read_String (QryFile, Q, InitSize, Id, FALSE) ) + if ( strcmp (Id, IdQ) == 0 ) + break; + fclose (QryFile); + if ( strcmp (Id, IdQ) != 0 ) + { + fprintf(stderr,"ERROR: Could not find %s in the query file\n", IdQ); + exit (EXIT_FAILURE); + } + + //-- Sort the alignment regions if user passed -r or -q option + if ( isSortByReference ) + sort (Aligns.begin( ), Aligns.end( ), sR_Sort( )); + else if ( isSortByQuery ) + sort (Aligns.begin( ), Aligns.end( ), sQ_Sort( )); + + + //-- Output the alignments to stdout + printf("%s %s\n\n", RefFileName, QryFileName); + for ( i = 0; i < Screen_Width; i ++ ) printf("="); + printf("\n-- Alignments between %s and %s\n\n", IdR, IdQ); + printAlignments (Aligns, R, Q); + printf("\n"); + for ( i = 0; i < Screen_Width; i ++ ) printf("="); + printf("\n"); + + return EXIT_SUCCESS; +} + + + + +long int toFwd + (long int coord, long int len, int frame) + + // Switch relative coordinate to reference forward DNA strand + +{ + long int newc = coord; + + if ( DATA_TYPE == PROMER_DATA ) + newc = newc * 3 - (3 - labs(frame)); + + if ( frame < 0 ) + return revC ( newc, len ); + else + return newc; +} + + + + +void parseDelta + (vector<AlignStats> & Aligns, char * IdR, char * IdQ) + + // Read in the alignments from the desired region + +{ + AlignStats aStats; // single alignment region + bool found = false; + + DeltaReader_t dr; + dr.open (InputFileName); + DATA_TYPE = dr.getDataType( ) == NUCMER_STRING ? + NUCMER_DATA : PROMER_DATA; + strcpy (RefFileName, dr.getReferencePath( ).c_str( )); + strcpy (QryFileName, dr.getQueryPath( ).c_str( )); + + while ( dr.readNext( ) ) + { + if ( dr.getRecord( ).idR == IdR && + dr.getRecord( ).idQ == IdQ ) + { + found = true; + break; + } + } + if ( !found ) + { + fprintf(stderr, "ERROR: Could not find any alignments for %s and %s\n", + IdR, IdQ); + exit (EXIT_FAILURE); + } + + for ( unsigned int i = 0; i < dr.getRecord( ).aligns.size( ); i ++ ) + { + aStats.sR = dr.getRecord( ).aligns[i].sR; + aStats.eR = dr.getRecord( ).aligns[i].eR; + aStats.sQ = dr.getRecord( ).aligns[i].sQ; + aStats.eQ = dr.getRecord( ).aligns[i].eQ; + + aStats.Delta = dr.getRecord( ).aligns[i].deltas; + + //-- Add the new alignment + Aligns.push_back (aStats); + } + dr.close( ); + + return; +} + + + + +void printAlignments + (vector<AlignStats> Aligns, char * R, char * Q) + + // Print the alignments to the screen + +{ + vector<AlignStats>::iterator Ap; + vector<long int>::iterator Dp; + + char * A[7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; + char * B[7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; + int Ai, Bi, i; + + char Buff1 [Screen_Width + 1], + Buff2 [Screen_Width + 1], + Buff3 [Screen_Width + 1]; + + int Sign; + long int Delta; + long int Total, Errors, Remain; + long int Pos; + + long int sR, eR, sQ, eQ; + long int Apos, Bpos; + long int SeqLenR, SeqLenQ; + int frameR, frameQ; + + for ( i = 0; i < LINE_PREFIX_LEN; i ++ ) + Buff3[i] = ' '; + + SeqLenR = strlen (R + 1); + SeqLenQ = strlen (Q + 1); + + if ( DATA_TYPE == NUCMER_DATA ) + { + A[1] = R; + A[4] = (char *) Safe_malloc ( sizeof(char) * (SeqLenR + 2) ); + strcpy ( A[4] + 1, A[1] + 1 ); + A[4][0] = '\0'; + Reverse_Complement ( A[4], 1, SeqLenR ); + + B[1] = Q; + B[4] = (char *) Safe_malloc ( sizeof(char) * (SeqLenQ + 2) ); + strcpy ( B[4] + 1, B[1] + 1 ); + B[4][0] = '\0'; + Reverse_Complement ( B[4], 1, SeqLenQ ); + } + + for ( Ap = Aligns.begin( ); Ap < Aligns.end( ); Ap ++ ) + { + sR = Ap->sR; + eR = Ap->eR; + sQ = Ap->sQ; + eQ = Ap->eQ; + + //-- Get the coords and frame right + frameR = 1; + if ( sR > eR ) + { + sR = revC (sR, SeqLenR); + eR = revC (eR, SeqLenR); + frameR += 3; + } + frameQ = 1; + if ( sQ > eQ ) + { + sQ = revC (sQ, SeqLenQ); + eQ = revC (eQ, SeqLenQ); + frameQ += 3; + } + + if ( DATA_TYPE == PROMER_DATA ) + { + frameR += (sR + 2) % 3; + frameQ += (sQ + 2) % 3; + + //-- Translate the coordinates from DNA to Amino Acid + // remeber that eR and eQ point to the last nucleotide in the codon + sR = (sR + 2) / 3; + eR = eR / 3; + sQ = (sQ + 2) / 3; + eQ = eQ / 3; + } + Ai = frameR; + Bi = frameQ; + if ( frameR > 3 ) + frameR = -(frameR - 3); + if ( frameQ > 3 ) + frameQ = -(frameQ - 3); + + //-- Get the sequence + if ( A[Ai] == NULL ) + { + assert ( DATA_TYPE == PROMER_DATA ); + A[Ai] = (char *) Safe_malloc ( sizeof(char) * ( SeqLenR / 3 + 2 ) ); + A[Ai][0] = '\0'; + Translate_DNA ( R, A[Ai], Ai ); + } + if ( B[Bi] == NULL ) + { + assert ( DATA_TYPE == PROMER_DATA ); + B[Bi] = (char *) Safe_malloc ( sizeof(char) * ( SeqLenQ / 3 + 2 ) ); + B[Bi][0] = '\0'; + Translate_DNA ( Q, B[Bi], Bi ); + } + + + //-- Generate the alignment + printf("-- BEGIN alignment [ %s%d %ld - %ld | %s%d %ld - %ld ]\n\n\n", + frameR > 0 ? "+" : "-", abs(frameR), Ap->sR, Ap->eR, + frameQ > 0 ? "+" : "-", abs(frameQ), Ap->sQ, Ap->eQ); + + Apos = sR; + Bpos = sQ; + + Errors = 0; + Total = 0; + Remain = eR - sR + 1; + + sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR)); + sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ)); + Pos = LINE_PREFIX_LEN; + + for ( Dp = Ap->Delta.begin( ); + Dp < Ap->Delta.end( ) && + *Dp != 0; Dp ++ ) + { + Delta = *Dp; + Sign = Delta > 0 ? 1 : -1; + Delta = labs ( Delta ); + + + //-- For all the bases before the next indel + for ( i = 1; i < Delta; i ++ ) + { + if ( Pos >= Screen_Width ) + { + Buff1[Pos] = Buff2[Pos] = Buff3[Pos] = '\0'; + if ( DATA_TYPE == NUCMER_DATA ) + printf("%s\n%s\n%s\n\n", Buff1, Buff2, Buff3); + else + printf("%s\n%s\n%s\n\n", Buff1, Buff3, Buff2); + sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR)); + sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ)); + Pos = LINE_PREFIX_LEN; + } + + if ( DATA_TYPE == NUCMER_DATA ) + Buff3[Pos] = A[Ai][Apos] == B[Bi][Bpos] ? + NUCMER_MATCH_CHAR : NUCMER_MISMATCH_CHAR; + else if ( A[Ai][Apos] == B[Bi][Bpos] ) + Buff3[Pos] = A[Ai][Apos]; + else + Buff3[Pos] = MATCH_SCORE + [MATRIX_TYPE] + [toupper(A[Ai][Apos]) - 'A'] + [toupper(B[Bi][Bpos]) - 'A'] > 0 ? + PROMER_SIM_CHAR : PROMER_MISMATCH_CHAR; + Buff1[Pos] = A[Ai][Apos ++]; + Buff2[Pos ++] = B[Bi][Bpos ++]; + } + + + //-- For the indel + Remain -= i - 1; + + if ( Pos >= Screen_Width ) + { + Buff1[Pos] = Buff2[Pos] = Buff3[Pos] = '\0'; + if ( DATA_TYPE == NUCMER_DATA ) + printf("%s\n%s\n%s\n\n", Buff1, Buff2, Buff3); + else + printf("%s\n%s\n%s\n\n", Buff1, Buff3, Buff2); + sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR)); + sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ)); + Pos = LINE_PREFIX_LEN; + } + + if ( Sign == 1 ) + { + if ( DATA_TYPE == NUCMER_DATA ) + Buff3[Pos] = NUCMER_MISMATCH_CHAR; + else + Buff3[Pos] = PROMER_MISMATCH_CHAR; + Buff1[Pos] = A[Ai][Apos ++]; + Buff2[Pos ++] = '.'; + Remain --; + } + else + { + if ( DATA_TYPE == NUCMER_DATA ) + Buff3[Pos] = NUCMER_MISMATCH_CHAR; + else + Buff3[Pos] = PROMER_MISMATCH_CHAR; + Buff1[Pos] = '.'; + Buff2[Pos ++] = B[Bi][Bpos ++]; + Total ++; + } + } + + + //-- For all the bases remaining after the last indel + for ( i = 0; i < Remain; i ++ ) + { + if ( Pos >= Screen_Width ) + { + Buff1[Pos] = Buff2[Pos] = Buff3[Pos] = '\0'; + if ( DATA_TYPE == NUCMER_DATA ) + printf("%s\n%s\n%s\n\n", Buff1, Buff2, Buff3); + else + printf("%s\n%s\n%s\n\n", Buff1, Buff3, Buff2); + sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR)); + sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ)); + Pos = LINE_PREFIX_LEN; + } + + if ( DATA_TYPE == NUCMER_DATA ) + Buff3[Pos] = A[Ai][Apos] == B[Bi][Bpos] ? + NUCMER_MATCH_CHAR : NUCMER_MISMATCH_CHAR; + else if ( A[Ai][Apos] == B[Bi][Bpos] ) + Buff3[Pos] = A[Ai][Apos]; + else + Buff3[Pos] = MATCH_SCORE + [MATRIX_TYPE] + [toupper(A[Ai][Apos]) - 'A'] + [toupper(B[Bi][Bpos]) - 'A'] > 0 ? + PROMER_SIM_CHAR : PROMER_MISMATCH_CHAR; + Buff1[Pos] = A[Ai][Apos ++]; + Buff2[Pos ++] = B[Bi][Bpos ++]; + } + + + //-- For the remaining buffered output + if ( Pos > LINE_PREFIX_LEN ) + { + Buff1[Pos] = Buff2[Pos] = Buff3[Pos] = '\0'; + if ( DATA_TYPE == NUCMER_DATA ) + printf("%s\n%s\n%s\n\n", Buff1, Buff2, Buff3); + else + printf("%s\n%s\n%s\n\n", Buff1, Buff3, Buff2); + sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR)); + sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ)); + Pos = LINE_PREFIX_LEN; + } + + printf("\n-- END alignment [ %s%d %ld - %ld | %s%d %ld - %ld ]\n", + frameR > 0 ? "+" : "-", abs(frameR), Ap->sR, Ap->eR, + frameQ > 0 ? "+" : "-", abs(frameQ), Ap->sQ, Ap->eQ); + } + + //-- Free the sequences, except for the originals + for ( i = 0; i < 7; i ++ ) + { + if ( (DATA_TYPE != NUCMER_DATA || i != 1) && A[i] != NULL ) + free ( A[i] ); + if ( (DATA_TYPE != NUCMER_DATA || i != 1) && B[i] != NULL ) + free ( B[i] ); + } + + return; +} + + + + +void printHelp + (const char * s) + + // Display the program's help information to stderr + +{ + fprintf (stderr, + "\nUSAGE: %s [options] <deltafile> <ref ID> <qry ID>\n\n", s); + fprintf (stderr, + "-h Display help information\n" + "-q Sort alignments by the query start coordinate\n" + "-r Sort alignments by the reference start coordinate\n" + "-w int Set the screen width - default is 60\n" + "-x int Set the matrix type - default is 2 (BLOSUM 62),\n" + " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n" + " note: only has effect on amino acid alignments\n\n"); + fprintf (stderr, + " Input is the .delta output of either the \"nucmer\" or the\n" + "\"promer\" program passed on the command line.\n" + " Output is to stdout, and consists of all the alignments between the\n" + "query and reference sequences identified on the command line.\n" + " NOTE: No sorting is done by default, therefore the alignments\n" + "will be ordered as found in the <deltafile> input.\n\n"); + return; +} + + + + +void printUsage + (const char * s) + + // Display the program's usage information to stderr. + +{ + fprintf (stderr, + "\nUSAGE: %s [options] <deltafile> <ref ID> <qry ID>\n\n", s); + fprintf (stderr, "Try '%s -h' for more information.\n", s); + return; +} + + + + +long int revC + (long int coord, long int len) +{ + return len - coord + 1; +}