comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-aligns.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 //------------------------------------------------------------------------------
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
3 // File: show-aligns.cc
4 // Date: 10 / 18 / 2002
5 //
6 // Usage: show-aligns [options] <deltafile>
7 // Try 'show-aligns -h' for more information
8 //
9 // Description: For use in conjunction with the MUMmer package.
10 // "show-aligns" displays human readable information from the
11 // .delta output of the "nucmer" and "promer" programs. Outputs
12 // pairwise alignments to stdout. Works for both nucleotide and
13 // amino-acid alignments.
14 //
15 //------------------------------------------------------------------------------
16
17 #include "delta.hh"
18 #include "tigrinc.hh"
19 #include "translate.hh"
20 #include "sw_alignscore.hh"
21 #include <vector>
22 #include <algorithm>
23 using namespace std;
24
25 //------------------------------------------------------------- Constants ----//
26
27 const char NUCMER_MISMATCH_CHAR = '^';
28 const char NUCMER_MATCH_CHAR = ' ';
29 const char PROMER_SIM_CHAR = '+';
30 const char PROMER_MISMATCH_CHAR = ' ';
31
32 //-- Note: if coord exceeds LINE_PREFIX_LEN - 1 digits,
33 // increase these accordingly
34 #define LINE_PREFIX_LEN 11
35 #define PREFIX_FORMAT "%-10ld "
36
37 #define DEFAULT_SCREEN_WIDTH 60
38 int Screen_Width = DEFAULT_SCREEN_WIDTH;
39
40
41
42 //------------------------------------------------------ Type Definitions ----//
43 struct AlignStats
44 //-- Alignment statistics data structure
45 {
46 long int sQ, eQ, sR, eR; // start and end in Query and Reference
47 // relative to the directional strand
48 vector<long int> Delta; // delta information
49 };
50
51
52
53 struct sR_Sort
54 //-- For sorting alignments by their sR coordinate
55 {
56 bool operator( ) (const AlignStats & pA, const AlignStats & pB)
57 {
58 //-- sort sR
59 if ( pA.sR < pB.sR )
60 return true;
61 else
62 return false;
63 }
64 };
65
66
67
68 struct sQ_Sort
69 //-- For sorting alignments by their sQ coordinate
70 {
71 bool operator( ) (const AlignStats & pA, const AlignStats & pB)
72 {
73 //-- sort sQ
74 if ( pA.sQ < pB.sQ )
75 return true;
76 else
77 return false;
78 }
79 };
80
81
82
83
84 //------------------------------------------------------ Global Variables ----//
85 bool isSortByQuery = false; // -q option
86 bool isSortByReference = false; // -r option
87
88 int DATA_TYPE = NUCMER_DATA;
89 int MATRIX_TYPE = BLOSUM62;
90
91 char InputFileName [MAX_LINE];
92 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
93
94
95
96 //------------------------------------------------- Function Declarations ----//
97 long int toFwd
98 (long int coord, long int len, int frame);
99
100 void parseDelta
101 (vector<AlignStats> & Aligns, char * IdR, char * IdQ);
102
103 void printAlignments
104 (vector<AlignStats> Aligns, char * R, char * Q);
105
106 void printHelp
107 (const char * s);
108
109 void printUsage
110 (const char * s);
111
112 long int revC
113 (long int coord, long int len);
114
115
116
117 //-------------------------------------------------- Function Definitions ----//
118 int main
119 (int argc, char ** argv)
120 {
121 long int i;
122
123 FILE * RefFile = NULL;
124 FILE * QryFile = NULL;
125
126 vector<AlignStats> Aligns;
127
128 char * R, * Q;
129
130 long int InitSize = INIT_SIZE;
131 char Id [MAX_LINE], IdR [MAX_LINE], IdQ [MAX_LINE];
132
133 //-- Parse the command line arguments
134 {
135 int ch, errflg = 0;
136 optarg = NULL;
137
138 while ( !errflg && ((ch = getopt
139 (argc, argv, "hqrw:x:")) != EOF) )
140 switch (ch)
141 {
142 case 'h' :
143 printHelp (argv[0]);
144 exit (EXIT_SUCCESS);
145 break;
146
147 case 'q' :
148 isSortByQuery = true;
149 break;
150
151 case 'r' :
152 isSortByReference = true;
153 break;
154
155 case 'w' :
156 Screen_Width = atoi (optarg);
157 if ( Screen_Width <= LINE_PREFIX_LEN )
158 {
159 fprintf(stderr,
160 "WARNING: invalid screen width %d, using default\n",
161 DEFAULT_SCREEN_WIDTH);
162 Screen_Width = DEFAULT_SCREEN_WIDTH;
163 }
164 break;
165
166 case 'x' :
167 MATRIX_TYPE = atoi (optarg);
168 if ( MATRIX_TYPE < 1 || MATRIX_TYPE > 3 )
169 {
170 fprintf(stderr,
171 "WARNING: invalid matrix type %d, using default\n",
172 MATRIX_TYPE);
173 MATRIX_TYPE = BLOSUM62;
174 }
175 break;
176
177 default :
178 errflg ++;
179 }
180
181 if ( errflg > 0 || argc - optind != 3 )
182 {
183 printUsage (argv[0]);
184 exit (EXIT_FAILURE);
185 }
186
187 if ( isSortByQuery && isSortByReference )
188 fprintf (stderr,
189 "WARNING: both -r and -q were passed, -q ignored\n");
190 }
191
192 strcpy (InputFileName, argv[optind ++]);
193 strcpy (IdR, argv[optind ++]);
194 strcpy (IdQ, argv[optind ++]);
195
196 //-- Read in the alignment data
197 parseDelta (Aligns, IdR, IdQ);
198
199 //-- Find, and read in the reference sequence
200 RefFile = File_Open (RefFileName, "r");
201 InitSize = INIT_SIZE;
202 R = (char *) Safe_malloc ( sizeof(char) * InitSize );
203 while ( Read_String (RefFile, R, InitSize, Id, FALSE) )
204 if ( strcmp (Id, IdR) == 0 )
205 break;
206 fclose (RefFile);
207 if ( strcmp (Id, IdR) != 0 )
208 {
209 fprintf(stderr,"ERROR: Could not find %s in the reference file\n", IdR);
210 exit (EXIT_FAILURE);
211 }
212
213
214 //-- Find, and read in the query sequence
215 QryFile = File_Open (QryFileName, "r");
216 InitSize = INIT_SIZE;
217 Q = (char *) Safe_malloc ( sizeof(char) * InitSize );
218 while ( Read_String (QryFile, Q, InitSize, Id, FALSE) )
219 if ( strcmp (Id, IdQ) == 0 )
220 break;
221 fclose (QryFile);
222 if ( strcmp (Id, IdQ) != 0 )
223 {
224 fprintf(stderr,"ERROR: Could not find %s in the query file\n", IdQ);
225 exit (EXIT_FAILURE);
226 }
227
228 //-- Sort the alignment regions if user passed -r or -q option
229 if ( isSortByReference )
230 sort (Aligns.begin( ), Aligns.end( ), sR_Sort( ));
231 else if ( isSortByQuery )
232 sort (Aligns.begin( ), Aligns.end( ), sQ_Sort( ));
233
234
235 //-- Output the alignments to stdout
236 printf("%s %s\n\n", RefFileName, QryFileName);
237 for ( i = 0; i < Screen_Width; i ++ ) printf("=");
238 printf("\n-- Alignments between %s and %s\n\n", IdR, IdQ);
239 printAlignments (Aligns, R, Q);
240 printf("\n");
241 for ( i = 0; i < Screen_Width; i ++ ) printf("=");
242 printf("\n");
243
244 return EXIT_SUCCESS;
245 }
246
247
248
249
250 long int toFwd
251 (long int coord, long int len, int frame)
252
253 // Switch relative coordinate to reference forward DNA strand
254
255 {
256 long int newc = coord;
257
258 if ( DATA_TYPE == PROMER_DATA )
259 newc = newc * 3 - (3 - labs(frame));
260
261 if ( frame < 0 )
262 return revC ( newc, len );
263 else
264 return newc;
265 }
266
267
268
269
270 void parseDelta
271 (vector<AlignStats> & Aligns, char * IdR, char * IdQ)
272
273 // Read in the alignments from the desired region
274
275 {
276 AlignStats aStats; // single alignment region
277 bool found = false;
278
279 DeltaReader_t dr;
280 dr.open (InputFileName);
281 DATA_TYPE = dr.getDataType( ) == NUCMER_STRING ?
282 NUCMER_DATA : PROMER_DATA;
283 strcpy (RefFileName, dr.getReferencePath( ).c_str( ));
284 strcpy (QryFileName, dr.getQueryPath( ).c_str( ));
285
286 while ( dr.readNext( ) )
287 {
288 if ( dr.getRecord( ).idR == IdR &&
289 dr.getRecord( ).idQ == IdQ )
290 {
291 found = true;
292 break;
293 }
294 }
295 if ( !found )
296 {
297 fprintf(stderr, "ERROR: Could not find any alignments for %s and %s\n",
298 IdR, IdQ);
299 exit (EXIT_FAILURE);
300 }
301
302 for ( unsigned int i = 0; i < dr.getRecord( ).aligns.size( ); i ++ )
303 {
304 aStats.sR = dr.getRecord( ).aligns[i].sR;
305 aStats.eR = dr.getRecord( ).aligns[i].eR;
306 aStats.sQ = dr.getRecord( ).aligns[i].sQ;
307 aStats.eQ = dr.getRecord( ).aligns[i].eQ;
308
309 aStats.Delta = dr.getRecord( ).aligns[i].deltas;
310
311 //-- Add the new alignment
312 Aligns.push_back (aStats);
313 }
314 dr.close( );
315
316 return;
317 }
318
319
320
321
322 void printAlignments
323 (vector<AlignStats> Aligns, char * R, char * Q)
324
325 // Print the alignments to the screen
326
327 {
328 vector<AlignStats>::iterator Ap;
329 vector<long int>::iterator Dp;
330
331 char * A[7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
332 char * B[7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
333 int Ai, Bi, i;
334
335 char Buff1 [Screen_Width + 1],
336 Buff2 [Screen_Width + 1],
337 Buff3 [Screen_Width + 1];
338
339 int Sign;
340 long int Delta;
341 long int Total, Errors, Remain;
342 long int Pos;
343
344 long int sR, eR, sQ, eQ;
345 long int Apos, Bpos;
346 long int SeqLenR, SeqLenQ;
347 int frameR, frameQ;
348
349 for ( i = 0; i < LINE_PREFIX_LEN; i ++ )
350 Buff3[i] = ' ';
351
352 SeqLenR = strlen (R + 1);
353 SeqLenQ = strlen (Q + 1);
354
355 if ( DATA_TYPE == NUCMER_DATA )
356 {
357 A[1] = R;
358 A[4] = (char *) Safe_malloc ( sizeof(char) * (SeqLenR + 2) );
359 strcpy ( A[4] + 1, A[1] + 1 );
360 A[4][0] = '\0';
361 Reverse_Complement ( A[4], 1, SeqLenR );
362
363 B[1] = Q;
364 B[4] = (char *) Safe_malloc ( sizeof(char) * (SeqLenQ + 2) );
365 strcpy ( B[4] + 1, B[1] + 1 );
366 B[4][0] = '\0';
367 Reverse_Complement ( B[4], 1, SeqLenQ );
368 }
369
370 for ( Ap = Aligns.begin( ); Ap < Aligns.end( ); Ap ++ )
371 {
372 sR = Ap->sR;
373 eR = Ap->eR;
374 sQ = Ap->sQ;
375 eQ = Ap->eQ;
376
377 //-- Get the coords and frame right
378 frameR = 1;
379 if ( sR > eR )
380 {
381 sR = revC (sR, SeqLenR);
382 eR = revC (eR, SeqLenR);
383 frameR += 3;
384 }
385 frameQ = 1;
386 if ( sQ > eQ )
387 {
388 sQ = revC (sQ, SeqLenQ);
389 eQ = revC (eQ, SeqLenQ);
390 frameQ += 3;
391 }
392
393 if ( DATA_TYPE == PROMER_DATA )
394 {
395 frameR += (sR + 2) % 3;
396 frameQ += (sQ + 2) % 3;
397
398 //-- Translate the coordinates from DNA to Amino Acid
399 // remeber that eR and eQ point to the last nucleotide in the codon
400 sR = (sR + 2) / 3;
401 eR = eR / 3;
402 sQ = (sQ + 2) / 3;
403 eQ = eQ / 3;
404 }
405 Ai = frameR;
406 Bi = frameQ;
407 if ( frameR > 3 )
408 frameR = -(frameR - 3);
409 if ( frameQ > 3 )
410 frameQ = -(frameQ - 3);
411
412 //-- Get the sequence
413 if ( A[Ai] == NULL )
414 {
415 assert ( DATA_TYPE == PROMER_DATA );
416 A[Ai] = (char *) Safe_malloc ( sizeof(char) * ( SeqLenR / 3 + 2 ) );
417 A[Ai][0] = '\0';
418 Translate_DNA ( R, A[Ai], Ai );
419 }
420 if ( B[Bi] == NULL )
421 {
422 assert ( DATA_TYPE == PROMER_DATA );
423 B[Bi] = (char *) Safe_malloc ( sizeof(char) * ( SeqLenQ / 3 + 2 ) );
424 B[Bi][0] = '\0';
425 Translate_DNA ( Q, B[Bi], Bi );
426 }
427
428
429 //-- Generate the alignment
430 printf("-- BEGIN alignment [ %s%d %ld - %ld | %s%d %ld - %ld ]\n\n\n",
431 frameR > 0 ? "+" : "-", abs(frameR), Ap->sR, Ap->eR,
432 frameQ > 0 ? "+" : "-", abs(frameQ), Ap->sQ, Ap->eQ);
433
434 Apos = sR;
435 Bpos = sQ;
436
437 Errors = 0;
438 Total = 0;
439 Remain = eR - sR + 1;
440
441 sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR));
442 sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ));
443 Pos = LINE_PREFIX_LEN;
444
445 for ( Dp = Ap->Delta.begin( );
446 Dp < Ap->Delta.end( ) &&
447 *Dp != 0; Dp ++ )
448 {
449 Delta = *Dp;
450 Sign = Delta > 0 ? 1 : -1;
451 Delta = labs ( Delta );
452
453
454 //-- For all the bases before the next indel
455 for ( i = 1; i < Delta; i ++ )
456 {
457 if ( Pos >= Screen_Width )
458 {
459 Buff1[Pos] = Buff2[Pos] = Buff3[Pos] = '\0';
460 if ( DATA_TYPE == NUCMER_DATA )
461 printf("%s\n%s\n%s\n\n", Buff1, Buff2, Buff3);
462 else
463 printf("%s\n%s\n%s\n\n", Buff1, Buff3, Buff2);
464 sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR));
465 sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ));
466 Pos = LINE_PREFIX_LEN;
467 }
468
469 if ( DATA_TYPE == NUCMER_DATA )
470 Buff3[Pos] = A[Ai][Apos] == B[Bi][Bpos] ?
471 NUCMER_MATCH_CHAR : NUCMER_MISMATCH_CHAR;
472 else if ( A[Ai][Apos] == B[Bi][Bpos] )
473 Buff3[Pos] = A[Ai][Apos];
474 else
475 Buff3[Pos] = MATCH_SCORE
476 [MATRIX_TYPE]
477 [toupper(A[Ai][Apos]) - 'A']
478 [toupper(B[Bi][Bpos]) - 'A'] > 0 ?
479 PROMER_SIM_CHAR : PROMER_MISMATCH_CHAR;
480 Buff1[Pos] = A[Ai][Apos ++];
481 Buff2[Pos ++] = B[Bi][Bpos ++];
482 }
483
484
485 //-- For the indel
486 Remain -= i - 1;
487
488 if ( Pos >= Screen_Width )
489 {
490 Buff1[Pos] = Buff2[Pos] = Buff3[Pos] = '\0';
491 if ( DATA_TYPE == NUCMER_DATA )
492 printf("%s\n%s\n%s\n\n", Buff1, Buff2, Buff3);
493 else
494 printf("%s\n%s\n%s\n\n", Buff1, Buff3, Buff2);
495 sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR));
496 sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ));
497 Pos = LINE_PREFIX_LEN;
498 }
499
500 if ( Sign == 1 )
501 {
502 if ( DATA_TYPE == NUCMER_DATA )
503 Buff3[Pos] = NUCMER_MISMATCH_CHAR;
504 else
505 Buff3[Pos] = PROMER_MISMATCH_CHAR;
506 Buff1[Pos] = A[Ai][Apos ++];
507 Buff2[Pos ++] = '.';
508 Remain --;
509 }
510 else
511 {
512 if ( DATA_TYPE == NUCMER_DATA )
513 Buff3[Pos] = NUCMER_MISMATCH_CHAR;
514 else
515 Buff3[Pos] = PROMER_MISMATCH_CHAR;
516 Buff1[Pos] = '.';
517 Buff2[Pos ++] = B[Bi][Bpos ++];
518 Total ++;
519 }
520 }
521
522
523 //-- For all the bases remaining after the last indel
524 for ( i = 0; i < Remain; i ++ )
525 {
526 if ( Pos >= Screen_Width )
527 {
528 Buff1[Pos] = Buff2[Pos] = Buff3[Pos] = '\0';
529 if ( DATA_TYPE == NUCMER_DATA )
530 printf("%s\n%s\n%s\n\n", Buff1, Buff2, Buff3);
531 else
532 printf("%s\n%s\n%s\n\n", Buff1, Buff3, Buff2);
533 sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR));
534 sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ));
535 Pos = LINE_PREFIX_LEN;
536 }
537
538 if ( DATA_TYPE == NUCMER_DATA )
539 Buff3[Pos] = A[Ai][Apos] == B[Bi][Bpos] ?
540 NUCMER_MATCH_CHAR : NUCMER_MISMATCH_CHAR;
541 else if ( A[Ai][Apos] == B[Bi][Bpos] )
542 Buff3[Pos] = A[Ai][Apos];
543 else
544 Buff3[Pos] = MATCH_SCORE
545 [MATRIX_TYPE]
546 [toupper(A[Ai][Apos]) - 'A']
547 [toupper(B[Bi][Bpos]) - 'A'] > 0 ?
548 PROMER_SIM_CHAR : PROMER_MISMATCH_CHAR;
549 Buff1[Pos] = A[Ai][Apos ++];
550 Buff2[Pos ++] = B[Bi][Bpos ++];
551 }
552
553
554 //-- For the remaining buffered output
555 if ( Pos > LINE_PREFIX_LEN )
556 {
557 Buff1[Pos] = Buff2[Pos] = Buff3[Pos] = '\0';
558 if ( DATA_TYPE == NUCMER_DATA )
559 printf("%s\n%s\n%s\n\n", Buff1, Buff2, Buff3);
560 else
561 printf("%s\n%s\n%s\n\n", Buff1, Buff3, Buff2);
562 sprintf(Buff1, PREFIX_FORMAT, toFwd (Apos, SeqLenR, frameR));
563 sprintf(Buff2, PREFIX_FORMAT, toFwd (Bpos, SeqLenQ, frameQ));
564 Pos = LINE_PREFIX_LEN;
565 }
566
567 printf("\n-- END alignment [ %s%d %ld - %ld | %s%d %ld - %ld ]\n",
568 frameR > 0 ? "+" : "-", abs(frameR), Ap->sR, Ap->eR,
569 frameQ > 0 ? "+" : "-", abs(frameQ), Ap->sQ, Ap->eQ);
570 }
571
572 //-- Free the sequences, except for the originals
573 for ( i = 0; i < 7; i ++ )
574 {
575 if ( (DATA_TYPE != NUCMER_DATA || i != 1) && A[i] != NULL )
576 free ( A[i] );
577 if ( (DATA_TYPE != NUCMER_DATA || i != 1) && B[i] != NULL )
578 free ( B[i] );
579 }
580
581 return;
582 }
583
584
585
586
587 void printHelp
588 (const char * s)
589
590 // Display the program's help information to stderr
591
592 {
593 fprintf (stderr,
594 "\nUSAGE: %s [options] <deltafile> <ref ID> <qry ID>\n\n", s);
595 fprintf (stderr,
596 "-h Display help information\n"
597 "-q Sort alignments by the query start coordinate\n"
598 "-r Sort alignments by the reference start coordinate\n"
599 "-w int Set the screen width - default is 60\n"
600 "-x int Set the matrix type - default is 2 (BLOSUM 62),\n"
601 " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n"
602 " note: only has effect on amino acid alignments\n\n");
603 fprintf (stderr,
604 " Input is the .delta output of either the \"nucmer\" or the\n"
605 "\"promer\" program passed on the command line.\n"
606 " Output is to stdout, and consists of all the alignments between the\n"
607 "query and reference sequences identified on the command line.\n"
608 " NOTE: No sorting is done by default, therefore the alignments\n"
609 "will be ordered as found in the <deltafile> input.\n\n");
610 return;
611 }
612
613
614
615
616 void printUsage
617 (const char * s)
618
619 // Display the program's usage information to stderr.
620
621 {
622 fprintf (stderr,
623 "\nUSAGE: %s [options] <deltafile> <ref ID> <qry ID>\n\n", s);
624 fprintf (stderr, "Try '%s -h' for more information.\n", s);
625 return;
626 }
627
628
629
630
631 long int revC
632 (long int coord, long int len)
633 {
634 return len - coord + 1;
635 }