comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postnuc.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 //------------------------------------------------------------------------------
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
3 // File: postnuc.cc
4 // Date: 07 / 16 / 2002
5 //
6 // Revision: 08 / 01 / 2002
7 // Added MUM extension functionality
8 //
9 // Purpose: To translate the coordinates referencing the concatenated
10 // reference sequences back to the individual sequences and deal
11 // with any conflict that may have arisen (i.e. a MUM that spans
12 // the boundry between two sequences). Then to extend each cluster
13 // via Smith-Waterman techniques to expand the alignment coverage.
14 // Alignments which encounter each other will be fused to form one
15 // encompasing alignment when appropriate.
16 //
17 // Input: Input is the output of the .mgaps program from stdin. On the
18 // command line, the file names of the two original sequence files
19 // should come first, followed by the prefix <pfx> that should be
20 // placed in front of the two output filenames <pfx>.cluster and
21 // <pfx>.delta
22 //
23 // NOTE: Cluster file is now suppressed by default (see -d option).
24 //
25 // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta.
26 // <pfx>.cluster lists MUM clusters as identified by "mgaps".
27 // However, the clusters now reference their corresponding
28 // sequence and are all listed under headers that specify this
29 // sequence. The <pfx>.delta file is the alignment object that
30 // contains all the information necessary to reproduce the alignments
31 // generated by the MUM extension process. Please refer to the
32 // output file documentation for an in-depth description of these
33 // file formats.
34 //
35 // Usage: postnuc <reference> <query> <pfx> < <input>
36 // Where <reference> and <query> are the original input sequences of
37 // NUCmer and <pfx> is the prefix that should be added to the
38 // beginning of the <pfx>.cluster and <pfx>.delta output filenames.
39 //
40 //------------------------------------------------------------------------------
41
42 //-- NOTE: this option will significantly hamper program performance,
43 // mostly the alignment extension performance (sw_align.h)
44 //#define _DEBUG_ASSERT // self testing assert functions
45
46 #include "tigrinc.hh"
47 #include "sw_align.hh"
48 #include <vector>
49 #include <algorithm>
50 using namespace std;
51
52
53 //------------------------------------------------------ Globals -------------//
54 const signed char FORWARD_CHAR = 1;
55 const signed char REVERSE_CHAR = -1;
56
57
58 bool DO_DELTA = true;
59 bool DO_EXTEND = true;
60 bool TO_SEQEND = false;
61 bool DO_SHADOWS = false;
62
63
64 //------------------------------------------------------ Type Definitions ----//
65 enum LineType
66 //-- The type of input line from <stdin>
67 {
68 NO_LINE, HEADER_LINE, MATCH_LINE
69 };
70
71
72 struct FastaRecord
73 //-- The essential data of a sequence
74 {
75 char * Id; // the fasta ID header tag
76 long int len; // the length of the sequence
77 char * seq; // the sequence data
78 };
79
80
81 struct Match
82 //-- An exact match between two sequences A and B
83 {
84 long int sA, sB, len; // start coordinate in A, in B and the length
85 };
86
87
88 struct Cluster
89 //-- An ordered list of matches between two sequences A and B
90 {
91 bool wasFused; // have the cluster matches been fused yet?
92 signed char dirB; // the query sequence direction
93 // FORWARD_CHAR or REVERSE_CHAR
94 vector<Match> matches; // the ordered set of matches in the cluster
95 };
96
97
98 struct Synteny
99 //-- An ordered list of clusters between two sequences A and B
100 {
101 FastaRecord * AfP; // a pointer to the reference sequence record
102 FastaRecord Bf; // the query sequence record (w/o the sequence)
103 vector<Cluster> clusters; // the ordered set of clusters between A and B
104 };
105
106
107 struct Alignment
108 //-- An alignment object between two sequences A and B
109 {
110 signed char dirB; // the query sequence direction
111 long int sA, sB, eA, eB; // the start in A, B and the end in A, B
112 vector<long int> delta; // the delta values, with NO zero at the end
113 long int deltaApos; // sum of abs(deltas) - #of negative deltas
114 // trust me, it is a very helpful value
115 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
116 };
117
118
119 struct AscendingClusterSort
120 //-- For sorting clusters in ascending order of their sA coordinate
121 {
122 bool operator() (const Cluster & pA, const Cluster & pB)
123 {
124 return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA );
125 }
126 };
127
128
129
130
131 //------------------------------------------------- Function Declarations ----//
132 void addNewAlignment
133 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
134 vector<Match>::iterator Mp);
135
136 bool extendBackward
137 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
138 vector<Alignment>::iterator TargetAp, const char * A, const char * B);
139
140 bool extendForward
141 (vector<Alignment>::iterator Ap, const char * A, long int targetA,
142 const char * B, long int targetB, unsigned int m_o);
143
144 void extendClusters
145 (vector<Cluster> & Clusters,
146 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile);
147
148 void flushAlignments
149 (vector<Alignment> & Alignments,
150 const FastaRecord * Af, const FastaRecord * Bf,
151 FILE * DeltaFile);
152
153 void flushSyntenys
154 (vector<Synteny> & Syntenys, FILE * ClusterFile);
155
156 vector<Cluster>::iterator getForwardTargetCluster
157 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
158 long int & targetA, long int & targetB);
159
160 vector<Alignment>::iterator getReverseTargetAlignment
161 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp);
162
163 bool isShadowedCluster
164 (vector<Cluster>::iterator CurrCp,
165 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap);
166
167 void parseAbort
168 (const char *);
169
170 void parseDelta
171 (vector<Alignment> & Alignments,
172 const FastaRecord * Af, const FastaRecord *Bf);
173
174 void processSyntenys
175 (vector<Synteny> & Syntenys,
176 FastaRecord * Af, long int As,
177 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile);
178
179 inline long int revC
180 (long int Coord, long int Len);
181
182 void printHelp
183 (const char *);
184
185 void printUsage
186 (const char *);
187
188 void validateData
189 (vector<Alignment> Alignments, vector<Cluster> Clusters,
190 const FastaRecord * Af, const FastaRecord * Bf);
191
192
193
194
195 //-------------------------------------------------- Function Definitions ----//
196 int main
197 (int argc, char ** argv)
198 {
199 FastaRecord * Af; // array of all the reference sequences
200
201 vector<Synteny> Syntenys; // vector of all sets of clusters
202 vector<Synteny>::reverse_iterator CurrSp; // current set of clusters
203 vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer
204
205 Synteny Asyn; // a single set of clusters
206 Cluster Aclu; // a single cluster of matches
207 Match Amat; // a single match
208
209 LineType PrevLine; // the current input line
210
211 bool Found; // temporary search flag
212
213 char Line [MAX_LINE]; // a single line of input
214 char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers
215 char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names
216 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
217
218 signed char DirB = FORWARD_CHAR; // the current query strand direction
219
220 long int i; // temporary counter
221 long int Seqi; // current reference sequence index
222 long int InitSize; // initial sequence memory space
223 long int As = 0; // the size of the reference array
224 long int Ac = 100; // the capacity of the reference array
225 long int sA, sB, len; // current match start in A, B and length
226
227 FILE * RefFile, * QryFile; // reference and query input files
228 FILE * ClusterFile, * DeltaFile; // cluster and delta output files
229
230 //-- Set the alignment data type and break length (sw_align.h)
231 setMatrixType ( NUCLEOTIDE );
232 setBreakLen ( 200 );
233 setBanding ( 0 );
234
235 //-- Parse the command line arguments
236 {
237 optarg = NULL;
238 int ch, errflg = 0;
239 while ( !errflg && ((ch = getopt (argc, argv, "dehB:b:st")) != EOF) )
240 switch (ch)
241 {
242 case 'b' :
243 setBreakLen( atoi (optarg) );
244 break;
245
246 case 'B' :
247 setBanding( atoi (optarg) );
248 break;
249
250 case 'd' :
251 DO_DELTA = false;
252 break;
253
254 case 'e' :
255 DO_EXTEND = false;
256 break;
257
258 case 'h' :
259 printHelp (argv[0]);
260 exit (EXIT_SUCCESS);
261 break;
262
263 case 's' :
264 DO_SHADOWS = true;
265 break;
266
267 case 't' :
268 TO_SEQEND = true;
269 break;
270
271 default :
272 errflg ++;
273 }
274
275 if ( errflg > 0 || argc - optind != 3 )
276 {
277 printUsage (argv[0]);
278 exit (EXIT_FAILURE);
279 }
280 }
281
282 //-- Read and create the I/O file names
283 strcpy (RefFileName, argv[optind ++]);
284 strcpy (QryFileName, argv[optind ++]);
285 strcpy (ClusterFileName, argv[optind ++]);
286 strcpy (DeltaFileName, ClusterFileName);
287 strcat (ClusterFileName, ".cluster");
288 strcat (DeltaFileName, ".delta");
289
290 //-- Open all the files
291 RefFile = File_Open (RefFileName, "r");
292 QryFile = File_Open (QryFileName, "r");
293 if ( DO_DELTA )
294 {
295 ClusterFile = NULL;
296 DeltaFile = File_Open (DeltaFileName, "w");
297 }
298 else
299 {
300 ClusterFile = File_Open (ClusterFileName, "w");
301 DeltaFile = NULL;
302 }
303
304 //-- Print the headers of the output files
305 if ( DO_DELTA )
306 fprintf (DeltaFile, "%s %s\nNUCMER\n", RefFileName, QryFileName);
307 else
308 fprintf (ClusterFile, "%s %s\nNUCMER\n", RefFileName, QryFileName);
309
310 //-- Generate the array of the reference sequences
311 InitSize = INIT_SIZE;
312 Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac );
313 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
314 while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) )
315 {
316 Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1));
317 strcpy (Af[As].Id, IdA);
318
319 Af[As].len = strlen (Af[As].seq + 1);
320
321 if ( ++ As >= Ac )
322 {
323 Ac *= 2;
324 Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac );
325 }
326
327 InitSize = INIT_SIZE;
328 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
329 }
330 fclose (RefFile);
331 if ( As <= 0 )
332 parseAbort ( RefFileName );
333
334
335
336 //-- Process the input from <stdin> line by line
337 PrevLine = NO_LINE;
338 IdA[0] = '\0';
339 IdB[0] = '\0';
340 CurrIdB[0] = '\0';
341 while ( fgets(Line, MAX_LINE, stdin) != NULL )
342 {
343
344 //-- If the current line is a fasta HEADER_LINE
345 if ( Line[0] == '>' )
346 {
347 if ( sscanf (Line + 1, "%s", CurrIdB) != 1 )
348 parseAbort ("stdin");
349 DirB = strstr (Line," Reverse") == NULL ? FORWARD_CHAR : REVERSE_CHAR;
350
351 PrevLine = HEADER_LINE;
352 }
353
354
355 //-- If the current line is a cluster HEADER_LINE
356 else if ( Line[0] == '#' )
357 {
358 PrevLine = HEADER_LINE;
359 }
360
361
362 //-- If the current line is a MATCH_LINE
363 else
364 {
365 if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 )
366 parseAbort ("stdin");
367
368 //-- Re-map the reference coordinate back to its original sequence
369 for ( Seqi = 0; sA > Af[Seqi].len; Seqi ++ )
370 sA -= Af[Seqi].len + 1; // extra +1 for the x at the end of each seq
371 if ( Seqi >= As )
372 {
373 fprintf (stderr,
374 "ERROR: A MUM was found with a start coordinate greater than\n"
375 " the sequence length, a serious error has occured.\n"
376 " Please file a bug report\n");
377 exit (EXIT_FAILURE);
378 }
379
380 //-- If the match spans across a sequence boundry
381 if ( sA + len - 1 > Af[Seqi].len || sA <= 0)
382 {
383 fprintf (stderr,
384 "WARNING: A MUM was found extending beyond the boundry of:\n"
385 " Reference sequence '>%s'\n\n"
386 "Please check that the '-n' option is activated on 'mummer2'\n"
387 "and try again, or file a bug report\n"
388 "Attempting to continue.\n", Af[Seqi].Id);
389 continue;
390 }
391
392 //-- Check and update the current synteny region
393 if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 )
394 {
395 Found = false;
396 if ( strcmp (IdB, CurrIdB) == 0 )
397 {
398 //-- Has this header been seen before?
399 for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ )
400 {
401 if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 )
402 {
403 if ( Sp->AfP->len != Af[Seqi].len )
404 {
405 fprintf (stderr,
406 "ERROR: The reference file may contain"
407 " sequences with non-unique\n"
408 " header Ids, please check your input"
409 " files and try again\n");
410 exit (EXIT_FAILURE);
411 }
412 assert ( strcmp (Sp->Bf.Id, IdB) == 0 );
413 CurrSp = Sp;
414 Found = true;
415 break;
416 }
417 }
418 }
419 else
420 {
421 //-- New B sequence header, process all the old synteny's
422 processSyntenys (Syntenys, Af, As,
423 QryFile, ClusterFile, DeltaFile);
424 }
425
426 strcpy (IdA, Af[Seqi].Id);
427 strcpy (IdB, CurrIdB);
428
429 //-- If not seen yet, create a new synteny region
430 if ( ! Found )
431 {
432 Asyn.AfP = Af + Seqi;
433 Asyn.Bf.len = -1;
434 Asyn.Bf.Id = (char *) Safe_malloc
435 (sizeof(char) * (strlen(IdB) + 1));
436 strcpy (Asyn.Bf.Id, IdB);
437
438 Syntenys.push_back (Asyn);
439 CurrSp = Syntenys.rbegin( );
440 }
441
442 //-- Add a new cluster to the current synteny
443 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
444 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
445 CurrSp->clusters.pop_back( ); // hack to remove empties
446 Aclu.wasFused = false;
447 Aclu.dirB = DirB;
448 CurrSp->clusters.push_back (Aclu);
449 }
450 else if ( PrevLine == HEADER_LINE )
451 {
452 //-- Add a new cluster to the current synteny
453 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
454 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
455 CurrSp->clusters.pop_back( );
456 Aclu.wasFused = false;
457 Aclu.dirB = DirB;
458 CurrSp->clusters.push_back (Aclu);
459 }
460
461 //-- Add a new match to the current cluster
462 if ( len > 1 )
463 {
464 Amat.sA = sA;
465 Amat.sB = sB;
466 Amat.len = len;
467 CurrSp->clusters.rbegin( )->matches.push_back (Amat);
468 }
469
470 PrevLine = MATCH_LINE;
471 }
472 }
473
474 //-- Process the left-over syntenys
475 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
476 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
477 CurrSp->clusters.pop_back( );
478
479 processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile);
480 fclose (QryFile);
481
482 //-- Free the reference sequences
483 for ( i = 0; i < As; i ++ )
484 {
485 free (Af[i].Id);
486 free (Af[i].seq);
487 }
488 free (Af);
489
490 return EXIT_SUCCESS;
491 }
492
493
494
495
496 void addNewAlignment
497 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
498 vector<Match>::iterator Mp)
499
500 // Create and add a new alignment object based on the current match
501 // and cluster information pointed to by Cp and Mp.
502
503 {
504 Alignment Align;
505
506 //-- Initialize the data
507 Align.sA = Mp->sA;
508 Align.sB = Mp->sB;
509 Align.eA = Mp->sA + Mp->len - 1;
510 Align.eB = Mp->sB + Mp->len - 1;
511 Align.dirB = Cp->dirB;
512 Align.delta.clear( );
513 Align.deltaApos = 0;
514
515 //-- Add the alignment object
516 Alignments.push_back (Align);
517
518 return;
519 }
520
521
522
523
524 bool extendBackward
525 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
526 vector<Alignment>::iterator TargetAp, const char * A, const char * B)
527
528 // Extend an alignment backwards off of the current alignment object.
529 // The current alignment object must be freshly created and consist
530 // only of an exact match (i.e. the delta vector MUST be empty).
531 // If the TargetAp alignment object is reached by the extension, it will
532 // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is
533 // NULL the function will extend as far as possible. It is a strange
534 // and dangerous function because it can delete CurrAp, so edit with
535 // caution. Returns true if TargetAp was reached and merged, else false
536 // Designed only as a subroutine for extendClusters, should be used
537 // nowhere else.
538
539 {
540 bool target_reached = false;
541 bool overflow_flag = false;
542 bool double_flag = false;
543
544 vector<long int>::iterator Dp;
545
546 unsigned int m_o;
547 long int targetA, targetB;
548
549 m_o = BACKWARD_SEARCH;
550
551 //-- Set the target coordinates
552 if ( TargetAp != Alignments.end( ) )
553 {
554 targetA = TargetAp->eA;
555 targetB = TargetAp->eB;
556 }
557 else
558 {
559 targetA = 1;
560 targetB = 1;
561 m_o |= OPTIMAL_BIT;
562 }
563
564 //-- If alignment is too long, bring with bounds and set overflow_flag true
565 if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH )
566 {
567 targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1;
568 overflow_flag = true;
569 m_o |= OPTIMAL_BIT;
570 }
571 if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH )
572 {
573 targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1;
574 if ( overflow_flag )
575 double_flag = true;
576 else
577 overflow_flag = true;
578 m_o |= OPTIMAL_BIT;
579 }
580
581
582 if ( TO_SEQEND && !double_flag )
583 m_o |= SEQEND_BIT;
584
585
586 //-- Attempt to reach the target
587 target_reached = alignSearch (A, CurrAp->sA, targetA,
588 B, CurrAp->sB, targetB, m_o);
589
590 if ( overflow_flag || TargetAp == Alignments.end( ) )
591 target_reached = false;
592
593 if ( target_reached )
594 {
595 //-- Merge the two alignment objects
596 extendForward (TargetAp, A, CurrAp->sA,
597 B, CurrAp->sB, FORCED_FORWARD_ALIGN);
598 TargetAp->eA = CurrAp->eA;
599 TargetAp->eB = CurrAp->eB;
600 Alignments.pop_back( );
601 }
602 else
603 {
604 alignTarget (A, targetA, CurrAp->sA,
605 B, targetB, CurrAp->sB,
606 CurrAp->delta, FORCED_FORWARD_ALIGN);
607 CurrAp->sA = targetA;
608 CurrAp->sB = targetB;
609
610 //-- Update the deltaApos value for the alignment object
611 for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ )
612 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
613 }
614
615 return target_reached;
616 }
617
618
619
620
621 bool extendForward
622 (vector<Alignment>::iterator CurrAp, const char * A, long int targetA,
623 const char * B, long int targetB, unsigned int m_o)
624
625 // Extend an alignment forwards off the current alignment object until
626 // target or end of sequence is reached, and merge the delta values of the
627 // alignment object with the new delta values generated by the extension.
628 // Return true if the target was reached, else false
629
630 {
631 long int ValA;
632 long int ValB;
633 unsigned int Di;
634 bool target_reached;
635 bool overflow_flag = false;
636 bool double_flag = false;
637 vector<long int>::iterator Dp;
638
639 //-- Set Di to the end of the delta vector
640 Di = CurrAp->delta.size( );
641
642 //-- Calculate the distance between the start and end positions
643 ValA = targetA - CurrAp->eA + 1;
644 ValB = targetB - CurrAp->eB + 1;
645
646 //-- If the distance is too long, shrink it and set the overflow_flag
647 if ( ValA > MAX_ALIGNMENT_LENGTH )
648 {
649 targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1;
650 overflow_flag = true;
651 m_o |= OPTIMAL_BIT;
652 }
653 if ( ValB > MAX_ALIGNMENT_LENGTH )
654 {
655 targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1;
656 if ( overflow_flag )
657 double_flag = true;
658 else
659 overflow_flag = true;
660 m_o |= OPTIMAL_BIT;
661 }
662
663 if ( double_flag )
664 m_o &= ~SEQEND_BIT;
665
666 target_reached = alignTarget (A, CurrAp->eA, targetA,
667 B, CurrAp->eB, targetB,
668 CurrAp->delta, m_o);
669
670 //-- Notify user if alignment was chopped short
671 if ( target_reached && overflow_flag )
672 target_reached = false;
673
674 //-- Pick up delta where left off (Di) and merge with new deltas
675 if ( Di < CurrAp->delta.size( ) )
676 {
677 //-- Merge the deltas
678 ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1;
679 CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA);
680 if ( CurrAp->delta[Di] == 0 || ValA < 0 )
681 {
682 fprintf(stderr,
683 "ERROR: failed to merge alignments at position %ld\n"
684 " Please file a bug report\n",
685 CurrAp->eA);
686 exit (EXIT_FAILURE);
687 }
688
689 //-- Update the deltaApos
690 for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ )
691 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
692 }
693
694 //-- Set the alignment coordinates
695 CurrAp->eA = targetA;
696 CurrAp->eB = targetB;
697
698 return target_reached;
699 }
700
701
702
703
704 void extendClusters
705 (vector<Cluster> & Clusters,
706 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile)
707
708 // Connect all the matches in every cluster between sequences A and B.
709 // Also, extend alignments off of the front and back of each cluster to
710 // expand total alignment coverage. When these extensions encounter an
711 // adjacent cluster, fuse the two regions to create one single
712 // encompassing region. This routine will create alignment objects from
713 // these extensions and output the resulting delta information to the
714 // delta output file.
715
716 {
717 //-- Sort the clusters (ascending) by their start coordinate in sequence A
718 sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( ));
719
720
721 //-- If no delta file is requested
722 if ( ! DO_DELTA )
723 return;
724
725
726 bool target_reached = false; // reached the adjacent match or cluster
727
728 char * A, * B; // the sequences A and B
729 char * Brev = NULL; // the reverse complement of B
730
731 unsigned int m_o;
732 long int targetA, targetB; // alignment extension targets in A and B
733
734 vector<Match>::iterator Mp; // match pointer
735
736 vector<Cluster>::iterator PrevCp; // where the extensions last left off
737 vector<Cluster>::iterator CurrCp; // the current cluster being extended
738 vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster
739
740 vector<Alignment> Alignments; // the vector of alignment objects
741 vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align
742 vector<Alignment>::iterator TargetAp; // target align
743
744
745 //-- Extend each cluster
746 A = Af->seq;
747 PrevCp = Clusters.begin( );
748 CurrCp = Clusters.begin( );
749 while ( CurrCp < Clusters.end( ) )
750 {
751 if ( DO_EXTEND )
752 {
753 //-- Ignore if shadowed or already extended
754 if ( ! target_reached )
755 if ( CurrCp->wasFused
756 ||
757 (!DO_SHADOWS &&
758 isShadowedCluster (CurrCp, Alignments, CurrAp)) )
759 {
760 CurrCp->wasFused = true;
761 CurrCp = ++ PrevCp;
762 continue;
763 }
764 }
765
766 //-- Pick the right directional sequence for B
767 if ( CurrCp->dirB == FORWARD_CHAR )
768 B = Bf->seq;
769 else if ( Brev != NULL )
770 B = Brev;
771 else
772 {
773 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
774 strcpy ( Brev + 1, Bf->seq + 1 );
775 Brev[0] = '\0';
776 Reverse_Complement (Brev, 1, Bf->len);
777 B = Brev;
778 }
779
780 //-- Extend each match in the cluster
781 for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ )
782 {
783 //-- Try to extend the current match backwards
784 if ( target_reached )
785 {
786 //-- Merge with the previous match
787 if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB )
788 {
789 if ( Mp >= CurrCp->matches.end( ) - 1 )
790 {
791 fprintf (stderr,
792 "ERROR: Target match does not exist, please\n"
793 " file a bug report\n");
794 exit (EXIT_FAILURE);
795 }
796 continue;
797 }
798 CurrAp->eA += Mp->len - 1;
799 CurrAp->eB += Mp->len - 1;
800 }
801 else
802 {
803 //-- Create a new alignment object
804 addNewAlignment (Alignments, CurrCp, Mp);
805 CurrAp = Alignments.end( ) - 1;
806
807 if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) )
808 {
809 //-- Target the closest/best alignment object
810 TargetAp = getReverseTargetAlignment (Alignments, CurrAp);
811
812 //-- Extend the new alignment object backwards
813 if ( extendBackward (Alignments, CurrAp, TargetAp, A, B) )
814 CurrAp = TargetAp;
815 }
816 }
817
818 m_o = FORWARD_ALIGN;
819
820 //-- Try to extend the current match forwards
821 if ( Mp < CurrCp->matches.end( ) - 1 )
822 {
823 //-- Target the next match in the cluster
824 targetA = (Mp + 1)->sA;
825 targetB = (Mp + 1)->sB;
826
827 //-- Extend the current alignment object forward
828 target_reached = extendForward (CurrAp, A, targetA,
829 B, targetB, m_o);
830 }
831 else if ( DO_EXTEND )
832 {
833 targetA = Af->len;
834 targetB = Bf->len;
835
836 //-- Target the closest/best match in a future cluster
837 TargetCp = getForwardTargetCluster (Clusters, CurrCp,
838 targetA, targetB);
839 if ( TargetCp == Clusters.end( ) )
840 {
841 m_o |= OPTIMAL_BIT;
842 if ( TO_SEQEND )
843 m_o |= SEQEND_BIT;
844 }
845
846 //-- Extend the current alignment object forward
847 target_reached = extendForward (CurrAp, A, targetA,
848 B, targetB, m_o);
849 }
850 }
851 if ( TargetCp == Clusters.end( ) )
852 target_reached = false;
853
854 CurrCp->wasFused = true;
855
856 if ( target_reached == false )
857 CurrCp = ++ PrevCp;
858 else
859 CurrCp = TargetCp;
860 }
861
862 #ifdef _DEBUG_ASSERT
863 validateData (Alignments, Clusters, Af, Bf);
864 #endif
865
866 //-- Output the alignment data to the delta file
867 flushAlignments (Alignments, Af, Bf, DeltaFile);
868
869 if ( Brev != NULL )
870 free (Brev);
871
872 return;
873 }
874
875
876
877
878 void flushAlignments
879 (vector<Alignment> & Alignments,
880 const FastaRecord * Af, const FastaRecord * Bf,
881 FILE * DeltaFile)
882
883 // Simply output the delta information stored in Alignments to the
884 // given delta file. Free the memory used by Alignments once the
885 // data is successfully output to the file.
886
887 {
888 vector<Alignment>::iterator Ap; // alignment pointer
889 vector<long int>::iterator Dp; // delta pointer
890
891 fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len);
892
893 //-- Generate the error counts
894 parseDelta (Alignments, Af, Bf);
895
896 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
897 {
898 if ( Ap->dirB == FORWARD_CHAR )
899 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
900 Ap->sA, Ap->eA,
901 Ap->sB, Ap->eB,
902 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
903 else
904 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
905 Ap->sA, Ap->eA,
906 revC (Ap->sB, Bf->len), revC (Ap->eB, Bf->len),
907 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
908
909 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
910 fprintf (DeltaFile, "%ld\n", *Dp);
911 fprintf (DeltaFile, "0\n");
912
913 Ap->delta.clear( );
914 }
915
916 Alignments.clear( );
917
918 return;
919 }
920
921
922
923
924 void flushSyntenys
925 (vector<Synteny> & Syntenys, FILE * ClusterFile)
926
927 // Simply output the synteny/cluster information generated by the mgaps
928 // program. However, now the coordinates reference their appropriate
929 // reference sequence, and the reference sequecne header is added to
930 // the appropriate lines. Free the memory used by Syntenys once the
931 // data is successfully output to the file.
932
933 {
934 vector<Synteny>::iterator Sp; // synteny pointer
935 vector<Cluster>::iterator Cp; // cluster pointer
936 vector<Match>::iterator Mp; // match pointer
937
938 if ( ClusterFile )
939 {
940 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
941 {
942 fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id,
943 Sp->AfP->len, Sp->Bf.len);
944 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
945 {
946 fprintf (ClusterFile, "%2d %2d\n", FORWARD_CHAR, Cp->dirB);
947 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
948 {
949 if ( Cp->dirB == FORWARD_CHAR )
950 fprintf (ClusterFile, "%8ld %8ld %6ld",
951 Mp->sA, Mp->sB, Mp->len);
952 else
953 fprintf (ClusterFile, "%8ld %8ld %6ld",
954 Mp->sA, revC (Mp->sB, Sp->Bf.len), Mp->len);
955 if ( Mp != Cp->matches.begin( ) )
956 fprintf (ClusterFile, "%6ld %6ld\n",
957 Mp->sA - (Mp - 1)->sA - (Mp - 1)->len,
958 Mp->sB - (Mp - 1)->sB - (Mp - 1)->len);
959 else
960 fprintf (ClusterFile, "%6s %6s\n", "-", "-");
961 }
962 Cp->matches.clear( );
963 }
964 Sp->clusters.clear( );
965 }
966 }
967 else
968 {
969 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
970 {
971 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
972 {
973 Cp->matches.clear( );
974 }
975 Sp->clusters.clear( );
976 }
977 }
978
979 Syntenys.clear( );
980
981 return;
982 }
983
984
985
986
987 vector<Cluster>::iterator getForwardTargetCluster
988 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
989 long int & targetA, long int & targetB)
990
991 // Return the cluster that is most likely to successfully join (in a
992 // forward direction) with the current cluster. The returned cluster
993 // must contain 1 or more matches that are strictly greater than the end
994 // of the current cluster. The targeted cluster must also be on a
995 // diagonal close enough to the current cluster, so that a connection
996 // could possibly be made by the alignment extender. Assumes clusters
997 // have been sorted via AscendingClusterSort. Returns targeted cluster
998 // and stores the target coordinates in targetA and targetB. If no
999 // suitable cluster was found, the function will return NULL and target
1000 // A and targetB will remain unchanged.
1001
1002 {
1003 vector<Match>::iterator Mip; // match iteratrive pointer
1004 vector<Cluster>::iterator Cp; // cluster pointer
1005 vector<Cluster>::iterator Cip; // cluster iterative pointer
1006 long int eA, eB; // possible target
1007 long int greater, lesser; // gap sizes between two clusters
1008 long int sA = CurrCp->matches.rbegin( )->sA +
1009 CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster
1010 long int sB = CurrCp->matches.rbegin( )->sB +
1011 CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster
1012
1013 //-- End of sequences is the default target, set distance accordingly
1014 long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB);
1015
1016 //-- For all clusters greater than the current cluster (on sequence A)
1017 Cp = Clusters.end( );
1018 for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ )
1019 {
1020 //-- If the cluster is on the same direction
1021 if ( CurrCp->dirB == Cip->dirB )
1022 {
1023 eA = Cip->matches.begin( )->sA;
1024 eB = Cip->matches.begin( )->sB;
1025
1026 //-- If the cluster overlaps the current cluster, strip some matches
1027 if ( ( eA < sA || eB < sB ) &&
1028 Cip->matches.rbegin( )->sA >= sA &&
1029 Cip->matches.rbegin( )->sB >= sB )
1030 {
1031 for ( Mip = Cip->matches.begin( );
1032 Mip < Cip->matches.end( ) && ( eA < sA || eB < sB );
1033 Mip ++ )
1034 {
1035 eA = Mip->sA;
1036 eB = Mip->sB;
1037 }
1038 }
1039
1040 //-- If the cluster is strictly greater than current cluster
1041 if ( eA >= sA && eB >= sB )
1042 {
1043 if ( eA - sA > eB - sB )
1044 {
1045 greater = eA - sA;
1046 lesser = eB - sB;
1047 }
1048 else
1049 {
1050 lesser = eA - sA;
1051 greater = eB - sB;
1052 }
1053
1054 //-- If the cluster is close enough
1055 if ( greater < getBreakLen( ) ||
1056 (lesser) * GOOD_SCORE [getMatrixType( )] +
1057 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 )
1058 {
1059 Cp = Cip;
1060 targetA = eA;
1061 targetB = eB;
1062 break;
1063 }
1064 else if ( (greater << 1) - lesser < dist )
1065 {
1066 Cp = Cip;
1067 targetA = eA;
1068 targetB = eB;
1069 dist = (greater << 1) - lesser;
1070 }
1071 }
1072 }
1073 }
1074
1075
1076 return Cp;
1077 }
1078
1079
1080
1081
1082 vector<Alignment>::iterator getReverseTargetAlignment
1083 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp)
1084
1085 // Return the alignment that is most likely to successfully join (in a
1086 // reverse direction) with the current alignment. The returned alignment
1087 // must be strictly less than the current cluster and be on a diagonal
1088 // close enough to the current alignment, so that a connection
1089 // could possibly be made by the alignment extender. Assumes clusters
1090 // have been sorted via AscendingClusterSort and processed in order, so
1091 // therefore all alignments are in order by their start A coordinate.
1092
1093 {
1094 vector<Alignment>::iterator Ap; // alignment pointer
1095 vector<Alignment>::iterator Aip; // alignment iterative pointer
1096 long int eA, eB; // possible targets
1097 long int greater, lesser; // gap sizes between the two alignments
1098 long int sA = CurrAp->sA; // the startA of the current alignment
1099 long int sB = CurrAp->sB; // the startB of the current alignment
1100
1101 //-- Beginning of sequences is the default target, set distance accordingly
1102 long int dist = (sA < sB ? sA : sB);
1103
1104 //-- For all alignments less than the current alignment (on sequence A)
1105 Ap = Alignments.end( );
1106 for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- )
1107 {
1108 //-- If the alignment is on the same direction
1109 if ( CurrAp->dirB == Aip->dirB )
1110 {
1111 eA = Aip->eA;
1112 eB = Aip->eB;
1113
1114 //-- If the alignment is strictly less than current cluster
1115 if ( eA <= sA && eB <= sB )
1116 {
1117 if ( sA - eA > sB - eB )
1118 {
1119 greater = sA - eA;
1120 lesser = sB - eB;
1121 }
1122 else
1123 {
1124 lesser = sA - eA;
1125 greater = sB - eB;
1126 }
1127
1128 //-- If the cluster is close enough
1129 if ( greater < getBreakLen( ) ||
1130 (lesser) * GOOD_SCORE [getMatrixType( )] +
1131 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 )
1132 {
1133 Ap = Aip;
1134 break;
1135 }
1136 else if ( (greater << 1) - lesser < dist )
1137 {
1138 Ap = Aip;
1139 dist = (greater << 1) - lesser;
1140 }
1141 }
1142 }
1143 }
1144
1145
1146 return Ap;
1147 }
1148
1149
1150
1151
1152 bool isShadowedCluster
1153 (vector<Cluster>::iterator CurrCp,
1154 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap)
1155
1156 // Check if the current cluster is shadowed by a previously produced
1157 // alignment region. Return true if it is, else false.
1158
1159 {
1160 vector<Alignment>::iterator Aip; // alignment pointer
1161
1162 long int sA = CurrCp->matches.begin( )->sA;
1163 long int eA = CurrCp->matches.rbegin( )->sA +
1164 CurrCp->matches.rbegin( )->len - 1;
1165 long int sB = CurrCp->matches.begin( )->sB;
1166 long int eB = CurrCp->matches.rbegin( )->sB +
1167 CurrCp->matches.rbegin( )->len - 1;
1168
1169 if ( ! Alignments.empty( ) ) // if there are alignments to use
1170 {
1171 //-- Look backwards in hope of finding a shadowing alignment
1172 for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- )
1173 {
1174 //-- If in the same direction and shadowing the current cluster, break
1175 if ( Aip->dirB == CurrCp->dirB )
1176 if ( Aip->eA >= eA && Aip->eB >= eB )
1177 if ( Aip->sA <= sA && Aip->sB <= sB )
1178 break;
1179 }
1180
1181 //-- Return true if the loop was broken, i.e. shadow found
1182 if ( Aip >= Alignments.begin( ) )
1183 return true;
1184 }
1185
1186 //-- Return false if Alignments was empty or loop was not broken
1187 return false;
1188 }
1189
1190
1191
1192
1193 void parseAbort
1194 (const char * s)
1195
1196 // Abort the program if there was an error in parsing file 's'
1197
1198 {
1199 fprintf (stderr,
1200 "ERROR: Could not parse input from '%s'. \n"
1201 "Please check the filename and format, or file a bug report\n", s);
1202 exit (EXIT_FAILURE);
1203 }
1204
1205
1206
1207
1208 void parseDelta
1209 (vector<Alignment> & Alignments,
1210 const FastaRecord * Af, const FastaRecord *Bf)
1211
1212 // Use the delta information to generate the error counts for each
1213 // alignment, and fill this information into the data type
1214
1215 {
1216 char * A, * B, * Brev = NULL;
1217 char ch1, ch2;
1218 long int Delta;
1219 int Sign;
1220 long int i, Apos, Bpos;
1221 long int Remain, Total;
1222 long int Errors, SimErrors;
1223 long int NonAlphas;
1224 vector<Alignment>::iterator Ap;
1225 vector<long int>::iterator Dp;
1226
1227 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
1228 {
1229 A = Af->seq;
1230 B = Bf->seq;
1231
1232 if ( Ap->dirB == REVERSE_CHAR )
1233 {
1234 if ( Brev == NULL )
1235 {
1236 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
1237 strcpy ( Brev + 1, Bf->seq + 1 );
1238 Brev[0] = '\0';
1239 Reverse_Complement (Brev, 1, Bf->len);
1240 B = Brev;
1241 }
1242 B = Brev;
1243 }
1244
1245 Apos = Ap->sA;
1246 Bpos = Ap->sB;
1247
1248 Errors = 0;
1249 SimErrors = 0;
1250 NonAlphas = 0;
1251 Remain = Ap->eA - Ap->sA + 1;
1252 Total = Remain;
1253
1254 //-- For all delta's in this alignment
1255 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
1256 {
1257 Delta = *Dp;
1258 Sign = Delta > 0 ? 1 : -1;
1259 Delta = labs ( Delta );
1260
1261 //-- For all the bases before the next indel
1262 for ( i = 1; i < Delta; i ++ )
1263 {
1264 ch1 = A [Apos ++];
1265 ch2 = B [Bpos ++];
1266
1267 if ( !isalpha (ch1) )
1268 {
1269 ch1 = STOP_CHAR;
1270 NonAlphas ++;
1271 }
1272 if ( !isalpha (ch2) )
1273 {
1274 ch2 = STOP_CHAR;
1275 NonAlphas ++;
1276 }
1277
1278 ch1 = toupper(ch1);
1279 ch2 = toupper(ch2);
1280 if ( 1 > MATCH_SCORE
1281 [getMatrixType( )]
1282 [ch1 - 'A']
1283 [ch2 - 'A'] )
1284 SimErrors ++;
1285 if ( ch1 != ch2 )
1286 Errors ++;
1287 }
1288
1289 //-- Process the current indel
1290 Remain -= i - 1;
1291 Errors ++;
1292 SimErrors ++;
1293
1294 if ( Sign == 1 )
1295 {
1296 Apos ++;
1297 Remain --;
1298 }
1299 else
1300 {
1301 Bpos ++;
1302 Total ++;
1303 }
1304 }
1305
1306 //-- For all the bases after the final indel
1307 for ( i = 0; i < Remain; i ++ )
1308 {
1309 //-- Score character match and update error counters
1310 ch1 = A [Apos ++];
1311 ch2 = B [Bpos ++];
1312
1313 if ( !isalpha (ch1) )
1314 {
1315 ch1 = STOP_CHAR;
1316 NonAlphas ++;
1317 }
1318 if ( !isalpha (ch2) )
1319 {
1320 ch2 = STOP_CHAR;
1321 NonAlphas ++;
1322 }
1323
1324 ch1 = toupper(ch1);
1325 ch2 = toupper(ch2);
1326 if ( 1 > MATCH_SCORE
1327 [getMatrixType( )]
1328 [ch1 - 'A']
1329 [ch2 - 'A'] )
1330 SimErrors ++;
1331 if ( ch1 != ch2 )
1332 Errors ++;
1333 }
1334
1335 Ap->Errors = Errors;
1336 Ap->SimErrors = SimErrors;
1337 Ap->NonAlphas = NonAlphas;
1338 }
1339
1340 if ( Brev != NULL )
1341 free ( Brev );
1342
1343 return;
1344 }
1345
1346
1347
1348
1349 void processSyntenys
1350 (vector<Synteny> & Syntenys, FastaRecord * Af, long int As,
1351 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile)
1352
1353 // For each syntenic region with clusters, read in the B sequence and
1354 // extend the clusters to expand total alignment coverage. Only should
1355 // be called once all the clusters for the contained syntenic regions have
1356 // been stored in the data structure. Frees the memory used by the
1357 // the syntenic regions once the output of extendClusters and
1358 // flushSyntenys has been produced.
1359
1360 {
1361 FastaRecord Bf; // the current B sequence
1362 Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1));
1363
1364 long int InitSize = INIT_SIZE; // the initial memory capacity of B
1365 vector<Synteny>::iterator CurrSp; // current synteny pointer
1366
1367 //-- Initialize the B sequence storage
1368 Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
1369
1370 //-- For all the contained syntenys
1371 for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ )
1372 {
1373 //-- If no clusters, ignore
1374 if ( CurrSp->clusters.empty( ) )
1375 continue;
1376
1377 //-- If a B sequence not seen yet, read it in
1378 //-- IMPORTANT: The B sequences in the synteny object are assumed to be
1379 // ordered as output by mgaps, if they are not in order the program
1380 // will fail. (All like tags must be adjacent and in the same order
1381 // as the query file)
1382 if ( CurrSp == Syntenys.begin( ) ||
1383 strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 )
1384 {
1385 //-- Read in the B sequence
1386 while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) )
1387 if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 )
1388 break;
1389 if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 )
1390 parseAbort ( "Query File" );
1391 Bf.len = strlen (Bf.seq + 1);
1392 }
1393
1394 //-- Extend clusters and create the alignment information
1395 CurrSp->Bf.len = Bf.len;
1396 extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile);
1397 }
1398
1399 //-- Create the cluster information
1400 flushSyntenys (Syntenys, ClusterFile);
1401
1402 free (Bf.Id);
1403 free (Bf.seq);
1404
1405 return;
1406 }
1407
1408
1409
1410
1411 inline long int revC
1412 (long int Coord, long int Len)
1413
1414 // Reverse complement the given coordinate for the given length.
1415
1416 {
1417 assert (Len - Coord + 1 > 0);
1418 return (Len - Coord + 1);
1419 }
1420
1421
1422
1423
1424 void printHelp
1425 (const char * s)
1426
1427 // Display the program's help information to stderr.
1428
1429 {
1430 fprintf (stderr,
1431 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
1432 fprintf (stderr,
1433 "-b int set the alignment break (give-up) length to int\n"
1434 "-B int set the diagonal banding for extension to int\n"
1435 "-d output only match clusters rather than extended alignments\n"
1436 "-e do not extend alignments outward from clusters\n"
1437 "-h display help information\n"
1438 "-s don't remove shadowed alignments, useful for aligning a\n"
1439 " sequence to itself to identify repeats\n"
1440 "-t force alignment to ends of sequence if within -b distance\n\n");
1441 fprintf (stderr,
1442 " Input is the output of the \"mgaps\" program from stdin, and\n"
1443 "the two original NUCmer sequence files passed on the command\n"
1444 "line. <pfx> is the prefix to be added to the front of the\n"
1445 "output file <pfx>.delta\n"
1446 " <pfx>.delta is the alignment object that catalogs the distance\n"
1447 "between insertions and deletions. For further information\n"
1448 "regarding this file, please refer to the documentation under\n"
1449 "the .delta output description.\n\n");
1450 return;
1451 }
1452
1453
1454
1455
1456 void printUsage
1457 (const char * s)
1458
1459 // Display the program's usage information to stderr.
1460
1461 {
1462 fprintf (stderr,
1463 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
1464 fprintf (stderr, "Try '%s -h' for more information.\n", s);
1465 return;
1466 }
1467
1468
1469
1470
1471 void validateData
1472 (vector<Alignment> Alignments, vector<Cluster> Clusters,
1473 const FastaRecord * Af, const FastaRecord * Bf)
1474
1475 // Self test function to check that the cluster and alignment information
1476 // is valid
1477
1478 {
1479 char * A, * B, * Brev = NULL;
1480 vector<Cluster>::iterator Cp;
1481 vector<Match>::iterator Mp;
1482 vector<Alignment>::iterator Ap;
1483 long int x, y, i;
1484 char Xc, Yc;
1485
1486 A = Af->seq;
1487
1488 for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ )
1489 {
1490 assert ( Cp->wasFused );
1491
1492 //-- Pick the right directional sequence for B
1493 if ( Cp->dirB == FORWARD_CHAR )
1494 B = Bf->seq;
1495 else if ( Brev != NULL )
1496 B = Brev;
1497 else
1498 {
1499 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
1500 strcpy ( Brev + 1, Bf->seq + 1 );
1501 Brev[0] = '\0';
1502 Reverse_Complement (Brev, 1, Bf->len);
1503 B = Brev;
1504 }
1505
1506 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
1507 {
1508 //-- assert for each match in cluster, it is indeed a match
1509 x = Mp->sA;
1510 y = Mp->sB;
1511 for ( i = 0; i < Mp->len; i ++ )
1512 assert ( A[x ++] == B[y ++] );
1513
1514 //-- assert for each match in cluster, it is contained in an alignment
1515 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
1516 {
1517 if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB &&
1518 Ap->eA >= Mp->sA + Mp->len - 1 &&
1519 Ap->eB >= Mp->sB + Mp->len - 1 )
1520 break;
1521 }
1522 assert ( Ap < Alignments.end( ) );
1523 }
1524 }
1525
1526 //-- assert alignments are optimal (quick check if first and last chars equal)
1527 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
1528 {
1529 if ( Ap->dirB == REVERSE_CHAR )
1530 {
1531 assert (Brev != NULL);
1532 B = Brev;
1533 }
1534 else
1535 B = Bf->seq;
1536
1537 assert ( Ap->sA <= Ap->eA );
1538 assert ( Ap->sB <= Ap->eB );
1539
1540 assert ( Ap->sA >= 1 && Ap->sA <= Af->len );
1541 assert ( Ap->eA >= 1 && Ap->eA <= Af->len );
1542 assert ( Ap->sB >= 1 && Ap->sB <= Bf->len );
1543 assert ( Ap->eB >= 1 && Ap->eB <= Bf->len );
1544
1545 Xc = toupper(isalpha(A[Ap->sA]) ? A[Ap->sA] : STOP_CHAR);
1546 Yc = toupper(isalpha(B[Ap->sB]) ? B[Ap->sB] : STOP_CHAR);
1547 assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] );
1548
1549 Xc = toupper(isalpha(A[Ap->eA]) ? A[Ap->eA] : STOP_CHAR);
1550 Yc = toupper(isalpha(B[Ap->eB]) ? B[Ap->eB] : STOP_CHAR);
1551 assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] );
1552 }
1553
1554 if ( Brev != NULL )
1555 free (Brev);
1556
1557 return;
1558 }