comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postpro.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 //------------------------------------------------------------------------------
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
3 // File: postpro.cc
4 // Date: 08 / 20 / 2002
5 //
6 // Purpose: To translate the coordinates referencing the concatenated
7 // reference sequences back to the individual sequences and deal
8 // with any conflict that may have arisen (i.e. a MUM that spans
9 // the boundry between two sequences). Then to extend each cluster
10 // via Smith-Waterman techniques to expand the alignment coverage.
11 // Alignments which encounter each other will be fused to form one
12 // encompasing alignment when appropriate.
13 //
14 // Input: Input is the output of the .mgaps program from stdin. On the
15 // command line, the file names of the two original sequence files
16 // should come first, followed by the prefix <pfx> that should be
17 // placed in front of the two output filenames <pfx>.cluster and
18 // <pfx>.delta
19 //
20 // NOTE: Cluster file is now suppressed by default (see -d option).
21 //
22 // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta.
23 // <pfx>.cluster lists MUM clusters as identified by "mgaps".
24 // However, the clusters now reference their corresponding
25 // sequence and are all listed under headers that specify this
26 // sequence. In addition, the coordinates now reference the original
27 // DNA input and not the amino acid translations. The <pfx>.delta file
28 // is the alignment object that contains all the information necessary
29 // to reproduce the alignments generated by the MUM extension process.
30 // The coordinates in this file reference the original DNA input, however
31 // the delta values represent amino acids, so 1 delta = 3 nucleotides.
32 // Please refer to the output file documentation for an in-depth
33 // description of these file formats.
34 //
35 // Usage: postpro <reference> <query> <pfx> < <input>
36 // Where <reference> and <query> are the original input sequences of
37 // PROmer and <pfx> is the prefix that should be added to the
38 // beginning of the <pfx>.cluster and <pfx>.delta output filenames.
39 //
40 //------------------------------------------------------------------------------
41
42 //-- NOTE: this option will significantly hamper program performance,
43 // mostly the alignment extension performance (sw_align.h)
44 //#define _DEBUG_ASSERT // self testing assert functions
45
46 #include "tigrinc.hh"
47 #include "sw_align.hh"
48 #include "translate.hh"
49 #include <vector>
50 #include <algorithm>
51 using namespace std;
52
53
54
55
56 //------------------------------------------------------ Globals -------------//
57 bool DO_DELTA = true;
58 bool DO_EXTEND = true;
59 bool TO_SEQEND = false;
60
61
62 //------------------------------------------------------ Type Definitions ----//
63 enum LineType
64 //-- The type of input line from <stdin>
65 {
66 NO_LINE, HEADER_LINE, MATCH_LINE
67 };
68
69
70 struct FastaRecord
71 //-- The essential data of a sequence
72 {
73 char * Id; // the fasta ID header tag
74 long int len; // the length of the sequence
75 char * seq; // the sequence data
76 };
77
78
79 struct Match
80 //-- An exact match between two sequences A and B
81 {
82 long int sA, sB, len; // start coordinate in A, in B and the length
83 };
84
85
86 struct Cluster
87 //-- An ordered list of matches between two sequences A and B
88 {
89 bool wasFused; // have the cluster matches been extended yet?
90 int frameA; // the reference sequence frame (1-6)
91 int frameB; // the query sequence frame
92 vector<Match> matches; // the ordered set of matches in the cluster
93 };
94
95
96 struct Synteny
97 //-- An ordered list of clusters between two sequences A and B
98 {
99 FastaRecord * AfP; // a pointer to the reference sequence record
100 FastaRecord Bf; // the query sequence record (w/o the sequence)
101 vector<Cluster> clusters; // the ordered set of clusters between A and B
102 };
103
104
105 struct Alignment
106 //-- An alignment object between two sequences A and B
107 {
108 int frameA; // the reference sequence frame (1-6)
109 int frameB; // the query sequence frame
110 long int sA, sB, eA, eB; // the start in A, B and the end in A, B
111 vector<long int> delta; // the delta values, with NO zero at the end
112 long int deltaApos; // sum of abs(deltas) - #of negative deltas
113 // trust me, it is a very helpful value
114 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
115 };
116
117
118 struct AscendingClusterSort
119 //-- For sorting clusters in ascending order of their sA coordinate
120 {
121 bool operator() (const Cluster & pA, const Cluster & pB)
122 {
123 return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA );
124 }
125 };
126
127
128
129
130
131 //------------------------------------------------- Function Declarations ----//
132 void addNewAlignment
133 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
134 vector<Match>::iterator Mp);
135
136 bool extendBackward
137 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
138 vector<Alignment>::iterator TargetAp, const char * A, const char * B);
139
140 bool extendForward
141 (vector<Alignment>::iterator Ap, const char * A, long int targetA,
142 const char * B, long int targetB, unsigned int m_o);
143
144 void extendClusters
145 (vector<Cluster> & Clusters,
146 const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile);
147
148 void flushAlignments
149 (vector<Alignment> & Alignments,
150 const FastaRecord * Af, const FastaRecord * Bf,
151 FILE * DeltaFile);
152
153 void flushSyntenys
154 (vector<Synteny> & Syntenys, FILE * ClusterFile);
155
156 vector<Cluster>::iterator getForwardTargetCluster
157 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
158 long int & targetA, long int & targetB);
159
160 vector<Alignment>::iterator getReverseTargetAlignment
161 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp);
162
163 bool isShadowedCluster
164 (vector<Cluster>::iterator CurrCp,
165 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap);
166
167 void parseAbort
168 (const char *);
169
170 void parseDelta
171 (vector<Alignment> & Alignments,
172 const FastaRecord * Af, const FastaRecord *Bf);
173
174 void processSyntenys
175 (vector<Synteny> & Syntenys,
176 FastaRecord * Af, long int As,
177 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile);
178
179 inline long int transC
180 (long int Coord, int Frame, long int Len);
181
182 inline long int refLen
183 (long int ntLen);
184
185 inline long int revC
186 (long int Coord, long int Len);
187
188 void printHelp
189 (const char *);
190
191 void printUsage
192 (const char *);
193
194 void validateData
195 (vector<Alignment> Alignments, vector<Cluster> Clusters,
196 const FastaRecord * Af, const FastaRecord * Bf);
197
198
199
200 //-------------------------------------------------- Function Definitions ----//
201 int main
202 (int argc, char ** argv)
203 {
204 FastaRecord * Af; // array of all the reference sequences
205
206 vector<Synteny> Syntenys; // vector of all sets of clusters
207 vector<Synteny>::reverse_iterator CurrSp; // current set of clusters
208 vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer
209
210 Synteny Asyn; // a single set of clusters
211 Cluster Aclu; // a single cluster of matches
212 Match Amat; // a single match
213
214 LineType PrevLine; // the current input line
215
216 bool Found; // temporary search flag
217
218 char Line [MAX_LINE]; // a single line of input
219 char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers
220 char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names
221 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
222
223 int FrameA, FrameB; // reading frames (1-6)
224 int CurrFrameA, CurrFrameB;
225
226 long int i; // temporary counter
227 long int Seqi; // current reference sequence index
228 long int InitSize; // initial sequence memory space
229 long int As = 0; // the size of the reference array
230 long int Ac = 100; // the capacity of the reference array
231 long int sA, sB, len; // current match start in A, B and length
232
233 FILE * RefFile, * QryFile; // reference and query input files
234 FILE * ClusterFile, * DeltaFile; // cluster and delta output files
235
236 //-- Set the alignment data type and break length (sw_align.h)
237 setMatrixType ( BLOSUM62 );
238 setBreakLen ( 60 );
239
240 //-- Parse the command line arguments
241 {
242 optarg = NULL;
243 int ch, errflg = 0;
244 while ( !errflg && ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) )
245 switch (ch)
246 {
247 case 'b' :
248 setBreakLen (atoi (optarg));
249 break;
250
251 case 'd' :
252 DO_DELTA = false;
253 break;
254
255 case 'e' :
256 DO_EXTEND = false;
257 break;
258
259 case 'h' :
260 printHelp (argv[0]);
261 exit (EXIT_SUCCESS);
262 break;
263
264 case 't' :
265 TO_SEQEND = true;
266 break;
267
268 case 'x' :
269 setMatrixType ( atoi (optarg) );
270 if ( getMatrixType( ) == NUCLEOTIDE )
271 {
272 fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n",
273 getMatrixType( ));
274 setMatrixType ( BLOSUM62 );
275 }
276 break;
277
278 default :
279 errflg ++;
280 }
281
282 if ( errflg > 0 || argc - optind != 3 )
283 {
284 printUsage (argv[0]);
285 exit (EXIT_FAILURE);
286 }
287 }
288
289 //-- Read and create the I/O file names
290 strcpy (RefFileName, argv[optind ++]);
291 strcpy (QryFileName, argv[optind ++]);
292 strcpy (ClusterFileName, argv[optind ++]);
293 strcpy (DeltaFileName, ClusterFileName);
294 strcat (ClusterFileName, ".cluster");
295 strcat (DeltaFileName, ".delta");
296
297 //-- Open all the files
298 RefFile = File_Open (RefFileName, "r");
299 QryFile = File_Open (QryFileName, "r");
300 if ( DO_DELTA )
301 {
302 ClusterFile = NULL;
303 DeltaFile = File_Open (DeltaFileName, "w");
304 }
305 else
306 {
307 ClusterFile = File_Open (ClusterFileName, "w");
308 DeltaFile = NULL;
309 }
310
311 //-- Print the headers of the output files
312 if ( DO_DELTA )
313 fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
314 else
315 fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
316
317 //-- Generate the array of the reference sequences
318 InitSize = INIT_SIZE;
319 Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac );
320 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
321 while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) )
322 {
323 Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1));
324 strcpy (Af[As].Id, IdA);
325
326 Af[As].len = strlen (Af[As].seq + 1);
327
328 if ( ++ As >= Ac )
329 {
330 Ac *= 2;
331 Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac );
332 }
333
334 InitSize = INIT_SIZE;
335 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
336 }
337 fclose (RefFile);
338 if ( As <= 0 )
339 parseAbort ( RefFileName );
340
341
342
343 //-- Process the input from <stdin> line by line
344 PrevLine = NO_LINE;
345 IdA[0] = '\0';
346 IdB[0] = '\0';
347 FrameA = 0;
348 FrameB = 0;
349 CurrFrameA = -1;
350 CurrFrameB = -1;
351 CurrIdB[0] = '\0';
352 while ( fgets(Line, MAX_LINE, stdin) != NULL )
353 {
354
355 //-- If the current line is a fasta HEADER_LINE
356 if ( Line[0] == '>' )
357 {
358 if ( sscanf (Line + 1, "%s", CurrIdB) != 1 )
359 parseAbort ("stdin");
360
361 sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB);
362 CurrIdB [strlen(CurrIdB) - 2] = '\0';
363
364 PrevLine = HEADER_LINE;
365 }
366
367
368 //-- If the current line is a cluster HEADER_LINE
369 else if ( Line[0] == '#' )
370 {
371 PrevLine = HEADER_LINE;
372 }
373
374
375 //-- If the current line is a MATCH_LINE
376 else
377 {
378 if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 )
379 parseAbort ("stdin");
380
381 //-- Re-map the reference coordinate back to its original sequence
382 // NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length
383 // including the appended 'X' on each frame translation
384 for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ )
385 sA -= refLen (Af[Seqi].len);
386
387 assert ( Seqi < As );
388
389 //-- Get the correct frame, translate startA coordinate to frame
390 for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ )
391 sA -= (Af[Seqi].len - (i % 3)) / 3 + 1;
392 CurrFrameA = i + 1;
393 assert ( CurrFrameA > 0 && CurrFrameA < 7 );
394
395 //-- If the match spans across a frame boundry
396 if ( CurrFrameA < 1 || CurrFrameA > 6 ||
397 sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 ||
398 sA <= 0 )
399 {
400 fprintf (stderr,
401 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
402 " Reference sequence '>%s', frame %d\n"
403 "Please file a bug report\n"
404 "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA);
405 continue;
406 }
407
408 //-- If the match spans across a sequence boundry
409 if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 )
410 {
411 fprintf (stderr,
412 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
413 " Reference sequence '>%s'\n"
414 "Please file a bug report\n"
415 "Attempting to continue.\n", Af[Seqi].Id);
416 continue;
417 }
418
419 //-- Check and update the current synteny region
420 if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 ||
421 CurrFrameA != FrameA || CurrFrameB != FrameB )
422 {
423 Found = false;
424 if ( strcmp (IdB, CurrIdB) == 0 )
425 {
426 //-- Has this header been seen before?
427 for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ )
428 {
429 if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 )
430 {
431 assert ( Sp->AfP->len == Af[Seqi].len );
432 assert ( strcmp (Sp->Bf.Id, IdB) == 0 );
433 CurrSp = Sp;
434 Found = true;
435 break;
436 }
437 }
438 }
439 else
440 {
441 //-- New B sequence header, process all the old synteny's
442 processSyntenys (Syntenys, Af, As,
443 QryFile, ClusterFile, DeltaFile);
444
445 }
446
447 strcpy (IdA, Af[Seqi].Id);
448 strcpy (IdB, CurrIdB);
449 FrameA = CurrFrameA;
450 FrameB = CurrFrameB;
451
452 //-- If not seen yet, create a new synteny region
453 if ( ! Found )
454 {
455 Asyn.AfP = Af + Seqi;
456 Asyn.Bf.len = -1;
457 Asyn.Bf.Id = (char *) Safe_malloc
458 (sizeof(char) * (strlen(IdB) + 1));
459 strcpy (Asyn.Bf.Id, IdB);
460
461 Syntenys.push_back (Asyn);
462 CurrSp = Syntenys.rbegin( );
463 }
464
465 //-- Add a new cluster to the current synteny
466 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
467 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
468 CurrSp->clusters.pop_back( ); // hack to remove empties
469 Aclu.frameA = FrameA;
470 Aclu.frameB = FrameB;
471 Aclu.wasFused = false;
472 CurrSp->clusters.push_back (Aclu);
473 }
474 else if ( PrevLine == HEADER_LINE )
475 {
476 //-- Add a new cluster to the current synteny
477 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
478 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
479 CurrSp->clusters.pop_back( );
480 Aclu.frameA = FrameA;
481 Aclu.frameB = FrameB;
482 Aclu.wasFused = false;
483 CurrSp->clusters.push_back (Aclu);
484 }
485
486 //-- Add a new match to the current cluster
487 // NOTE: A and B coordinates still reference the appropriate
488 // amino acid sequence frame, not the DNA (same with len)
489 if ( len > 1 )
490 {
491 Amat.sA = sA;
492 Amat.sB = sB;
493 Amat.len = len;
494 CurrSp->clusters.rbegin( )->matches.push_back (Amat);
495 }
496
497 PrevLine = MATCH_LINE;
498 }
499 }
500
501 //-- Process the left-over syntenys
502 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
503 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
504 CurrSp->clusters.pop_back( );
505 processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile);
506 fclose (QryFile);
507
508 //-- Free the reference sequences
509 for ( i = 0; i < As; i ++ )
510 {
511 free (Af[i].Id);
512 free (Af[i].seq);
513 }
514 free (Af);
515
516 return EXIT_SUCCESS;
517 }
518
519
520
521
522 void addNewAlignment
523 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
524 vector<Match>::iterator Mp)
525
526 // Create and add a new alignment object based on the current match
527 // and cluster information pointed to by Cp and Mp.
528
529 {
530 Alignment Align;
531
532 //-- Initialize the data
533 Align.sA = Mp->sA;
534 Align.sB = Mp->sB;
535 Align.eA = Mp->sA + Mp->len - 1;
536 Align.eB = Mp->sB + Mp->len - 1;
537 Align.frameA = Cp->frameA;
538 Align.frameB = Cp->frameB;
539 Align.delta.clear( );
540 Align.deltaApos = 0;
541
542 //-- Add the alignment object
543 Alignments.push_back (Align);
544
545 return;
546 }
547
548
549
550
551 bool extendBackward
552 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
553 vector<Alignment>::iterator TargetAp, const char * A, const char * B)
554
555 // Extend an alignment backwards off of the current alignment object.
556 // The current alignment object must be freshly created and consist
557 // only of an exact match (i.e. the delta vector MUST be empty).
558 // If the TargetAp alignment object is reached by the extension, it will
559 // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is
560 // NULL the function will extend as far as possible. It is a strange
561 // and dangerous function because it can delete CurrAp, so edit with
562 // caution. Returns true if TargetAp was reached and merged, else false
563 // Designed only as a subroutine for extendClusters, should be used
564 // nowhere else.
565
566 {
567 bool target_reached = false;
568 bool overflow_flag = false;
569 bool double_flag = false;
570
571 vector<long int>::iterator Dp;
572
573 unsigned int m_o;
574 long int targetA, targetB;
575
576 m_o = BACKWARD_SEARCH;
577
578 //-- Set the target coordinates
579 if ( TargetAp != Alignments.end( ) )
580 {
581 targetA = TargetAp->eA;
582 targetB = TargetAp->eB;
583 }
584 else
585 {
586 targetA = 1;
587 targetB = 1;
588 m_o |= OPTIMAL_BIT;
589 }
590
591 //-- If alignment is too long, bring with bounds and set overflow_flag true
592 if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH )
593 {
594 targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1;
595 overflow_flag = true;
596 m_o |= OPTIMAL_BIT;
597 }
598 if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH )
599 {
600 targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1;
601 if ( overflow_flag )
602 double_flag = true;
603 else
604 overflow_flag = true;
605 m_o |= OPTIMAL_BIT;
606 }
607
608
609 if ( TO_SEQEND && !double_flag )
610 m_o |= SEQEND_BIT;
611
612
613 //-- Attempt to reach the target
614 target_reached = alignSearch (A, CurrAp->sA, targetA,
615 B, CurrAp->sB, targetB, m_o);
616
617 if ( overflow_flag || TargetAp == Alignments.end( ) )
618 target_reached = false;
619
620 if ( target_reached )
621 {
622 //-- assert that CurrAp is new and at the end of the Alignment vector
623 assert ( CurrAp->delta.empty( ) );
624 assert ( CurrAp == Alignments.end( ) - 1 );
625
626 //-- Merge the two alignment objects
627 extendForward (TargetAp, A, CurrAp->sA,
628 B, CurrAp->sB, FORCED_FORWARD_ALIGN);
629 TargetAp->eA = CurrAp->eA;
630 TargetAp->eB = CurrAp->eB;
631 Alignments.pop_back( );
632 }
633 else
634 {
635 alignTarget (A, targetA, CurrAp->sA,
636 B, targetB, CurrAp->sB,
637 CurrAp->delta, FORCED_FORWARD_ALIGN);
638 CurrAp->sA = targetA;
639 CurrAp->sB = targetB;
640
641 //-- Update the deltaApos value for the alignment object
642 for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ )
643 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
644 }
645
646 return target_reached;
647 }
648
649
650
651
652 bool extendForward
653 (vector<Alignment>::iterator CurrAp, const char * A, long int targetA,
654 const char * B, long int targetB, unsigned int m_o)
655
656 // Extend an alignment forwards off the current alignment object until
657 // target or end of sequence is reached, and merge the delta values of the
658 // alignment object with the new delta values generated by the extension.
659 // Return true if the target was reached, else false
660
661 {
662 long int ValA;
663 long int ValB;
664 unsigned int Di;
665 bool target_reached;
666 bool overflow_flag = false;
667 bool double_flag = false;
668 vector<long int>::iterator Dp;
669
670 //-- Set Di to the end of the delta vector
671 Di = CurrAp->delta.size( );
672
673 //-- Calculate the distance between the start and end positions
674 ValA = targetA - CurrAp->eA + 1;
675 ValB = targetB - CurrAp->eB + 1;
676
677 //-- If the distance is too long, shrink it and set the overflow_flag
678 if ( ValA > MAX_ALIGNMENT_LENGTH )
679 {
680 targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1;
681 overflow_flag = true;
682 m_o |= OPTIMAL_BIT;
683 }
684 if ( ValB > MAX_ALIGNMENT_LENGTH )
685 {
686 targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1;
687 if ( overflow_flag )
688 double_flag = true;
689 else
690 overflow_flag = true;
691 m_o |= OPTIMAL_BIT;
692 }
693
694 if ( double_flag )
695 m_o &= ~SEQEND_BIT;
696
697 target_reached = alignTarget (A, CurrAp->eA, targetA,
698 B, CurrAp->eB, targetB,
699 CurrAp->delta, m_o);
700
701 //-- Notify user if alignment was chopped short
702 if ( target_reached && overflow_flag )
703 target_reached = false;
704
705 //-- Pick up delta where left off (Di) and merge with new deltas
706 if ( Di < CurrAp->delta.size( ) )
707 {
708 //-- Merge the deltas
709 ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1;
710 CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA);
711 if ( CurrAp->delta[Di] == 0 || ValA < 0 )
712 {
713 fprintf(stderr,
714 "ERROR: failed to merge alignments at position %ld\n"
715 " Please file a bug report\n",
716 CurrAp->eA);
717 exit (EXIT_FAILURE);
718 }
719
720 //-- Update the deltaApos
721 for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ )
722 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
723 }
724
725 //-- Set the alignment coordinates
726 CurrAp->eA = targetA;
727 CurrAp->eB = targetB;
728
729 return target_reached;
730 }
731
732
733
734
735 void extendClusters
736 (vector<Cluster> & Clusters,
737 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile)
738
739 // Connect all the matches in every cluster between sequences A and B.
740 // Also, extend alignments off of the front and back of each cluster to
741 // expand total alignment coverage. When these extensions encounter an
742 // adjacent cluster (in a consistent frame), fuse the two regions to
743 // create one single encompassing region. This routine will create
744 // alignment objects from these extensions and output the resulting
745 // delta information to the delta output file. Also, all coordintes
746 // for the clusters and the alignment regions are translated to reference
747 // the original DNA sequecne.
748
749 {
750 //-- Sort the clusters (ascending) by their start coordinate in sequence A
751 sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( ));
752
753 //-- If no delta file is requested
754 if ( ! DO_DELTA )
755 return;
756
757
758 bool target_reached = false; // reached the adjacent match or cluster
759 // the amino acid sequences for A and B
760 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
761 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
762
763 long int Alen [7], Blen [7]; // the length of the amino acid sequences
764 int i;
765
766 unsigned int m_o;
767 long int targetA, targetB; // alignment extension targets in A and B
768
769 vector<Match>::iterator Mp; // match pointer
770
771 vector<Cluster>::iterator PrevCp; // where the extensions last left off
772 vector<Cluster>::iterator CurrCp; // the current cluster being extended
773 vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster
774
775 vector<Alignment> Alignments; // the vector of alignment objects
776 vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align
777 vector<Alignment>::iterator TargetAp; // target align
778
779 //-- Calculate the length of each amino acid frame translation
780 for ( i = 0; i < 6; i ++ )
781 {
782 Alen [i + 1] = (Af->len - (i % 3)) / 3;
783 Blen [i + 1] = (Bf->len - (i % 3)) / 3;
784 }
785
786 //-- Extend each cluster
787 PrevCp = Clusters.begin( );
788 CurrCp = Clusters.begin( );
789 while ( CurrCp < Clusters.end( ) )
790 {
791 if ( DO_EXTEND )
792 {
793 //-- Ignore if shadowed or already extended
794 if ( ! target_reached )
795 if ( CurrCp->wasFused ||
796 isShadowedCluster (CurrCp, Alignments, CurrAp) )
797 {
798 CurrCp->wasFused = true;
799 CurrCp = ++ PrevCp;
800 continue;
801 }
802 }
803
804 //-- Initialize the right amino acid sequence for A and B if need be
805 if ( A [CurrCp->frameA] == NULL )
806 {
807 A [CurrCp->frameA] = (char *) Safe_malloc
808 ( sizeof(char) * ( (Af->len / 3) + 2) );
809 A [CurrCp->frameA] [0] = '\0';
810 Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA],
811 CurrCp->frameA );
812 }
813 if ( B [CurrCp->frameB] == NULL )
814 {
815 B [CurrCp->frameB] = (char *) Safe_malloc
816 ( sizeof(char) * ( (Bf->len / 3) + 2) );
817 B [CurrCp->frameB] [0] = '\0';
818 Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB],
819 CurrCp->frameB );
820 }
821
822 //-- Extend each match in the cluster
823 for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ )
824 {
825 //-- Try to extend the current match backwards
826 if ( target_reached )
827 {
828 //-- Merge with the previous match
829 if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB )
830 {
831 //-- Strip matches until the targeted match is found
832 assert (Mp < CurrCp->matches.end( ) - 1);
833 continue;
834 }
835
836 CurrAp->eA += Mp->len - 1;
837 CurrAp->eB += Mp->len - 1;
838 }
839 else
840 {
841 //-- Create a new alignment object
842 addNewAlignment (Alignments, CurrCp, Mp);
843 CurrAp = Alignments.end( ) - 1;
844
845 if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) )
846 {
847 //-- Target the closest/best alignment object
848 TargetAp = getReverseTargetAlignment (Alignments, CurrAp);
849
850 //-- Extend the new alignment object backwards
851 if ( extendBackward (Alignments, CurrAp, TargetAp,
852 A [CurrCp->frameA], B [CurrCp->frameB]) )
853 CurrAp = TargetAp;
854 }
855 }
856
857 m_o = FORWARD_ALIGN;
858
859 //-- Try to extend the current match forwards
860 if ( Mp < CurrCp->matches.end( ) - 1 )
861 {
862 //-- Target the next match in the cluster
863 targetA = (Mp + 1)->sA;
864 targetB = (Mp + 1)->sB;
865
866 //-- Extend the current alignment object forward
867 target_reached = extendForward (CurrAp,
868 A [CurrCp->frameA], targetA,
869 B [CurrCp->frameB], targetB, m_o);
870 }
871 else if ( DO_EXTEND )
872 {
873 targetA = Alen [CurrCp->frameA];
874 targetB = Blen [CurrCp->frameB];
875
876 //-- Target the closest/best match in a future cluster
877 TargetCp = getForwardTargetCluster (Clusters, CurrCp,
878 targetA, targetB);
879
880 if ( TargetCp == Clusters.end( ) )
881 {
882 m_o |= OPTIMAL_BIT;
883 if ( TO_SEQEND )
884 m_o |= SEQEND_BIT;
885 }
886
887 //-- Extend the current alignment object forward
888 target_reached = extendForward (CurrAp,
889 A [CurrCp->frameA], targetA,
890 B [CurrCp->frameB], targetB, m_o);
891 }
892 }
893
894 if ( TargetCp == Clusters.end( ) )
895 target_reached = false;
896
897 CurrCp->wasFused = true;
898
899 if ( target_reached == false )
900 CurrCp = ++ PrevCp;
901 else
902 CurrCp = TargetCp;
903 }
904
905 #ifdef _DEBUG_ASSERT
906 validateData (Alignments, Clusters, Af, Bf);
907 #endif
908
909 //-- Output the alignment data to the delta file
910 flushAlignments (Alignments, Af, Bf, DeltaFile);
911
912 for ( i = 0; i < 7; i ++ )
913 {
914 if ( A [i] != NULL )
915 free ( A [i] );
916 if ( B [i] != NULL )
917 free ( B [i] );
918 }
919
920 return;
921 }
922
923
924
925
926 void flushAlignments
927 (vector<Alignment> & Alignments,
928 const FastaRecord * Af, const FastaRecord * Bf,
929 FILE * DeltaFile)
930
931 // Simply output the delta information stored in Alignments to the
932 // given delta file. Free the memory used by Alignments once the
933 // data is successfully output to the file.
934
935 {
936 vector<Alignment>::iterator Ap; // alignment pointer
937 vector<long int>::iterator Dp; // delta pointer
938
939 fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len);
940
941 //-- Generate the error counts
942 parseDelta (Alignments, Af, Bf);
943
944 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
945 {
946 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
947 transC (Ap->sA, Ap->frameA, Af->len),
948 transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2),
949 transC (Ap->sB, Ap->frameB, Bf->len),
950 transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2),
951 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
952
953 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
954 fprintf (DeltaFile, "%ld\n", *Dp);
955 fprintf (DeltaFile, "0\n");
956
957 Ap->delta.clear( );
958 }
959
960 Alignments.clear( );
961
962 return;
963 }
964
965
966
967
968 void flushSyntenys
969 (vector<Synteny> & Syntenys, FILE * ClusterFile)
970
971 // Simply output the synteny/cluster information generated by the mgaps
972 // program. However, now the coordinates reference their appropriate
973 // reference sequence, and the reference sequecne header is added to
974 // the appropriate lines. Free the memory used by Syntenys once the
975 // data is successfully output to the file.
976
977 {
978 vector<Synteny>::iterator Sp; // synteny pointer
979 vector<Cluster>::iterator Cp; // cluster pointer
980 vector<Match>::iterator Mp; // match pointer
981
982 if ( ClusterFile )
983 {
984 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
985 {
986 fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id,
987 Sp->AfP->len, Sp->Bf.len);
988 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
989 {
990 fprintf (ClusterFile, "%2d %2d\n",
991 Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA,
992 Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB);
993 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
994 {
995 fprintf (ClusterFile, "%8ld %8ld %6ld",
996 transC (Mp->sA, Cp->frameA, Sp->AfP->len),
997 transC (Mp->sB, Cp->frameB, Sp->Bf.len),
998 Mp->len * 3);
999
1000 if ( Mp != Cp->matches.begin( ) )
1001 fprintf (ClusterFile, "%6ld %6ld\n",
1002 (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3,
1003 (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3);
1004 else
1005 fprintf (ClusterFile, "%6s %6s\n", "-", "-");
1006 }
1007 Cp->matches.clear( );
1008 }
1009 Sp->clusters.clear( );
1010 }
1011 }
1012 else
1013 {
1014 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
1015 {
1016 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
1017 {
1018 Cp->matches.clear( );
1019 }
1020 Sp->clusters.clear( );
1021 }
1022 }
1023
1024 Syntenys.clear( );
1025
1026 return;
1027 }
1028
1029
1030
1031
1032
1033 vector<Cluster>::iterator getForwardTargetCluster
1034 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
1035 long int & targetA, long int & targetB)
1036
1037 // Return the cluster that is most likely to successfully join (in a
1038 // forward direction) with the current cluster. The returned cluster
1039 // must contain 1 or more matches that are strictly greater than the end
1040 // of the current cluster. The targeted cluster must also be on a
1041 // diagonal close enough to the current cluster, so that a connection
1042 // could possibly be made by the alignment extender. Also, this targeted
1043 // cluster must be consistent in frame with the current cluster. Assumes
1044 // clusters have been sorted via AscendingClusterSort. Returns targeted
1045 // cluster and stores the target coordinates in targetA and targetB. If no
1046 // suitable cluster was found, the function will return NULL and target
1047 // A and targetB will remain unchanged.
1048
1049 {
1050 vector<Match>::iterator Mip; // match iteratrive pointer
1051 vector<Cluster>::iterator Cp; // cluster pointer
1052 vector<Cluster>::iterator Cip; // cluster iterative pointer
1053 long int eA, eB; // possible target
1054 long int greater, lesser; // gap sizes between two clusters
1055 long int sA = CurrCp->matches.rbegin( )->sA +
1056 CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster
1057 long int sB = CurrCp->matches.rbegin( )->sB +
1058 CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster
1059
1060 //-- End of sequences is the default target, set distance accordingly
1061 long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB);
1062
1063 //-- For all clusters greater than the current cluster (on sequence A)
1064 Cp = Clusters.end( );
1065 for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ )
1066 {
1067 //-- If the cluster is in the same frame
1068 if ( CurrCp->frameA == Cip->frameA && CurrCp->frameB == Cip->frameB )
1069 {
1070 eA = Cip->matches.begin( )->sA;
1071 eB = Cip->matches.begin( )->sB;
1072
1073 //-- If the cluster overlaps the current cluster, strip some matches
1074 if ( ( eA < sA || eB < sB ) &&
1075 Cip->matches.rbegin( )->sA >= sA &&
1076 Cip->matches.rbegin( )->sB >= sB )
1077 {
1078 for ( Mip = Cip->matches.begin( );
1079 Mip < Cip->matches.end( ) && ( eA < sA || eB < sB );
1080 Mip ++ )
1081 {
1082 eA = Mip->sA;
1083 eB = Mip->sB;
1084 }
1085 }
1086
1087 //-- If the cluster is strictly greater than current cluster
1088 if ( eA >= sA && eB >= sB )
1089 {
1090 if ( eA - sA > eB - sB )
1091 {
1092 greater = eA - sA;
1093 lesser = eB - sB;
1094 }
1095 else
1096 {
1097 lesser = eA - sA;
1098 greater = eB - sB;
1099 }
1100
1101 //-- If the cluster is close enough
1102 if ( greater <= getBreakLen( ) ||
1103 (lesser) * GOOD_SCORE [getMatrixType( )] +
1104 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
1105 {
1106 Cp = Cip;
1107 targetA = eA;
1108 targetB = eB;
1109 break;
1110 }
1111 else if ( (greater << 1) - lesser < dist )
1112 {
1113 Cp = Cip;
1114 targetA = eA;
1115 targetB = eB;
1116 dist = (greater << 1) - lesser;
1117 }
1118 }
1119 }
1120 }
1121
1122
1123 return Cp;
1124 }
1125
1126
1127
1128
1129
1130 vector<Alignment>::iterator getReverseTargetAlignment
1131 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp)
1132
1133 // Return the alignment that is most likely to successfully join (in a
1134 // reverse direction) with the current alignment. The returned alignment
1135 // must be strictly less than the current cluster and be on a diagonal
1136 // close enough to the current alignment, so that a connection
1137 // could possibly be made by the alignment extender. Also, this targeted
1138 // alignment must be consistent in frame with the current alignment.
1139 // Assumes clusters have been sorted via AscendingClusterSort and
1140 // processed in order, so therefore all alignments are in order by their
1141 // start A coordinate.
1142
1143 {
1144 vector<Alignment>::iterator Ap; // alignment pointer
1145 vector<Alignment>::iterator Aip; // alignment iterative pointer
1146 long int eA, eB; // possible targets
1147 long int greater, lesser; // gap sizes between the two alignments
1148 long int sA = CurrAp->sA; // the startA of the current alignment
1149 long int sB = CurrAp->sB; // the startB of the current alignment
1150
1151 //-- Beginning of sequences is the default target, set distance accordingly
1152 long int dist = (sA < sB ? sA : sB);
1153
1154 //-- For all alignments less than the current alignment (on sequence A)
1155 Ap = Alignments.end( );
1156 for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- )
1157 {
1158 //-- If the alignment is on the same direction
1159 if ( CurrAp->frameA == Aip->frameA && CurrAp->frameB == Aip->frameB )
1160 {
1161 eA = Aip->eA;
1162 eB = Aip->eB;
1163
1164 //-- If the alignment is strictly greater than current cluster
1165 if ( eA <= sA && eB <= sB )
1166 {
1167 if ( sA - eA > sB - eB )
1168 {
1169 greater = sA - eA;
1170 lesser = sB - eB;
1171 }
1172 else
1173 {
1174 lesser = sA - eA;
1175 greater = sB - eB;
1176 }
1177
1178 //-- If the cluster is close enough
1179 if ( greater <= getBreakLen( ) ||
1180 (lesser) * GOOD_SCORE [getMatrixType( )] +
1181 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
1182 {
1183 Ap = Aip;
1184 break;
1185 }
1186 else if ( (greater << 1) - lesser < dist )
1187 {
1188 Ap = Aip;
1189 dist = (greater << 1) - lesser;
1190 }
1191 }
1192 }
1193 }
1194
1195
1196 return Ap;
1197 }
1198
1199
1200
1201
1202
1203 bool isShadowedCluster
1204 (vector<Cluster>::iterator CurrCp,
1205 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap)
1206
1207 // Check if the current cluster is shadowed by a previously produced
1208 // alignment region. Return true if it is, else false.
1209
1210 {
1211 vector<Alignment>::iterator Aip; // alignment pointer
1212
1213 long int sA = CurrCp->matches.begin( )->sA;
1214 long int eA = CurrCp->matches.rbegin( )->sA +
1215 CurrCp->matches.rbegin( )->len - 1;
1216 long int sB = CurrCp->matches.begin( )->sB;
1217 long int eB = CurrCp->matches.rbegin( )->sB +
1218 CurrCp->matches.rbegin( )->len - 1;
1219
1220 if ( ! Alignments.empty( ) ) // if there are alignments to use
1221 {
1222 //-- Look backwards in hope of finding a shadowing alignment
1223 for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- )
1224 {
1225 //-- If in the same frames and shadowing the current cluster, break
1226 if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB )
1227 if ( Aip->eA >= eA && Aip->eB >= eB )
1228 if ( Aip->sA <= sA && Aip->sB <= sB )
1229 break;
1230 }
1231
1232 //-- Return true if the loop was broken, i.e. shadow found
1233 if ( Aip >= Alignments.begin( ) )
1234 return true;
1235 }
1236
1237 //-- Return false if Alignments was empty or loop was not broken
1238 return false;
1239 }
1240
1241
1242
1243
1244
1245
1246 void parseAbort
1247 (const char * s)
1248
1249 // Abort the program if there was an error in parsing file 's'
1250
1251 {
1252 fprintf (stderr,
1253 "ERROR: Could not parse input from '%s'. \n"
1254 "Please check the filename and format, or file a bug report\n", s);
1255 exit (EXIT_FAILURE);
1256 }
1257
1258
1259
1260
1261 void parseDelta
1262 (vector<Alignment> & Alignments,
1263 const FastaRecord * Af, const FastaRecord *Bf)
1264
1265 // Use the delta information to generate the error counts for each
1266 // alignment, and fill this information into the data type
1267
1268 {
1269 // pointers to Af.seq and Bf.seq
1270 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
1271 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
1272
1273 char ch1, ch2;
1274 long int Delta;
1275 int Sign;
1276 long int i, Apos, Bpos;
1277 long int Remain, Total;
1278 long int Errors, SimErrors;
1279 long int NonAlphas;
1280 vector<Alignment>::iterator Ap;
1281 vector<long int>::iterator Dp;
1282
1283 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
1284 {
1285 if ( A [Ap->frameA] == NULL )
1286 {
1287 A [Ap->frameA] = (char *) Safe_malloc
1288 ( sizeof(char) * ( (Af->len / 3) + 2 ) );
1289 A [Ap->frameA] [0] = '\0';
1290 Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA );
1291 }
1292 if ( B [Ap->frameB] == NULL )
1293 {
1294 B [Ap->frameB] = (char *) Safe_malloc
1295 ( sizeof(char) * ( (Bf->len / 3) + 2) );
1296 B [Ap->frameB] [0] = '\0';
1297 Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB );
1298 }
1299
1300 Apos = Ap->sA;
1301 Bpos = Ap->sB;
1302
1303 Errors = 0;
1304 SimErrors = 0;
1305 NonAlphas = 0;
1306 Remain = Ap->eA - Ap->sA + 1;
1307 Total = Remain;
1308
1309 //-- For all delta's in this alignment
1310 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
1311 {
1312 Delta = *Dp;
1313 Sign = Delta > 0 ? 1 : -1;
1314 Delta = labs ( Delta );
1315
1316 //-- For all the bases before the next indel
1317 for ( i = 1; i < Delta; i ++ )
1318 {
1319 ch1 = A [Ap->frameA] [Apos ++];
1320 ch2 = B [Ap->frameB] [Bpos ++];
1321
1322 if ( !isalpha (ch1) )
1323 {
1324 ch1 = STOP_CHAR;
1325 NonAlphas ++;
1326 }
1327 if ( !isalpha (ch2) )
1328 {
1329 ch2 = STOP_CHAR;
1330 NonAlphas ++;
1331 }
1332
1333 ch1 = toupper(ch1);
1334 ch2 = toupper(ch2);
1335 if ( 1 > MATCH_SCORE
1336 [getMatrixType( )]
1337 [ch1 - 'A']
1338 [ch2 - 'A'] )
1339 SimErrors ++;
1340 if ( ch1 != ch2 )
1341 Errors ++;
1342 }
1343
1344 //-- Process the current indel
1345 Remain -= i - 1;
1346 Errors ++;
1347 SimErrors ++;
1348
1349 if ( Sign == 1 )
1350 {
1351 if ( !isalpha (A [Ap->frameA] [Apos ++]) )
1352 NonAlphas ++;
1353 Remain --;
1354 }
1355 else
1356 {
1357 if ( !isalpha (B [Ap->frameB] [Bpos ++]) )
1358 NonAlphas ++;
1359 Total ++;
1360 }
1361 }
1362
1363 //-- For all the bases after the final indel
1364 for ( i = 0; i < Remain; i ++ )
1365 {
1366 //-- Score character match and update error counters
1367 ch1 = A [Ap->frameA] [Apos ++];
1368 ch2 = B [Ap->frameB] [Bpos ++];
1369
1370 if ( !isalpha (ch1) )
1371 {
1372 ch1 = STOP_CHAR;
1373 NonAlphas ++;
1374 }
1375 if ( !isalpha (ch2) )
1376 {
1377 ch2 = STOP_CHAR;
1378 NonAlphas ++;
1379 }
1380
1381 ch1 = toupper(ch1);
1382 ch2 = toupper(ch2);
1383 if ( 1 > MATCH_SCORE
1384 [getMatrixType( )]
1385 [ch1 - 'A']
1386 [ch2 - 'A'] )
1387 SimErrors ++;
1388 if ( ch1 != ch2 )
1389 Errors ++;
1390 }
1391
1392 Ap->Errors = Errors;
1393 Ap->SimErrors = SimErrors;
1394 Ap->NonAlphas = NonAlphas;
1395 }
1396
1397 for ( i = 1; i <= 6; i ++ )
1398 {
1399 if ( A [i] != NULL )
1400 free ( A [i] );
1401 A [i] = NULL;
1402
1403 if ( B [i] != NULL )
1404 free ( B [i] );
1405 B [i] = NULL;
1406 }
1407
1408 return;
1409 }
1410
1411
1412
1413
1414 void processSyntenys
1415 (vector<Synteny> & Syntenys, FastaRecord * Af, long int As,
1416 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile)
1417
1418 // For each syntenic region with clusters, read in the B sequence and
1419 // extend the clusters to expand total alignment coverage. Clusters should
1420 // still reference the amino acid concatenations, the translation back to
1421 // DNA will be taken care of by the extendClusters function.
1422 // processSyntenys should ONLY be called once all the clusters for
1423 // the contained syntenic regions have been stored in the data structure.
1424 // Frees the memory used by the the syntenic regions once the output of
1425 // extendClusters and flushSyntenys has been produced.
1426
1427 {
1428 FastaRecord Bf; // the current B sequence
1429 Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1));
1430
1431 long int InitSize = INIT_SIZE; // the initial memory capacity of B
1432 vector<Synteny>::iterator CurrSp; // current synteny pointer
1433
1434 //-- Initialize the B sequence storage
1435 Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
1436
1437 //-- For all the contained syntenys
1438 for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ )
1439 {
1440 //-- If no clusters, ignore
1441 if ( CurrSp->clusters.empty( ) )
1442 continue;
1443
1444 //-- If a B sequence not seen yet, read it in
1445 //-- IMPORTANT: The B sequences in the synteny object are assumed to be
1446 // ordered as output by mgaps, if they are not in order the program
1447 // will fail. (All like tags must be adjacent and in the same order
1448 // as the query file)
1449 if ( CurrSp == Syntenys.begin( ) ||
1450 strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 )
1451 {
1452 //-- Read in the B sequence
1453 while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) )
1454 if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 )
1455 break;
1456 if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 )
1457 {
1458 fprintf(stderr,"%s\n", CurrSp->Bf.Id);
1459 parseAbort ( "Query File" );
1460 }
1461 Bf.len = strlen (Bf.seq + 1);
1462 }
1463
1464 //-- Extend clusters and create the alignment information
1465 CurrSp->Bf.len = Bf.len;
1466 extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile);
1467 }
1468
1469 //-- Create the cluster information and clear the Synteny structure
1470 flushSyntenys (Syntenys, ClusterFile);
1471
1472 free (Bf.Id);
1473 free (Bf.seq);
1474
1475 return;
1476 }
1477
1478
1479
1480 inline long int transC
1481 (long int Coord, int Frame, long int Len)
1482
1483 // Translate an amino acid coordinate to a nucleotide coordinate
1484 // relative to the forward strand. The Len parameter should be the
1485 // length of the original DNA strand.
1486
1487 {
1488 assert ( Frame > 0 && Frame < 7 );
1489 if ( Frame > 3 )
1490 return revC ( (Coord * 3) - (3 - (Frame - 3)), Len );
1491 else
1492 return (Coord * 3) - (3 - (Frame));
1493 }
1494
1495
1496
1497
1498 inline long int refLen
1499 (long int ntLen)
1500
1501 // Return the length of the concatenated amino acid sequence frames,
1502 // including the appended 'X' at the end of each sequence frame for the
1503 // given DNA input length. (The returned length will be the length of
1504 // the concatenated frames for this sequence as output by prepro.cc)
1505
1506 {
1507 return (ntLen + 1) << 1;
1508 }
1509
1510
1511
1512
1513 inline long int revC
1514 (long int Coord, long int Len)
1515
1516 // Reverse complement the given coordinate for the given length.
1517
1518 {
1519 assert (Len - Coord + 1 > 0);
1520 return (Len - Coord + 1);
1521 }
1522
1523
1524
1525
1526 void printHelp
1527 (const char * s)
1528
1529 // Display the program's help information to stderr.
1530
1531 {
1532 fprintf (stderr,
1533 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
1534 fprintf (stderr,
1535 "-b int set the alignment break (give-up) length to int (amino acids)\n"
1536 "-d output only match clusters rather than extended alignments\n"
1537 "-e do not extend alignments outward from clusters\n"
1538 "-h display help information\n"
1539 "-t force alignment to ends of sequence if within -b distance\n"
1540 "-x type set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n"
1541 " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n");
1542 fprintf (stderr,
1543 " Input is the output of the \"mgaps\" program from stdin, and\n"
1544 "the two original PROmer sequence files passed on the command\n"
1545 "line. <pfx> is the prefix to be added to the front of the\n"
1546 "output file <pfx>.delta\n"
1547 " <pfx>.delta is the alignment object that catalogs the distance\n"
1548 "between insertions and deletions. For further information\n"
1549 "regarding this file, please refer to the documentation under\n"
1550 "the .delta output description.\n\n");
1551 return;
1552 }
1553
1554
1555
1556
1557 void printUsage
1558 (const char * s)
1559
1560 // Display the program's usage information to stderr.
1561
1562 {
1563 fprintf (stderr,
1564 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
1565 fprintf (stderr, "Try '%s -h' for more information.\n", s);
1566 return;
1567 }
1568
1569
1570
1571
1572 void validateData
1573 (vector<Alignment> Alignments, vector<Cluster> Clusters,
1574 const FastaRecord * Af, const FastaRecord * Bf)
1575
1576 // Self test function to check that the cluster and alignment information
1577 // is valid
1578
1579 {
1580 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
1581 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
1582
1583 long int Alen [7], Blen [7]; // the length of the amino acid sequences
1584 vector<Cluster>::iterator Cp;
1585 vector<Match>::iterator Mp;
1586 vector<Alignment>::iterator Ap;
1587 long int x, y, i;
1588 char Xc, Yc;
1589
1590 for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ )
1591 {
1592 assert ( Cp->wasFused );
1593
1594 //-- Initialize the right amino acid sequence for A and B if need be
1595 if ( A [Cp->frameA] == NULL )
1596 {
1597 A [Cp->frameA] = (char *) Safe_malloc
1598 ( sizeof(char) * ( (Af->len / 3) + 2) );
1599 A [Cp->frameA] [0] = '\0';
1600 Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA],
1601 Cp->frameA );
1602 }
1603 if ( B [Cp->frameB] == NULL )
1604 {
1605 B [Cp->frameB] = (char *) Safe_malloc
1606 ( sizeof(char) * ( (Bf->len / 3) + 2) );
1607 B [Cp->frameB] [0] = '\0';
1608 Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB],
1609 Cp->frameB );
1610 }
1611
1612 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
1613 {
1614 //-- assert for each match in cluster, it is indeed a match
1615 x = Mp->sA;
1616 y = Mp->sB;
1617 for ( i = 0; i < Mp->len; i ++ )
1618 assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] );
1619
1620 //-- assert for each match in cluster, it is contained in an alignment
1621 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
1622 {
1623 if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB &&
1624 Ap->eA >= Mp->sA + Mp->len - 1 &&
1625 Ap->eB >= Mp->sB + Mp->len - 1 )
1626 break;
1627 }
1628 assert ( Ap < Alignments.end( ) );
1629 }
1630 }
1631
1632 //-- assert alignments are optimal (quick check if first and last chars equal)
1633 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
1634 {
1635 assert ( Ap->sA <= Ap->eA );
1636 assert ( Ap->sB <= Ap->eB );
1637
1638 assert ( Ap->sA >= 1 && Ap->sA <= Af->len );
1639 assert ( Ap->eA >= 1 && Ap->eA <= Af->len );
1640 assert ( Ap->sB >= 1 && Ap->sB <= Bf->len );
1641 assert ( Ap->eB >= 1 && Ap->eB <= Bf->len );
1642
1643 Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ?
1644 A[Ap->frameA][Ap->sA] : STOP_CHAR);
1645 Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ?
1646 B[Ap->frameB][Ap->sB] : STOP_CHAR);
1647 assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
1648
1649 Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ?
1650 A[Ap->frameA][Ap->eA] : STOP_CHAR);
1651 Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ?
1652 B[Ap->frameB][Ap->eB] : STOP_CHAR);
1653 assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
1654 }
1655
1656 for ( i = 0; i < 7; i ++ )
1657 {
1658 if ( A [i] != NULL )
1659 free ( A [i] );
1660 if ( B [i] != NULL )
1661 free ( B [i] );
1662 }
1663
1664 return;
1665 }