Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postpro.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 //------------------------------------------------------------------------------ | |
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research | |
3 // File: postpro.cc | |
4 // Date: 08 / 20 / 2002 | |
5 // | |
6 // Purpose: To translate the coordinates referencing the concatenated | |
7 // reference sequences back to the individual sequences and deal | |
8 // with any conflict that may have arisen (i.e. a MUM that spans | |
9 // the boundry between two sequences). Then to extend each cluster | |
10 // via Smith-Waterman techniques to expand the alignment coverage. | |
11 // Alignments which encounter each other will be fused to form one | |
12 // encompasing alignment when appropriate. | |
13 // | |
14 // Input: Input is the output of the .mgaps program from stdin. On the | |
15 // command line, the file names of the two original sequence files | |
16 // should come first, followed by the prefix <pfx> that should be | |
17 // placed in front of the two output filenames <pfx>.cluster and | |
18 // <pfx>.delta | |
19 // | |
20 // NOTE: Cluster file is now suppressed by default (see -d option). | |
21 // | |
22 // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta. | |
23 // <pfx>.cluster lists MUM clusters as identified by "mgaps". | |
24 // However, the clusters now reference their corresponding | |
25 // sequence and are all listed under headers that specify this | |
26 // sequence. In addition, the coordinates now reference the original | |
27 // DNA input and not the amino acid translations. The <pfx>.delta file | |
28 // is the alignment object that contains all the information necessary | |
29 // to reproduce the alignments generated by the MUM extension process. | |
30 // The coordinates in this file reference the original DNA input, however | |
31 // the delta values represent amino acids, so 1 delta = 3 nucleotides. | |
32 // Please refer to the output file documentation for an in-depth | |
33 // description of these file formats. | |
34 // | |
35 // Usage: postpro <reference> <query> <pfx> < <input> | |
36 // Where <reference> and <query> are the original input sequences of | |
37 // PROmer and <pfx> is the prefix that should be added to the | |
38 // beginning of the <pfx>.cluster and <pfx>.delta output filenames. | |
39 // | |
40 //------------------------------------------------------------------------------ | |
41 | |
42 //-- NOTE: this option will significantly hamper program performance, | |
43 // mostly the alignment extension performance (sw_align.h) | |
44 //#define _DEBUG_ASSERT // self testing assert functions | |
45 | |
46 #include "tigrinc.hh" | |
47 #include "sw_align.hh" | |
48 #include "translate.hh" | |
49 #include <vector> | |
50 #include <algorithm> | |
51 using namespace std; | |
52 | |
53 | |
54 | |
55 | |
56 //------------------------------------------------------ Globals -------------// | |
57 bool DO_DELTA = true; | |
58 bool DO_EXTEND = true; | |
59 bool TO_SEQEND = false; | |
60 | |
61 | |
62 //------------------------------------------------------ Type Definitions ----// | |
63 enum LineType | |
64 //-- The type of input line from <stdin> | |
65 { | |
66 NO_LINE, HEADER_LINE, MATCH_LINE | |
67 }; | |
68 | |
69 | |
70 struct FastaRecord | |
71 //-- The essential data of a sequence | |
72 { | |
73 char * Id; // the fasta ID header tag | |
74 long int len; // the length of the sequence | |
75 char * seq; // the sequence data | |
76 }; | |
77 | |
78 | |
79 struct Match | |
80 //-- An exact match between two sequences A and B | |
81 { | |
82 long int sA, sB, len; // start coordinate in A, in B and the length | |
83 }; | |
84 | |
85 | |
86 struct Cluster | |
87 //-- An ordered list of matches between two sequences A and B | |
88 { | |
89 bool wasFused; // have the cluster matches been extended yet? | |
90 int frameA; // the reference sequence frame (1-6) | |
91 int frameB; // the query sequence frame | |
92 vector<Match> matches; // the ordered set of matches in the cluster | |
93 }; | |
94 | |
95 | |
96 struct Synteny | |
97 //-- An ordered list of clusters between two sequences A and B | |
98 { | |
99 FastaRecord * AfP; // a pointer to the reference sequence record | |
100 FastaRecord Bf; // the query sequence record (w/o the sequence) | |
101 vector<Cluster> clusters; // the ordered set of clusters between A and B | |
102 }; | |
103 | |
104 | |
105 struct Alignment | |
106 //-- An alignment object between two sequences A and B | |
107 { | |
108 int frameA; // the reference sequence frame (1-6) | |
109 int frameB; // the query sequence frame | |
110 long int sA, sB, eA, eB; // the start in A, B and the end in A, B | |
111 vector<long int> delta; // the delta values, with NO zero at the end | |
112 long int deltaApos; // sum of abs(deltas) - #of negative deltas | |
113 // trust me, it is a very helpful value | |
114 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas | |
115 }; | |
116 | |
117 | |
118 struct AscendingClusterSort | |
119 //-- For sorting clusters in ascending order of their sA coordinate | |
120 { | |
121 bool operator() (const Cluster & pA, const Cluster & pB) | |
122 { | |
123 return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA ); | |
124 } | |
125 }; | |
126 | |
127 | |
128 | |
129 | |
130 | |
131 //------------------------------------------------- Function Declarations ----// | |
132 void addNewAlignment | |
133 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, | |
134 vector<Match>::iterator Mp); | |
135 | |
136 bool extendBackward | |
137 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, | |
138 vector<Alignment>::iterator TargetAp, const char * A, const char * B); | |
139 | |
140 bool extendForward | |
141 (vector<Alignment>::iterator Ap, const char * A, long int targetA, | |
142 const char * B, long int targetB, unsigned int m_o); | |
143 | |
144 void extendClusters | |
145 (vector<Cluster> & Clusters, | |
146 const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile); | |
147 | |
148 void flushAlignments | |
149 (vector<Alignment> & Alignments, | |
150 const FastaRecord * Af, const FastaRecord * Bf, | |
151 FILE * DeltaFile); | |
152 | |
153 void flushSyntenys | |
154 (vector<Synteny> & Syntenys, FILE * ClusterFile); | |
155 | |
156 vector<Cluster>::iterator getForwardTargetCluster | |
157 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, | |
158 long int & targetA, long int & targetB); | |
159 | |
160 vector<Alignment>::iterator getReverseTargetAlignment | |
161 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp); | |
162 | |
163 bool isShadowedCluster | |
164 (vector<Cluster>::iterator CurrCp, | |
165 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap); | |
166 | |
167 void parseAbort | |
168 (const char *); | |
169 | |
170 void parseDelta | |
171 (vector<Alignment> & Alignments, | |
172 const FastaRecord * Af, const FastaRecord *Bf); | |
173 | |
174 void processSyntenys | |
175 (vector<Synteny> & Syntenys, | |
176 FastaRecord * Af, long int As, | |
177 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile); | |
178 | |
179 inline long int transC | |
180 (long int Coord, int Frame, long int Len); | |
181 | |
182 inline long int refLen | |
183 (long int ntLen); | |
184 | |
185 inline long int revC | |
186 (long int Coord, long int Len); | |
187 | |
188 void printHelp | |
189 (const char *); | |
190 | |
191 void printUsage | |
192 (const char *); | |
193 | |
194 void validateData | |
195 (vector<Alignment> Alignments, vector<Cluster> Clusters, | |
196 const FastaRecord * Af, const FastaRecord * Bf); | |
197 | |
198 | |
199 | |
200 //-------------------------------------------------- Function Definitions ----// | |
201 int main | |
202 (int argc, char ** argv) | |
203 { | |
204 FastaRecord * Af; // array of all the reference sequences | |
205 | |
206 vector<Synteny> Syntenys; // vector of all sets of clusters | |
207 vector<Synteny>::reverse_iterator CurrSp; // current set of clusters | |
208 vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer | |
209 | |
210 Synteny Asyn; // a single set of clusters | |
211 Cluster Aclu; // a single cluster of matches | |
212 Match Amat; // a single match | |
213 | |
214 LineType PrevLine; // the current input line | |
215 | |
216 bool Found; // temporary search flag | |
217 | |
218 char Line [MAX_LINE]; // a single line of input | |
219 char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers | |
220 char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names | |
221 char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; | |
222 | |
223 int FrameA, FrameB; // reading frames (1-6) | |
224 int CurrFrameA, CurrFrameB; | |
225 | |
226 long int i; // temporary counter | |
227 long int Seqi; // current reference sequence index | |
228 long int InitSize; // initial sequence memory space | |
229 long int As = 0; // the size of the reference array | |
230 long int Ac = 100; // the capacity of the reference array | |
231 long int sA, sB, len; // current match start in A, B and length | |
232 | |
233 FILE * RefFile, * QryFile; // reference and query input files | |
234 FILE * ClusterFile, * DeltaFile; // cluster and delta output files | |
235 | |
236 //-- Set the alignment data type and break length (sw_align.h) | |
237 setMatrixType ( BLOSUM62 ); | |
238 setBreakLen ( 60 ); | |
239 | |
240 //-- Parse the command line arguments | |
241 { | |
242 optarg = NULL; | |
243 int ch, errflg = 0; | |
244 while ( !errflg && ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) ) | |
245 switch (ch) | |
246 { | |
247 case 'b' : | |
248 setBreakLen (atoi (optarg)); | |
249 break; | |
250 | |
251 case 'd' : | |
252 DO_DELTA = false; | |
253 break; | |
254 | |
255 case 'e' : | |
256 DO_EXTEND = false; | |
257 break; | |
258 | |
259 case 'h' : | |
260 printHelp (argv[0]); | |
261 exit (EXIT_SUCCESS); | |
262 break; | |
263 | |
264 case 't' : | |
265 TO_SEQEND = true; | |
266 break; | |
267 | |
268 case 'x' : | |
269 setMatrixType ( atoi (optarg) ); | |
270 if ( getMatrixType( ) == NUCLEOTIDE ) | |
271 { | |
272 fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n", | |
273 getMatrixType( )); | |
274 setMatrixType ( BLOSUM62 ); | |
275 } | |
276 break; | |
277 | |
278 default : | |
279 errflg ++; | |
280 } | |
281 | |
282 if ( errflg > 0 || argc - optind != 3 ) | |
283 { | |
284 printUsage (argv[0]); | |
285 exit (EXIT_FAILURE); | |
286 } | |
287 } | |
288 | |
289 //-- Read and create the I/O file names | |
290 strcpy (RefFileName, argv[optind ++]); | |
291 strcpy (QryFileName, argv[optind ++]); | |
292 strcpy (ClusterFileName, argv[optind ++]); | |
293 strcpy (DeltaFileName, ClusterFileName); | |
294 strcat (ClusterFileName, ".cluster"); | |
295 strcat (DeltaFileName, ".delta"); | |
296 | |
297 //-- Open all the files | |
298 RefFile = File_Open (RefFileName, "r"); | |
299 QryFile = File_Open (QryFileName, "r"); | |
300 if ( DO_DELTA ) | |
301 { | |
302 ClusterFile = NULL; | |
303 DeltaFile = File_Open (DeltaFileName, "w"); | |
304 } | |
305 else | |
306 { | |
307 ClusterFile = File_Open (ClusterFileName, "w"); | |
308 DeltaFile = NULL; | |
309 } | |
310 | |
311 //-- Print the headers of the output files | |
312 if ( DO_DELTA ) | |
313 fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName); | |
314 else | |
315 fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName); | |
316 | |
317 //-- Generate the array of the reference sequences | |
318 InitSize = INIT_SIZE; | |
319 Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac ); | |
320 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); | |
321 while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) ) | |
322 { | |
323 Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1)); | |
324 strcpy (Af[As].Id, IdA); | |
325 | |
326 Af[As].len = strlen (Af[As].seq + 1); | |
327 | |
328 if ( ++ As >= Ac ) | |
329 { | |
330 Ac *= 2; | |
331 Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac ); | |
332 } | |
333 | |
334 InitSize = INIT_SIZE; | |
335 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); | |
336 } | |
337 fclose (RefFile); | |
338 if ( As <= 0 ) | |
339 parseAbort ( RefFileName ); | |
340 | |
341 | |
342 | |
343 //-- Process the input from <stdin> line by line | |
344 PrevLine = NO_LINE; | |
345 IdA[0] = '\0'; | |
346 IdB[0] = '\0'; | |
347 FrameA = 0; | |
348 FrameB = 0; | |
349 CurrFrameA = -1; | |
350 CurrFrameB = -1; | |
351 CurrIdB[0] = '\0'; | |
352 while ( fgets(Line, MAX_LINE, stdin) != NULL ) | |
353 { | |
354 | |
355 //-- If the current line is a fasta HEADER_LINE | |
356 if ( Line[0] == '>' ) | |
357 { | |
358 if ( sscanf (Line + 1, "%s", CurrIdB) != 1 ) | |
359 parseAbort ("stdin"); | |
360 | |
361 sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB); | |
362 CurrIdB [strlen(CurrIdB) - 2] = '\0'; | |
363 | |
364 PrevLine = HEADER_LINE; | |
365 } | |
366 | |
367 | |
368 //-- If the current line is a cluster HEADER_LINE | |
369 else if ( Line[0] == '#' ) | |
370 { | |
371 PrevLine = HEADER_LINE; | |
372 } | |
373 | |
374 | |
375 //-- If the current line is a MATCH_LINE | |
376 else | |
377 { | |
378 if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 ) | |
379 parseAbort ("stdin"); | |
380 | |
381 //-- Re-map the reference coordinate back to its original sequence | |
382 // NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length | |
383 // including the appended 'X' on each frame translation | |
384 for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ ) | |
385 sA -= refLen (Af[Seqi].len); | |
386 | |
387 assert ( Seqi < As ); | |
388 | |
389 //-- Get the correct frame, translate startA coordinate to frame | |
390 for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ ) | |
391 sA -= (Af[Seqi].len - (i % 3)) / 3 + 1; | |
392 CurrFrameA = i + 1; | |
393 assert ( CurrFrameA > 0 && CurrFrameA < 7 ); | |
394 | |
395 //-- If the match spans across a frame boundry | |
396 if ( CurrFrameA < 1 || CurrFrameA > 6 || | |
397 sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 || | |
398 sA <= 0 ) | |
399 { | |
400 fprintf (stderr, | |
401 "\nWARNING: A MUM was found extending beyond the boundry of:\n" | |
402 " Reference sequence '>%s', frame %d\n" | |
403 "Please file a bug report\n" | |
404 "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA); | |
405 continue; | |
406 } | |
407 | |
408 //-- If the match spans across a sequence boundry | |
409 if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 ) | |
410 { | |
411 fprintf (stderr, | |
412 "\nWARNING: A MUM was found extending beyond the boundry of:\n" | |
413 " Reference sequence '>%s'\n" | |
414 "Please file a bug report\n" | |
415 "Attempting to continue.\n", Af[Seqi].Id); | |
416 continue; | |
417 } | |
418 | |
419 //-- Check and update the current synteny region | |
420 if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 || | |
421 CurrFrameA != FrameA || CurrFrameB != FrameB ) | |
422 { | |
423 Found = false; | |
424 if ( strcmp (IdB, CurrIdB) == 0 ) | |
425 { | |
426 //-- Has this header been seen before? | |
427 for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ ) | |
428 { | |
429 if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 ) | |
430 { | |
431 assert ( Sp->AfP->len == Af[Seqi].len ); | |
432 assert ( strcmp (Sp->Bf.Id, IdB) == 0 ); | |
433 CurrSp = Sp; | |
434 Found = true; | |
435 break; | |
436 } | |
437 } | |
438 } | |
439 else | |
440 { | |
441 //-- New B sequence header, process all the old synteny's | |
442 processSyntenys (Syntenys, Af, As, | |
443 QryFile, ClusterFile, DeltaFile); | |
444 | |
445 } | |
446 | |
447 strcpy (IdA, Af[Seqi].Id); | |
448 strcpy (IdB, CurrIdB); | |
449 FrameA = CurrFrameA; | |
450 FrameB = CurrFrameB; | |
451 | |
452 //-- If not seen yet, create a new synteny region | |
453 if ( ! Found ) | |
454 { | |
455 Asyn.AfP = Af + Seqi; | |
456 Asyn.Bf.len = -1; | |
457 Asyn.Bf.Id = (char *) Safe_malloc | |
458 (sizeof(char) * (strlen(IdB) + 1)); | |
459 strcpy (Asyn.Bf.Id, IdB); | |
460 | |
461 Syntenys.push_back (Asyn); | |
462 CurrSp = Syntenys.rbegin( ); | |
463 } | |
464 | |
465 //-- Add a new cluster to the current synteny | |
466 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) | |
467 if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) | |
468 CurrSp->clusters.pop_back( ); // hack to remove empties | |
469 Aclu.frameA = FrameA; | |
470 Aclu.frameB = FrameB; | |
471 Aclu.wasFused = false; | |
472 CurrSp->clusters.push_back (Aclu); | |
473 } | |
474 else if ( PrevLine == HEADER_LINE ) | |
475 { | |
476 //-- Add a new cluster to the current synteny | |
477 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) | |
478 if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) | |
479 CurrSp->clusters.pop_back( ); | |
480 Aclu.frameA = FrameA; | |
481 Aclu.frameB = FrameB; | |
482 Aclu.wasFused = false; | |
483 CurrSp->clusters.push_back (Aclu); | |
484 } | |
485 | |
486 //-- Add a new match to the current cluster | |
487 // NOTE: A and B coordinates still reference the appropriate | |
488 // amino acid sequence frame, not the DNA (same with len) | |
489 if ( len > 1 ) | |
490 { | |
491 Amat.sA = sA; | |
492 Amat.sB = sB; | |
493 Amat.len = len; | |
494 CurrSp->clusters.rbegin( )->matches.push_back (Amat); | |
495 } | |
496 | |
497 PrevLine = MATCH_LINE; | |
498 } | |
499 } | |
500 | |
501 //-- Process the left-over syntenys | |
502 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) | |
503 if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) | |
504 CurrSp->clusters.pop_back( ); | |
505 processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); | |
506 fclose (QryFile); | |
507 | |
508 //-- Free the reference sequences | |
509 for ( i = 0; i < As; i ++ ) | |
510 { | |
511 free (Af[i].Id); | |
512 free (Af[i].seq); | |
513 } | |
514 free (Af); | |
515 | |
516 return EXIT_SUCCESS; | |
517 } | |
518 | |
519 | |
520 | |
521 | |
522 void addNewAlignment | |
523 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, | |
524 vector<Match>::iterator Mp) | |
525 | |
526 // Create and add a new alignment object based on the current match | |
527 // and cluster information pointed to by Cp and Mp. | |
528 | |
529 { | |
530 Alignment Align; | |
531 | |
532 //-- Initialize the data | |
533 Align.sA = Mp->sA; | |
534 Align.sB = Mp->sB; | |
535 Align.eA = Mp->sA + Mp->len - 1; | |
536 Align.eB = Mp->sB + Mp->len - 1; | |
537 Align.frameA = Cp->frameA; | |
538 Align.frameB = Cp->frameB; | |
539 Align.delta.clear( ); | |
540 Align.deltaApos = 0; | |
541 | |
542 //-- Add the alignment object | |
543 Alignments.push_back (Align); | |
544 | |
545 return; | |
546 } | |
547 | |
548 | |
549 | |
550 | |
551 bool extendBackward | |
552 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, | |
553 vector<Alignment>::iterator TargetAp, const char * A, const char * B) | |
554 | |
555 // Extend an alignment backwards off of the current alignment object. | |
556 // The current alignment object must be freshly created and consist | |
557 // only of an exact match (i.e. the delta vector MUST be empty). | |
558 // If the TargetAp alignment object is reached by the extension, it will | |
559 // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is | |
560 // NULL the function will extend as far as possible. It is a strange | |
561 // and dangerous function because it can delete CurrAp, so edit with | |
562 // caution. Returns true if TargetAp was reached and merged, else false | |
563 // Designed only as a subroutine for extendClusters, should be used | |
564 // nowhere else. | |
565 | |
566 { | |
567 bool target_reached = false; | |
568 bool overflow_flag = false; | |
569 bool double_flag = false; | |
570 | |
571 vector<long int>::iterator Dp; | |
572 | |
573 unsigned int m_o; | |
574 long int targetA, targetB; | |
575 | |
576 m_o = BACKWARD_SEARCH; | |
577 | |
578 //-- Set the target coordinates | |
579 if ( TargetAp != Alignments.end( ) ) | |
580 { | |
581 targetA = TargetAp->eA; | |
582 targetB = TargetAp->eB; | |
583 } | |
584 else | |
585 { | |
586 targetA = 1; | |
587 targetB = 1; | |
588 m_o |= OPTIMAL_BIT; | |
589 } | |
590 | |
591 //-- If alignment is too long, bring with bounds and set overflow_flag true | |
592 if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH ) | |
593 { | |
594 targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1; | |
595 overflow_flag = true; | |
596 m_o |= OPTIMAL_BIT; | |
597 } | |
598 if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH ) | |
599 { | |
600 targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1; | |
601 if ( overflow_flag ) | |
602 double_flag = true; | |
603 else | |
604 overflow_flag = true; | |
605 m_o |= OPTIMAL_BIT; | |
606 } | |
607 | |
608 | |
609 if ( TO_SEQEND && !double_flag ) | |
610 m_o |= SEQEND_BIT; | |
611 | |
612 | |
613 //-- Attempt to reach the target | |
614 target_reached = alignSearch (A, CurrAp->sA, targetA, | |
615 B, CurrAp->sB, targetB, m_o); | |
616 | |
617 if ( overflow_flag || TargetAp == Alignments.end( ) ) | |
618 target_reached = false; | |
619 | |
620 if ( target_reached ) | |
621 { | |
622 //-- assert that CurrAp is new and at the end of the Alignment vector | |
623 assert ( CurrAp->delta.empty( ) ); | |
624 assert ( CurrAp == Alignments.end( ) - 1 ); | |
625 | |
626 //-- Merge the two alignment objects | |
627 extendForward (TargetAp, A, CurrAp->sA, | |
628 B, CurrAp->sB, FORCED_FORWARD_ALIGN); | |
629 TargetAp->eA = CurrAp->eA; | |
630 TargetAp->eB = CurrAp->eB; | |
631 Alignments.pop_back( ); | |
632 } | |
633 else | |
634 { | |
635 alignTarget (A, targetA, CurrAp->sA, | |
636 B, targetB, CurrAp->sB, | |
637 CurrAp->delta, FORCED_FORWARD_ALIGN); | |
638 CurrAp->sA = targetA; | |
639 CurrAp->sB = targetB; | |
640 | |
641 //-- Update the deltaApos value for the alignment object | |
642 for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ ) | |
643 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; | |
644 } | |
645 | |
646 return target_reached; | |
647 } | |
648 | |
649 | |
650 | |
651 | |
652 bool extendForward | |
653 (vector<Alignment>::iterator CurrAp, const char * A, long int targetA, | |
654 const char * B, long int targetB, unsigned int m_o) | |
655 | |
656 // Extend an alignment forwards off the current alignment object until | |
657 // target or end of sequence is reached, and merge the delta values of the | |
658 // alignment object with the new delta values generated by the extension. | |
659 // Return true if the target was reached, else false | |
660 | |
661 { | |
662 long int ValA; | |
663 long int ValB; | |
664 unsigned int Di; | |
665 bool target_reached; | |
666 bool overflow_flag = false; | |
667 bool double_flag = false; | |
668 vector<long int>::iterator Dp; | |
669 | |
670 //-- Set Di to the end of the delta vector | |
671 Di = CurrAp->delta.size( ); | |
672 | |
673 //-- Calculate the distance between the start and end positions | |
674 ValA = targetA - CurrAp->eA + 1; | |
675 ValB = targetB - CurrAp->eB + 1; | |
676 | |
677 //-- If the distance is too long, shrink it and set the overflow_flag | |
678 if ( ValA > MAX_ALIGNMENT_LENGTH ) | |
679 { | |
680 targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1; | |
681 overflow_flag = true; | |
682 m_o |= OPTIMAL_BIT; | |
683 } | |
684 if ( ValB > MAX_ALIGNMENT_LENGTH ) | |
685 { | |
686 targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1; | |
687 if ( overflow_flag ) | |
688 double_flag = true; | |
689 else | |
690 overflow_flag = true; | |
691 m_o |= OPTIMAL_BIT; | |
692 } | |
693 | |
694 if ( double_flag ) | |
695 m_o &= ~SEQEND_BIT; | |
696 | |
697 target_reached = alignTarget (A, CurrAp->eA, targetA, | |
698 B, CurrAp->eB, targetB, | |
699 CurrAp->delta, m_o); | |
700 | |
701 //-- Notify user if alignment was chopped short | |
702 if ( target_reached && overflow_flag ) | |
703 target_reached = false; | |
704 | |
705 //-- Pick up delta where left off (Di) and merge with new deltas | |
706 if ( Di < CurrAp->delta.size( ) ) | |
707 { | |
708 //-- Merge the deltas | |
709 ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1; | |
710 CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA); | |
711 if ( CurrAp->delta[Di] == 0 || ValA < 0 ) | |
712 { | |
713 fprintf(stderr, | |
714 "ERROR: failed to merge alignments at position %ld\n" | |
715 " Please file a bug report\n", | |
716 CurrAp->eA); | |
717 exit (EXIT_FAILURE); | |
718 } | |
719 | |
720 //-- Update the deltaApos | |
721 for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ ) | |
722 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; | |
723 } | |
724 | |
725 //-- Set the alignment coordinates | |
726 CurrAp->eA = targetA; | |
727 CurrAp->eB = targetB; | |
728 | |
729 return target_reached; | |
730 } | |
731 | |
732 | |
733 | |
734 | |
735 void extendClusters | |
736 (vector<Cluster> & Clusters, | |
737 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) | |
738 | |
739 // Connect all the matches in every cluster between sequences A and B. | |
740 // Also, extend alignments off of the front and back of each cluster to | |
741 // expand total alignment coverage. When these extensions encounter an | |
742 // adjacent cluster (in a consistent frame), fuse the two regions to | |
743 // create one single encompassing region. This routine will create | |
744 // alignment objects from these extensions and output the resulting | |
745 // delta information to the delta output file. Also, all coordintes | |
746 // for the clusters and the alignment regions are translated to reference | |
747 // the original DNA sequecne. | |
748 | |
749 { | |
750 //-- Sort the clusters (ascending) by their start coordinate in sequence A | |
751 sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( )); | |
752 | |
753 //-- If no delta file is requested | |
754 if ( ! DO_DELTA ) | |
755 return; | |
756 | |
757 | |
758 bool target_reached = false; // reached the adjacent match or cluster | |
759 // the amino acid sequences for A and B | |
760 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; | |
761 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; | |
762 | |
763 long int Alen [7], Blen [7]; // the length of the amino acid sequences | |
764 int i; | |
765 | |
766 unsigned int m_o; | |
767 long int targetA, targetB; // alignment extension targets in A and B | |
768 | |
769 vector<Match>::iterator Mp; // match pointer | |
770 | |
771 vector<Cluster>::iterator PrevCp; // where the extensions last left off | |
772 vector<Cluster>::iterator CurrCp; // the current cluster being extended | |
773 vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster | |
774 | |
775 vector<Alignment> Alignments; // the vector of alignment objects | |
776 vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align | |
777 vector<Alignment>::iterator TargetAp; // target align | |
778 | |
779 //-- Calculate the length of each amino acid frame translation | |
780 for ( i = 0; i < 6; i ++ ) | |
781 { | |
782 Alen [i + 1] = (Af->len - (i % 3)) / 3; | |
783 Blen [i + 1] = (Bf->len - (i % 3)) / 3; | |
784 } | |
785 | |
786 //-- Extend each cluster | |
787 PrevCp = Clusters.begin( ); | |
788 CurrCp = Clusters.begin( ); | |
789 while ( CurrCp < Clusters.end( ) ) | |
790 { | |
791 if ( DO_EXTEND ) | |
792 { | |
793 //-- Ignore if shadowed or already extended | |
794 if ( ! target_reached ) | |
795 if ( CurrCp->wasFused || | |
796 isShadowedCluster (CurrCp, Alignments, CurrAp) ) | |
797 { | |
798 CurrCp->wasFused = true; | |
799 CurrCp = ++ PrevCp; | |
800 continue; | |
801 } | |
802 } | |
803 | |
804 //-- Initialize the right amino acid sequence for A and B if need be | |
805 if ( A [CurrCp->frameA] == NULL ) | |
806 { | |
807 A [CurrCp->frameA] = (char *) Safe_malloc | |
808 ( sizeof(char) * ( (Af->len / 3) + 2) ); | |
809 A [CurrCp->frameA] [0] = '\0'; | |
810 Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA], | |
811 CurrCp->frameA ); | |
812 } | |
813 if ( B [CurrCp->frameB] == NULL ) | |
814 { | |
815 B [CurrCp->frameB] = (char *) Safe_malloc | |
816 ( sizeof(char) * ( (Bf->len / 3) + 2) ); | |
817 B [CurrCp->frameB] [0] = '\0'; | |
818 Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB], | |
819 CurrCp->frameB ); | |
820 } | |
821 | |
822 //-- Extend each match in the cluster | |
823 for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ ) | |
824 { | |
825 //-- Try to extend the current match backwards | |
826 if ( target_reached ) | |
827 { | |
828 //-- Merge with the previous match | |
829 if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB ) | |
830 { | |
831 //-- Strip matches until the targeted match is found | |
832 assert (Mp < CurrCp->matches.end( ) - 1); | |
833 continue; | |
834 } | |
835 | |
836 CurrAp->eA += Mp->len - 1; | |
837 CurrAp->eB += Mp->len - 1; | |
838 } | |
839 else | |
840 { | |
841 //-- Create a new alignment object | |
842 addNewAlignment (Alignments, CurrCp, Mp); | |
843 CurrAp = Alignments.end( ) - 1; | |
844 | |
845 if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) ) | |
846 { | |
847 //-- Target the closest/best alignment object | |
848 TargetAp = getReverseTargetAlignment (Alignments, CurrAp); | |
849 | |
850 //-- Extend the new alignment object backwards | |
851 if ( extendBackward (Alignments, CurrAp, TargetAp, | |
852 A [CurrCp->frameA], B [CurrCp->frameB]) ) | |
853 CurrAp = TargetAp; | |
854 } | |
855 } | |
856 | |
857 m_o = FORWARD_ALIGN; | |
858 | |
859 //-- Try to extend the current match forwards | |
860 if ( Mp < CurrCp->matches.end( ) - 1 ) | |
861 { | |
862 //-- Target the next match in the cluster | |
863 targetA = (Mp + 1)->sA; | |
864 targetB = (Mp + 1)->sB; | |
865 | |
866 //-- Extend the current alignment object forward | |
867 target_reached = extendForward (CurrAp, | |
868 A [CurrCp->frameA], targetA, | |
869 B [CurrCp->frameB], targetB, m_o); | |
870 } | |
871 else if ( DO_EXTEND ) | |
872 { | |
873 targetA = Alen [CurrCp->frameA]; | |
874 targetB = Blen [CurrCp->frameB]; | |
875 | |
876 //-- Target the closest/best match in a future cluster | |
877 TargetCp = getForwardTargetCluster (Clusters, CurrCp, | |
878 targetA, targetB); | |
879 | |
880 if ( TargetCp == Clusters.end( ) ) | |
881 { | |
882 m_o |= OPTIMAL_BIT; | |
883 if ( TO_SEQEND ) | |
884 m_o |= SEQEND_BIT; | |
885 } | |
886 | |
887 //-- Extend the current alignment object forward | |
888 target_reached = extendForward (CurrAp, | |
889 A [CurrCp->frameA], targetA, | |
890 B [CurrCp->frameB], targetB, m_o); | |
891 } | |
892 } | |
893 | |
894 if ( TargetCp == Clusters.end( ) ) | |
895 target_reached = false; | |
896 | |
897 CurrCp->wasFused = true; | |
898 | |
899 if ( target_reached == false ) | |
900 CurrCp = ++ PrevCp; | |
901 else | |
902 CurrCp = TargetCp; | |
903 } | |
904 | |
905 #ifdef _DEBUG_ASSERT | |
906 validateData (Alignments, Clusters, Af, Bf); | |
907 #endif | |
908 | |
909 //-- Output the alignment data to the delta file | |
910 flushAlignments (Alignments, Af, Bf, DeltaFile); | |
911 | |
912 for ( i = 0; i < 7; i ++ ) | |
913 { | |
914 if ( A [i] != NULL ) | |
915 free ( A [i] ); | |
916 if ( B [i] != NULL ) | |
917 free ( B [i] ); | |
918 } | |
919 | |
920 return; | |
921 } | |
922 | |
923 | |
924 | |
925 | |
926 void flushAlignments | |
927 (vector<Alignment> & Alignments, | |
928 const FastaRecord * Af, const FastaRecord * Bf, | |
929 FILE * DeltaFile) | |
930 | |
931 // Simply output the delta information stored in Alignments to the | |
932 // given delta file. Free the memory used by Alignments once the | |
933 // data is successfully output to the file. | |
934 | |
935 { | |
936 vector<Alignment>::iterator Ap; // alignment pointer | |
937 vector<long int>::iterator Dp; // delta pointer | |
938 | |
939 fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len); | |
940 | |
941 //-- Generate the error counts | |
942 parseDelta (Alignments, Af, Bf); | |
943 | |
944 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) | |
945 { | |
946 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", | |
947 transC (Ap->sA, Ap->frameA, Af->len), | |
948 transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2), | |
949 transC (Ap->sB, Ap->frameB, Bf->len), | |
950 transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2), | |
951 Ap->Errors, Ap->SimErrors, Ap->NonAlphas); | |
952 | |
953 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) | |
954 fprintf (DeltaFile, "%ld\n", *Dp); | |
955 fprintf (DeltaFile, "0\n"); | |
956 | |
957 Ap->delta.clear( ); | |
958 } | |
959 | |
960 Alignments.clear( ); | |
961 | |
962 return; | |
963 } | |
964 | |
965 | |
966 | |
967 | |
968 void flushSyntenys | |
969 (vector<Synteny> & Syntenys, FILE * ClusterFile) | |
970 | |
971 // Simply output the synteny/cluster information generated by the mgaps | |
972 // program. However, now the coordinates reference their appropriate | |
973 // reference sequence, and the reference sequecne header is added to | |
974 // the appropriate lines. Free the memory used by Syntenys once the | |
975 // data is successfully output to the file. | |
976 | |
977 { | |
978 vector<Synteny>::iterator Sp; // synteny pointer | |
979 vector<Cluster>::iterator Cp; // cluster pointer | |
980 vector<Match>::iterator Mp; // match pointer | |
981 | |
982 if ( ClusterFile ) | |
983 { | |
984 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) | |
985 { | |
986 fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id, | |
987 Sp->AfP->len, Sp->Bf.len); | |
988 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) | |
989 { | |
990 fprintf (ClusterFile, "%2d %2d\n", | |
991 Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA, | |
992 Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB); | |
993 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) | |
994 { | |
995 fprintf (ClusterFile, "%8ld %8ld %6ld", | |
996 transC (Mp->sA, Cp->frameA, Sp->AfP->len), | |
997 transC (Mp->sB, Cp->frameB, Sp->Bf.len), | |
998 Mp->len * 3); | |
999 | |
1000 if ( Mp != Cp->matches.begin( ) ) | |
1001 fprintf (ClusterFile, "%6ld %6ld\n", | |
1002 (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3, | |
1003 (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3); | |
1004 else | |
1005 fprintf (ClusterFile, "%6s %6s\n", "-", "-"); | |
1006 } | |
1007 Cp->matches.clear( ); | |
1008 } | |
1009 Sp->clusters.clear( ); | |
1010 } | |
1011 } | |
1012 else | |
1013 { | |
1014 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) | |
1015 { | |
1016 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) | |
1017 { | |
1018 Cp->matches.clear( ); | |
1019 } | |
1020 Sp->clusters.clear( ); | |
1021 } | |
1022 } | |
1023 | |
1024 Syntenys.clear( ); | |
1025 | |
1026 return; | |
1027 } | |
1028 | |
1029 | |
1030 | |
1031 | |
1032 | |
1033 vector<Cluster>::iterator getForwardTargetCluster | |
1034 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, | |
1035 long int & targetA, long int & targetB) | |
1036 | |
1037 // Return the cluster that is most likely to successfully join (in a | |
1038 // forward direction) with the current cluster. The returned cluster | |
1039 // must contain 1 or more matches that are strictly greater than the end | |
1040 // of the current cluster. The targeted cluster must also be on a | |
1041 // diagonal close enough to the current cluster, so that a connection | |
1042 // could possibly be made by the alignment extender. Also, this targeted | |
1043 // cluster must be consistent in frame with the current cluster. Assumes | |
1044 // clusters have been sorted via AscendingClusterSort. Returns targeted | |
1045 // cluster and stores the target coordinates in targetA and targetB. If no | |
1046 // suitable cluster was found, the function will return NULL and target | |
1047 // A and targetB will remain unchanged. | |
1048 | |
1049 { | |
1050 vector<Match>::iterator Mip; // match iteratrive pointer | |
1051 vector<Cluster>::iterator Cp; // cluster pointer | |
1052 vector<Cluster>::iterator Cip; // cluster iterative pointer | |
1053 long int eA, eB; // possible target | |
1054 long int greater, lesser; // gap sizes between two clusters | |
1055 long int sA = CurrCp->matches.rbegin( )->sA + | |
1056 CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster | |
1057 long int sB = CurrCp->matches.rbegin( )->sB + | |
1058 CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster | |
1059 | |
1060 //-- End of sequences is the default target, set distance accordingly | |
1061 long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB); | |
1062 | |
1063 //-- For all clusters greater than the current cluster (on sequence A) | |
1064 Cp = Clusters.end( ); | |
1065 for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ ) | |
1066 { | |
1067 //-- If the cluster is in the same frame | |
1068 if ( CurrCp->frameA == Cip->frameA && CurrCp->frameB == Cip->frameB ) | |
1069 { | |
1070 eA = Cip->matches.begin( )->sA; | |
1071 eB = Cip->matches.begin( )->sB; | |
1072 | |
1073 //-- If the cluster overlaps the current cluster, strip some matches | |
1074 if ( ( eA < sA || eB < sB ) && | |
1075 Cip->matches.rbegin( )->sA >= sA && | |
1076 Cip->matches.rbegin( )->sB >= sB ) | |
1077 { | |
1078 for ( Mip = Cip->matches.begin( ); | |
1079 Mip < Cip->matches.end( ) && ( eA < sA || eB < sB ); | |
1080 Mip ++ ) | |
1081 { | |
1082 eA = Mip->sA; | |
1083 eB = Mip->sB; | |
1084 } | |
1085 } | |
1086 | |
1087 //-- If the cluster is strictly greater than current cluster | |
1088 if ( eA >= sA && eB >= sB ) | |
1089 { | |
1090 if ( eA - sA > eB - sB ) | |
1091 { | |
1092 greater = eA - sA; | |
1093 lesser = eB - sB; | |
1094 } | |
1095 else | |
1096 { | |
1097 lesser = eA - sA; | |
1098 greater = eB - sB; | |
1099 } | |
1100 | |
1101 //-- If the cluster is close enough | |
1102 if ( greater <= getBreakLen( ) || | |
1103 (lesser) * GOOD_SCORE [getMatrixType( )] + | |
1104 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 ) | |
1105 { | |
1106 Cp = Cip; | |
1107 targetA = eA; | |
1108 targetB = eB; | |
1109 break; | |
1110 } | |
1111 else if ( (greater << 1) - lesser < dist ) | |
1112 { | |
1113 Cp = Cip; | |
1114 targetA = eA; | |
1115 targetB = eB; | |
1116 dist = (greater << 1) - lesser; | |
1117 } | |
1118 } | |
1119 } | |
1120 } | |
1121 | |
1122 | |
1123 return Cp; | |
1124 } | |
1125 | |
1126 | |
1127 | |
1128 | |
1129 | |
1130 vector<Alignment>::iterator getReverseTargetAlignment | |
1131 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp) | |
1132 | |
1133 // Return the alignment that is most likely to successfully join (in a | |
1134 // reverse direction) with the current alignment. The returned alignment | |
1135 // must be strictly less than the current cluster and be on a diagonal | |
1136 // close enough to the current alignment, so that a connection | |
1137 // could possibly be made by the alignment extender. Also, this targeted | |
1138 // alignment must be consistent in frame with the current alignment. | |
1139 // Assumes clusters have been sorted via AscendingClusterSort and | |
1140 // processed in order, so therefore all alignments are in order by their | |
1141 // start A coordinate. | |
1142 | |
1143 { | |
1144 vector<Alignment>::iterator Ap; // alignment pointer | |
1145 vector<Alignment>::iterator Aip; // alignment iterative pointer | |
1146 long int eA, eB; // possible targets | |
1147 long int greater, lesser; // gap sizes between the two alignments | |
1148 long int sA = CurrAp->sA; // the startA of the current alignment | |
1149 long int sB = CurrAp->sB; // the startB of the current alignment | |
1150 | |
1151 //-- Beginning of sequences is the default target, set distance accordingly | |
1152 long int dist = (sA < sB ? sA : sB); | |
1153 | |
1154 //-- For all alignments less than the current alignment (on sequence A) | |
1155 Ap = Alignments.end( ); | |
1156 for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- ) | |
1157 { | |
1158 //-- If the alignment is on the same direction | |
1159 if ( CurrAp->frameA == Aip->frameA && CurrAp->frameB == Aip->frameB ) | |
1160 { | |
1161 eA = Aip->eA; | |
1162 eB = Aip->eB; | |
1163 | |
1164 //-- If the alignment is strictly greater than current cluster | |
1165 if ( eA <= sA && eB <= sB ) | |
1166 { | |
1167 if ( sA - eA > sB - eB ) | |
1168 { | |
1169 greater = sA - eA; | |
1170 lesser = sB - eB; | |
1171 } | |
1172 else | |
1173 { | |
1174 lesser = sA - eA; | |
1175 greater = sB - eB; | |
1176 } | |
1177 | |
1178 //-- If the cluster is close enough | |
1179 if ( greater <= getBreakLen( ) || | |
1180 (lesser) * GOOD_SCORE [getMatrixType( )] + | |
1181 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 ) | |
1182 { | |
1183 Ap = Aip; | |
1184 break; | |
1185 } | |
1186 else if ( (greater << 1) - lesser < dist ) | |
1187 { | |
1188 Ap = Aip; | |
1189 dist = (greater << 1) - lesser; | |
1190 } | |
1191 } | |
1192 } | |
1193 } | |
1194 | |
1195 | |
1196 return Ap; | |
1197 } | |
1198 | |
1199 | |
1200 | |
1201 | |
1202 | |
1203 bool isShadowedCluster | |
1204 (vector<Cluster>::iterator CurrCp, | |
1205 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap) | |
1206 | |
1207 // Check if the current cluster is shadowed by a previously produced | |
1208 // alignment region. Return true if it is, else false. | |
1209 | |
1210 { | |
1211 vector<Alignment>::iterator Aip; // alignment pointer | |
1212 | |
1213 long int sA = CurrCp->matches.begin( )->sA; | |
1214 long int eA = CurrCp->matches.rbegin( )->sA + | |
1215 CurrCp->matches.rbegin( )->len - 1; | |
1216 long int sB = CurrCp->matches.begin( )->sB; | |
1217 long int eB = CurrCp->matches.rbegin( )->sB + | |
1218 CurrCp->matches.rbegin( )->len - 1; | |
1219 | |
1220 if ( ! Alignments.empty( ) ) // if there are alignments to use | |
1221 { | |
1222 //-- Look backwards in hope of finding a shadowing alignment | |
1223 for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- ) | |
1224 { | |
1225 //-- If in the same frames and shadowing the current cluster, break | |
1226 if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB ) | |
1227 if ( Aip->eA >= eA && Aip->eB >= eB ) | |
1228 if ( Aip->sA <= sA && Aip->sB <= sB ) | |
1229 break; | |
1230 } | |
1231 | |
1232 //-- Return true if the loop was broken, i.e. shadow found | |
1233 if ( Aip >= Alignments.begin( ) ) | |
1234 return true; | |
1235 } | |
1236 | |
1237 //-- Return false if Alignments was empty or loop was not broken | |
1238 return false; | |
1239 } | |
1240 | |
1241 | |
1242 | |
1243 | |
1244 | |
1245 | |
1246 void parseAbort | |
1247 (const char * s) | |
1248 | |
1249 // Abort the program if there was an error in parsing file 's' | |
1250 | |
1251 { | |
1252 fprintf (stderr, | |
1253 "ERROR: Could not parse input from '%s'. \n" | |
1254 "Please check the filename and format, or file a bug report\n", s); | |
1255 exit (EXIT_FAILURE); | |
1256 } | |
1257 | |
1258 | |
1259 | |
1260 | |
1261 void parseDelta | |
1262 (vector<Alignment> & Alignments, | |
1263 const FastaRecord * Af, const FastaRecord *Bf) | |
1264 | |
1265 // Use the delta information to generate the error counts for each | |
1266 // alignment, and fill this information into the data type | |
1267 | |
1268 { | |
1269 // pointers to Af.seq and Bf.seq | |
1270 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; | |
1271 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; | |
1272 | |
1273 char ch1, ch2; | |
1274 long int Delta; | |
1275 int Sign; | |
1276 long int i, Apos, Bpos; | |
1277 long int Remain, Total; | |
1278 long int Errors, SimErrors; | |
1279 long int NonAlphas; | |
1280 vector<Alignment>::iterator Ap; | |
1281 vector<long int>::iterator Dp; | |
1282 | |
1283 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) | |
1284 { | |
1285 if ( A [Ap->frameA] == NULL ) | |
1286 { | |
1287 A [Ap->frameA] = (char *) Safe_malloc | |
1288 ( sizeof(char) * ( (Af->len / 3) + 2 ) ); | |
1289 A [Ap->frameA] [0] = '\0'; | |
1290 Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA ); | |
1291 } | |
1292 if ( B [Ap->frameB] == NULL ) | |
1293 { | |
1294 B [Ap->frameB] = (char *) Safe_malloc | |
1295 ( sizeof(char) * ( (Bf->len / 3) + 2) ); | |
1296 B [Ap->frameB] [0] = '\0'; | |
1297 Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB ); | |
1298 } | |
1299 | |
1300 Apos = Ap->sA; | |
1301 Bpos = Ap->sB; | |
1302 | |
1303 Errors = 0; | |
1304 SimErrors = 0; | |
1305 NonAlphas = 0; | |
1306 Remain = Ap->eA - Ap->sA + 1; | |
1307 Total = Remain; | |
1308 | |
1309 //-- For all delta's in this alignment | |
1310 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) | |
1311 { | |
1312 Delta = *Dp; | |
1313 Sign = Delta > 0 ? 1 : -1; | |
1314 Delta = labs ( Delta ); | |
1315 | |
1316 //-- For all the bases before the next indel | |
1317 for ( i = 1; i < Delta; i ++ ) | |
1318 { | |
1319 ch1 = A [Ap->frameA] [Apos ++]; | |
1320 ch2 = B [Ap->frameB] [Bpos ++]; | |
1321 | |
1322 if ( !isalpha (ch1) ) | |
1323 { | |
1324 ch1 = STOP_CHAR; | |
1325 NonAlphas ++; | |
1326 } | |
1327 if ( !isalpha (ch2) ) | |
1328 { | |
1329 ch2 = STOP_CHAR; | |
1330 NonAlphas ++; | |
1331 } | |
1332 | |
1333 ch1 = toupper(ch1); | |
1334 ch2 = toupper(ch2); | |
1335 if ( 1 > MATCH_SCORE | |
1336 [getMatrixType( )] | |
1337 [ch1 - 'A'] | |
1338 [ch2 - 'A'] ) | |
1339 SimErrors ++; | |
1340 if ( ch1 != ch2 ) | |
1341 Errors ++; | |
1342 } | |
1343 | |
1344 //-- Process the current indel | |
1345 Remain -= i - 1; | |
1346 Errors ++; | |
1347 SimErrors ++; | |
1348 | |
1349 if ( Sign == 1 ) | |
1350 { | |
1351 if ( !isalpha (A [Ap->frameA] [Apos ++]) ) | |
1352 NonAlphas ++; | |
1353 Remain --; | |
1354 } | |
1355 else | |
1356 { | |
1357 if ( !isalpha (B [Ap->frameB] [Bpos ++]) ) | |
1358 NonAlphas ++; | |
1359 Total ++; | |
1360 } | |
1361 } | |
1362 | |
1363 //-- For all the bases after the final indel | |
1364 for ( i = 0; i < Remain; i ++ ) | |
1365 { | |
1366 //-- Score character match and update error counters | |
1367 ch1 = A [Ap->frameA] [Apos ++]; | |
1368 ch2 = B [Ap->frameB] [Bpos ++]; | |
1369 | |
1370 if ( !isalpha (ch1) ) | |
1371 { | |
1372 ch1 = STOP_CHAR; | |
1373 NonAlphas ++; | |
1374 } | |
1375 if ( !isalpha (ch2) ) | |
1376 { | |
1377 ch2 = STOP_CHAR; | |
1378 NonAlphas ++; | |
1379 } | |
1380 | |
1381 ch1 = toupper(ch1); | |
1382 ch2 = toupper(ch2); | |
1383 if ( 1 > MATCH_SCORE | |
1384 [getMatrixType( )] | |
1385 [ch1 - 'A'] | |
1386 [ch2 - 'A'] ) | |
1387 SimErrors ++; | |
1388 if ( ch1 != ch2 ) | |
1389 Errors ++; | |
1390 } | |
1391 | |
1392 Ap->Errors = Errors; | |
1393 Ap->SimErrors = SimErrors; | |
1394 Ap->NonAlphas = NonAlphas; | |
1395 } | |
1396 | |
1397 for ( i = 1; i <= 6; i ++ ) | |
1398 { | |
1399 if ( A [i] != NULL ) | |
1400 free ( A [i] ); | |
1401 A [i] = NULL; | |
1402 | |
1403 if ( B [i] != NULL ) | |
1404 free ( B [i] ); | |
1405 B [i] = NULL; | |
1406 } | |
1407 | |
1408 return; | |
1409 } | |
1410 | |
1411 | |
1412 | |
1413 | |
1414 void processSyntenys | |
1415 (vector<Synteny> & Syntenys, FastaRecord * Af, long int As, | |
1416 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile) | |
1417 | |
1418 // For each syntenic region with clusters, read in the B sequence and | |
1419 // extend the clusters to expand total alignment coverage. Clusters should | |
1420 // still reference the amino acid concatenations, the translation back to | |
1421 // DNA will be taken care of by the extendClusters function. | |
1422 // processSyntenys should ONLY be called once all the clusters for | |
1423 // the contained syntenic regions have been stored in the data structure. | |
1424 // Frees the memory used by the the syntenic regions once the output of | |
1425 // extendClusters and flushSyntenys has been produced. | |
1426 | |
1427 { | |
1428 FastaRecord Bf; // the current B sequence | |
1429 Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1)); | |
1430 | |
1431 long int InitSize = INIT_SIZE; // the initial memory capacity of B | |
1432 vector<Synteny>::iterator CurrSp; // current synteny pointer | |
1433 | |
1434 //-- Initialize the B sequence storage | |
1435 Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); | |
1436 | |
1437 //-- For all the contained syntenys | |
1438 for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ ) | |
1439 { | |
1440 //-- If no clusters, ignore | |
1441 if ( CurrSp->clusters.empty( ) ) | |
1442 continue; | |
1443 | |
1444 //-- If a B sequence not seen yet, read it in | |
1445 //-- IMPORTANT: The B sequences in the synteny object are assumed to be | |
1446 // ordered as output by mgaps, if they are not in order the program | |
1447 // will fail. (All like tags must be adjacent and in the same order | |
1448 // as the query file) | |
1449 if ( CurrSp == Syntenys.begin( ) || | |
1450 strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 ) | |
1451 { | |
1452 //-- Read in the B sequence | |
1453 while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) ) | |
1454 if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 ) | |
1455 break; | |
1456 if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 ) | |
1457 { | |
1458 fprintf(stderr,"%s\n", CurrSp->Bf.Id); | |
1459 parseAbort ( "Query File" ); | |
1460 } | |
1461 Bf.len = strlen (Bf.seq + 1); | |
1462 } | |
1463 | |
1464 //-- Extend clusters and create the alignment information | |
1465 CurrSp->Bf.len = Bf.len; | |
1466 extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile); | |
1467 } | |
1468 | |
1469 //-- Create the cluster information and clear the Synteny structure | |
1470 flushSyntenys (Syntenys, ClusterFile); | |
1471 | |
1472 free (Bf.Id); | |
1473 free (Bf.seq); | |
1474 | |
1475 return; | |
1476 } | |
1477 | |
1478 | |
1479 | |
1480 inline long int transC | |
1481 (long int Coord, int Frame, long int Len) | |
1482 | |
1483 // Translate an amino acid coordinate to a nucleotide coordinate | |
1484 // relative to the forward strand. The Len parameter should be the | |
1485 // length of the original DNA strand. | |
1486 | |
1487 { | |
1488 assert ( Frame > 0 && Frame < 7 ); | |
1489 if ( Frame > 3 ) | |
1490 return revC ( (Coord * 3) - (3 - (Frame - 3)), Len ); | |
1491 else | |
1492 return (Coord * 3) - (3 - (Frame)); | |
1493 } | |
1494 | |
1495 | |
1496 | |
1497 | |
1498 inline long int refLen | |
1499 (long int ntLen) | |
1500 | |
1501 // Return the length of the concatenated amino acid sequence frames, | |
1502 // including the appended 'X' at the end of each sequence frame for the | |
1503 // given DNA input length. (The returned length will be the length of | |
1504 // the concatenated frames for this sequence as output by prepro.cc) | |
1505 | |
1506 { | |
1507 return (ntLen + 1) << 1; | |
1508 } | |
1509 | |
1510 | |
1511 | |
1512 | |
1513 inline long int revC | |
1514 (long int Coord, long int Len) | |
1515 | |
1516 // Reverse complement the given coordinate for the given length. | |
1517 | |
1518 { | |
1519 assert (Len - Coord + 1 > 0); | |
1520 return (Len - Coord + 1); | |
1521 } | |
1522 | |
1523 | |
1524 | |
1525 | |
1526 void printHelp | |
1527 (const char * s) | |
1528 | |
1529 // Display the program's help information to stderr. | |
1530 | |
1531 { | |
1532 fprintf (stderr, | |
1533 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); | |
1534 fprintf (stderr, | |
1535 "-b int set the alignment break (give-up) length to int (amino acids)\n" | |
1536 "-d output only match clusters rather than extended alignments\n" | |
1537 "-e do not extend alignments outward from clusters\n" | |
1538 "-h display help information\n" | |
1539 "-t force alignment to ends of sequence if within -b distance\n" | |
1540 "-x type set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n" | |
1541 " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n"); | |
1542 fprintf (stderr, | |
1543 " Input is the output of the \"mgaps\" program from stdin, and\n" | |
1544 "the two original PROmer sequence files passed on the command\n" | |
1545 "line. <pfx> is the prefix to be added to the front of the\n" | |
1546 "output file <pfx>.delta\n" | |
1547 " <pfx>.delta is the alignment object that catalogs the distance\n" | |
1548 "between insertions and deletions. For further information\n" | |
1549 "regarding this file, please refer to the documentation under\n" | |
1550 "the .delta output description.\n\n"); | |
1551 return; | |
1552 } | |
1553 | |
1554 | |
1555 | |
1556 | |
1557 void printUsage | |
1558 (const char * s) | |
1559 | |
1560 // Display the program's usage information to stderr. | |
1561 | |
1562 { | |
1563 fprintf (stderr, | |
1564 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); | |
1565 fprintf (stderr, "Try '%s -h' for more information.\n", s); | |
1566 return; | |
1567 } | |
1568 | |
1569 | |
1570 | |
1571 | |
1572 void validateData | |
1573 (vector<Alignment> Alignments, vector<Cluster> Clusters, | |
1574 const FastaRecord * Af, const FastaRecord * Bf) | |
1575 | |
1576 // Self test function to check that the cluster and alignment information | |
1577 // is valid | |
1578 | |
1579 { | |
1580 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; | |
1581 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; | |
1582 | |
1583 long int Alen [7], Blen [7]; // the length of the amino acid sequences | |
1584 vector<Cluster>::iterator Cp; | |
1585 vector<Match>::iterator Mp; | |
1586 vector<Alignment>::iterator Ap; | |
1587 long int x, y, i; | |
1588 char Xc, Yc; | |
1589 | |
1590 for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ ) | |
1591 { | |
1592 assert ( Cp->wasFused ); | |
1593 | |
1594 //-- Initialize the right amino acid sequence for A and B if need be | |
1595 if ( A [Cp->frameA] == NULL ) | |
1596 { | |
1597 A [Cp->frameA] = (char *) Safe_malloc | |
1598 ( sizeof(char) * ( (Af->len / 3) + 2) ); | |
1599 A [Cp->frameA] [0] = '\0'; | |
1600 Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA], | |
1601 Cp->frameA ); | |
1602 } | |
1603 if ( B [Cp->frameB] == NULL ) | |
1604 { | |
1605 B [Cp->frameB] = (char *) Safe_malloc | |
1606 ( sizeof(char) * ( (Bf->len / 3) + 2) ); | |
1607 B [Cp->frameB] [0] = '\0'; | |
1608 Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB], | |
1609 Cp->frameB ); | |
1610 } | |
1611 | |
1612 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) | |
1613 { | |
1614 //-- assert for each match in cluster, it is indeed a match | |
1615 x = Mp->sA; | |
1616 y = Mp->sB; | |
1617 for ( i = 0; i < Mp->len; i ++ ) | |
1618 assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] ); | |
1619 | |
1620 //-- assert for each match in cluster, it is contained in an alignment | |
1621 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) | |
1622 { | |
1623 if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB && | |
1624 Ap->eA >= Mp->sA + Mp->len - 1 && | |
1625 Ap->eB >= Mp->sB + Mp->len - 1 ) | |
1626 break; | |
1627 } | |
1628 assert ( Ap < Alignments.end( ) ); | |
1629 } | |
1630 } | |
1631 | |
1632 //-- assert alignments are optimal (quick check if first and last chars equal) | |
1633 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) | |
1634 { | |
1635 assert ( Ap->sA <= Ap->eA ); | |
1636 assert ( Ap->sB <= Ap->eB ); | |
1637 | |
1638 assert ( Ap->sA >= 1 && Ap->sA <= Af->len ); | |
1639 assert ( Ap->eA >= 1 && Ap->eA <= Af->len ); | |
1640 assert ( Ap->sB >= 1 && Ap->sB <= Bf->len ); | |
1641 assert ( Ap->eB >= 1 && Ap->eB <= Bf->len ); | |
1642 | |
1643 Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ? | |
1644 A[Ap->frameA][Ap->sA] : STOP_CHAR); | |
1645 Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ? | |
1646 B[Ap->frameB][Ap->sB] : STOP_CHAR); | |
1647 assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] ); | |
1648 | |
1649 Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ? | |
1650 A[Ap->frameA][Ap->eA] : STOP_CHAR); | |
1651 Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ? | |
1652 B[Ap->frameB][Ap->eB] : STOP_CHAR); | |
1653 assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] ); | |
1654 } | |
1655 | |
1656 for ( i = 0; i < 7; i ++ ) | |
1657 { | |
1658 if ( A [i] != NULL ) | |
1659 free ( A [i] ); | |
1660 if ( B [i] != NULL ) | |
1661 free ( B [i] ); | |
1662 } | |
1663 | |
1664 return; | |
1665 } |