Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postnuc.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 //------------------------------------------------------------------------------ | |
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research | |
3 // File: postnuc.cc | |
4 // Date: 07 / 16 / 2002 | |
5 // | |
6 // Revision: 08 / 01 / 2002 | |
7 // Added MUM extension functionality | |
8 // | |
9 // Purpose: To translate the coordinates referencing the concatenated | |
10 // reference sequences back to the individual sequences and deal | |
11 // with any conflict that may have arisen (i.e. a MUM that spans | |
12 // the boundry between two sequences). Then to extend each cluster | |
13 // via Smith-Waterman techniques to expand the alignment coverage. | |
14 // Alignments which encounter each other will be fused to form one | |
15 // encompasing alignment when appropriate. | |
16 // | |
17 // Input: Input is the output of the .mgaps program from stdin. On the | |
18 // command line, the file names of the two original sequence files | |
19 // should come first, followed by the prefix <pfx> that should be | |
20 // placed in front of the two output filenames <pfx>.cluster and | |
21 // <pfx>.delta | |
22 // | |
23 // NOTE: Cluster file is now suppressed by default (see -d option). | |
24 // | |
25 // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta. | |
26 // <pfx>.cluster lists MUM clusters as identified by "mgaps". | |
27 // However, the clusters now reference their corresponding | |
28 // sequence and are all listed under headers that specify this | |
29 // sequence. The <pfx>.delta file is the alignment object that | |
30 // contains all the information necessary to reproduce the alignments | |
31 // generated by the MUM extension process. Please refer to the | |
32 // output file documentation for an in-depth description of these | |
33 // file formats. | |
34 // | |
35 // Usage: postnuc <reference> <query> <pfx> < <input> | |
36 // Where <reference> and <query> are the original input sequences of | |
37 // NUCmer and <pfx> is the prefix that should be added to the | |
38 // beginning of the <pfx>.cluster and <pfx>.delta output filenames. | |
39 // | |
40 //------------------------------------------------------------------------------ | |
41 | |
42 //-- NOTE: this option will significantly hamper program performance, | |
43 // mostly the alignment extension performance (sw_align.h) | |
44 //#define _DEBUG_ASSERT // self testing assert functions | |
45 | |
46 #include "tigrinc.hh" | |
47 #include "sw_align.hh" | |
48 #include <vector> | |
49 #include <algorithm> | |
50 using namespace std; | |
51 | |
52 | |
53 //------------------------------------------------------ Globals -------------// | |
54 const signed char FORWARD_CHAR = 1; | |
55 const signed char REVERSE_CHAR = -1; | |
56 | |
57 | |
58 bool DO_DELTA = true; | |
59 bool DO_EXTEND = true; | |
60 bool TO_SEQEND = false; | |
61 bool DO_SHADOWS = false; | |
62 | |
63 | |
64 //------------------------------------------------------ Type Definitions ----// | |
65 enum LineType | |
66 //-- The type of input line from <stdin> | |
67 { | |
68 NO_LINE, HEADER_LINE, MATCH_LINE | |
69 }; | |
70 | |
71 | |
72 struct FastaRecord | |
73 //-- The essential data of a sequence | |
74 { | |
75 char * Id; // the fasta ID header tag | |
76 long int len; // the length of the sequence | |
77 char * seq; // the sequence data | |
78 }; | |
79 | |
80 | |
81 struct Match | |
82 //-- An exact match between two sequences A and B | |
83 { | |
84 long int sA, sB, len; // start coordinate in A, in B and the length | |
85 }; | |
86 | |
87 | |
88 struct Cluster | |
89 //-- An ordered list of matches between two sequences A and B | |
90 { | |
91 bool wasFused; // have the cluster matches been fused yet? | |
92 signed char dirB; // the query sequence direction | |
93 // FORWARD_CHAR or REVERSE_CHAR | |
94 vector<Match> matches; // the ordered set of matches in the cluster | |
95 }; | |
96 | |
97 | |
98 struct Synteny | |
99 //-- An ordered list of clusters between two sequences A and B | |
100 { | |
101 FastaRecord * AfP; // a pointer to the reference sequence record | |
102 FastaRecord Bf; // the query sequence record (w/o the sequence) | |
103 vector<Cluster> clusters; // the ordered set of clusters between A and B | |
104 }; | |
105 | |
106 | |
107 struct Alignment | |
108 //-- An alignment object between two sequences A and B | |
109 { | |
110 signed char dirB; // the query sequence direction | |
111 long int sA, sB, eA, eB; // the start in A, B and the end in A, B | |
112 vector<long int> delta; // the delta values, with NO zero at the end | |
113 long int deltaApos; // sum of abs(deltas) - #of negative deltas | |
114 // trust me, it is a very helpful value | |
115 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas | |
116 }; | |
117 | |
118 | |
119 struct AscendingClusterSort | |
120 //-- For sorting clusters in ascending order of their sA coordinate | |
121 { | |
122 bool operator() (const Cluster & pA, const Cluster & pB) | |
123 { | |
124 return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA ); | |
125 } | |
126 }; | |
127 | |
128 | |
129 | |
130 | |
131 //------------------------------------------------- Function Declarations ----// | |
132 void addNewAlignment | |
133 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, | |
134 vector<Match>::iterator Mp); | |
135 | |
136 bool extendBackward | |
137 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, | |
138 vector<Alignment>::iterator TargetAp, const char * A, const char * B); | |
139 | |
140 bool extendForward | |
141 (vector<Alignment>::iterator Ap, const char * A, long int targetA, | |
142 const char * B, long int targetB, unsigned int m_o); | |
143 | |
144 void extendClusters | |
145 (vector<Cluster> & Clusters, | |
146 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile); | |
147 | |
148 void flushAlignments | |
149 (vector<Alignment> & Alignments, | |
150 const FastaRecord * Af, const FastaRecord * Bf, | |
151 FILE * DeltaFile); | |
152 | |
153 void flushSyntenys | |
154 (vector<Synteny> & Syntenys, FILE * ClusterFile); | |
155 | |
156 vector<Cluster>::iterator getForwardTargetCluster | |
157 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, | |
158 long int & targetA, long int & targetB); | |
159 | |
160 vector<Alignment>::iterator getReverseTargetAlignment | |
161 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp); | |
162 | |
163 bool isShadowedCluster | |
164 (vector<Cluster>::iterator CurrCp, | |
165 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap); | |
166 | |
167 void parseAbort | |
168 (const char *); | |
169 | |
170 void parseDelta | |
171 (vector<Alignment> & Alignments, | |
172 const FastaRecord * Af, const FastaRecord *Bf); | |
173 | |
174 void processSyntenys | |
175 (vector<Synteny> & Syntenys, | |
176 FastaRecord * Af, long int As, | |
177 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile); | |
178 | |
179 inline long int revC | |
180 (long int Coord, long int Len); | |
181 | |
182 void printHelp | |
183 (const char *); | |
184 | |
185 void printUsage | |
186 (const char *); | |
187 | |
188 void validateData | |
189 (vector<Alignment> Alignments, vector<Cluster> Clusters, | |
190 const FastaRecord * Af, const FastaRecord * Bf); | |
191 | |
192 | |
193 | |
194 | |
195 //-------------------------------------------------- Function Definitions ----// | |
196 int main | |
197 (int argc, char ** argv) | |
198 { | |
199 FastaRecord * Af; // array of all the reference sequences | |
200 | |
201 vector<Synteny> Syntenys; // vector of all sets of clusters | |
202 vector<Synteny>::reverse_iterator CurrSp; // current set of clusters | |
203 vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer | |
204 | |
205 Synteny Asyn; // a single set of clusters | |
206 Cluster Aclu; // a single cluster of matches | |
207 Match Amat; // a single match | |
208 | |
209 LineType PrevLine; // the current input line | |
210 | |
211 bool Found; // temporary search flag | |
212 | |
213 char Line [MAX_LINE]; // a single line of input | |
214 char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers | |
215 char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names | |
216 char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; | |
217 | |
218 signed char DirB = FORWARD_CHAR; // the current query strand direction | |
219 | |
220 long int i; // temporary counter | |
221 long int Seqi; // current reference sequence index | |
222 long int InitSize; // initial sequence memory space | |
223 long int As = 0; // the size of the reference array | |
224 long int Ac = 100; // the capacity of the reference array | |
225 long int sA, sB, len; // current match start in A, B and length | |
226 | |
227 FILE * RefFile, * QryFile; // reference and query input files | |
228 FILE * ClusterFile, * DeltaFile; // cluster and delta output files | |
229 | |
230 //-- Set the alignment data type and break length (sw_align.h) | |
231 setMatrixType ( NUCLEOTIDE ); | |
232 setBreakLen ( 200 ); | |
233 setBanding ( 0 ); | |
234 | |
235 //-- Parse the command line arguments | |
236 { | |
237 optarg = NULL; | |
238 int ch, errflg = 0; | |
239 while ( !errflg && ((ch = getopt (argc, argv, "dehB:b:st")) != EOF) ) | |
240 switch (ch) | |
241 { | |
242 case 'b' : | |
243 setBreakLen( atoi (optarg) ); | |
244 break; | |
245 | |
246 case 'B' : | |
247 setBanding( atoi (optarg) ); | |
248 break; | |
249 | |
250 case 'd' : | |
251 DO_DELTA = false; | |
252 break; | |
253 | |
254 case 'e' : | |
255 DO_EXTEND = false; | |
256 break; | |
257 | |
258 case 'h' : | |
259 printHelp (argv[0]); | |
260 exit (EXIT_SUCCESS); | |
261 break; | |
262 | |
263 case 's' : | |
264 DO_SHADOWS = true; | |
265 break; | |
266 | |
267 case 't' : | |
268 TO_SEQEND = true; | |
269 break; | |
270 | |
271 default : | |
272 errflg ++; | |
273 } | |
274 | |
275 if ( errflg > 0 || argc - optind != 3 ) | |
276 { | |
277 printUsage (argv[0]); | |
278 exit (EXIT_FAILURE); | |
279 } | |
280 } | |
281 | |
282 //-- Read and create the I/O file names | |
283 strcpy (RefFileName, argv[optind ++]); | |
284 strcpy (QryFileName, argv[optind ++]); | |
285 strcpy (ClusterFileName, argv[optind ++]); | |
286 strcpy (DeltaFileName, ClusterFileName); | |
287 strcat (ClusterFileName, ".cluster"); | |
288 strcat (DeltaFileName, ".delta"); | |
289 | |
290 //-- Open all the files | |
291 RefFile = File_Open (RefFileName, "r"); | |
292 QryFile = File_Open (QryFileName, "r"); | |
293 if ( DO_DELTA ) | |
294 { | |
295 ClusterFile = NULL; | |
296 DeltaFile = File_Open (DeltaFileName, "w"); | |
297 } | |
298 else | |
299 { | |
300 ClusterFile = File_Open (ClusterFileName, "w"); | |
301 DeltaFile = NULL; | |
302 } | |
303 | |
304 //-- Print the headers of the output files | |
305 if ( DO_DELTA ) | |
306 fprintf (DeltaFile, "%s %s\nNUCMER\n", RefFileName, QryFileName); | |
307 else | |
308 fprintf (ClusterFile, "%s %s\nNUCMER\n", RefFileName, QryFileName); | |
309 | |
310 //-- Generate the array of the reference sequences | |
311 InitSize = INIT_SIZE; | |
312 Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac ); | |
313 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); | |
314 while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) ) | |
315 { | |
316 Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1)); | |
317 strcpy (Af[As].Id, IdA); | |
318 | |
319 Af[As].len = strlen (Af[As].seq + 1); | |
320 | |
321 if ( ++ As >= Ac ) | |
322 { | |
323 Ac *= 2; | |
324 Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac ); | |
325 } | |
326 | |
327 InitSize = INIT_SIZE; | |
328 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); | |
329 } | |
330 fclose (RefFile); | |
331 if ( As <= 0 ) | |
332 parseAbort ( RefFileName ); | |
333 | |
334 | |
335 | |
336 //-- Process the input from <stdin> line by line | |
337 PrevLine = NO_LINE; | |
338 IdA[0] = '\0'; | |
339 IdB[0] = '\0'; | |
340 CurrIdB[0] = '\0'; | |
341 while ( fgets(Line, MAX_LINE, stdin) != NULL ) | |
342 { | |
343 | |
344 //-- If the current line is a fasta HEADER_LINE | |
345 if ( Line[0] == '>' ) | |
346 { | |
347 if ( sscanf (Line + 1, "%s", CurrIdB) != 1 ) | |
348 parseAbort ("stdin"); | |
349 DirB = strstr (Line," Reverse") == NULL ? FORWARD_CHAR : REVERSE_CHAR; | |
350 | |
351 PrevLine = HEADER_LINE; | |
352 } | |
353 | |
354 | |
355 //-- If the current line is a cluster HEADER_LINE | |
356 else if ( Line[0] == '#' ) | |
357 { | |
358 PrevLine = HEADER_LINE; | |
359 } | |
360 | |
361 | |
362 //-- If the current line is a MATCH_LINE | |
363 else | |
364 { | |
365 if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 ) | |
366 parseAbort ("stdin"); | |
367 | |
368 //-- Re-map the reference coordinate back to its original sequence | |
369 for ( Seqi = 0; sA > Af[Seqi].len; Seqi ++ ) | |
370 sA -= Af[Seqi].len + 1; // extra +1 for the x at the end of each seq | |
371 if ( Seqi >= As ) | |
372 { | |
373 fprintf (stderr, | |
374 "ERROR: A MUM was found with a start coordinate greater than\n" | |
375 " the sequence length, a serious error has occured.\n" | |
376 " Please file a bug report\n"); | |
377 exit (EXIT_FAILURE); | |
378 } | |
379 | |
380 //-- If the match spans across a sequence boundry | |
381 if ( sA + len - 1 > Af[Seqi].len || sA <= 0) | |
382 { | |
383 fprintf (stderr, | |
384 "WARNING: A MUM was found extending beyond the boundry of:\n" | |
385 " Reference sequence '>%s'\n\n" | |
386 "Please check that the '-n' option is activated on 'mummer2'\n" | |
387 "and try again, or file a bug report\n" | |
388 "Attempting to continue.\n", Af[Seqi].Id); | |
389 continue; | |
390 } | |
391 | |
392 //-- Check and update the current synteny region | |
393 if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 ) | |
394 { | |
395 Found = false; | |
396 if ( strcmp (IdB, CurrIdB) == 0 ) | |
397 { | |
398 //-- Has this header been seen before? | |
399 for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ ) | |
400 { | |
401 if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 ) | |
402 { | |
403 if ( Sp->AfP->len != Af[Seqi].len ) | |
404 { | |
405 fprintf (stderr, | |
406 "ERROR: The reference file may contain" | |
407 " sequences with non-unique\n" | |
408 " header Ids, please check your input" | |
409 " files and try again\n"); | |
410 exit (EXIT_FAILURE); | |
411 } | |
412 assert ( strcmp (Sp->Bf.Id, IdB) == 0 ); | |
413 CurrSp = Sp; | |
414 Found = true; | |
415 break; | |
416 } | |
417 } | |
418 } | |
419 else | |
420 { | |
421 //-- New B sequence header, process all the old synteny's | |
422 processSyntenys (Syntenys, Af, As, | |
423 QryFile, ClusterFile, DeltaFile); | |
424 } | |
425 | |
426 strcpy (IdA, Af[Seqi].Id); | |
427 strcpy (IdB, CurrIdB); | |
428 | |
429 //-- If not seen yet, create a new synteny region | |
430 if ( ! Found ) | |
431 { | |
432 Asyn.AfP = Af + Seqi; | |
433 Asyn.Bf.len = -1; | |
434 Asyn.Bf.Id = (char *) Safe_malloc | |
435 (sizeof(char) * (strlen(IdB) + 1)); | |
436 strcpy (Asyn.Bf.Id, IdB); | |
437 | |
438 Syntenys.push_back (Asyn); | |
439 CurrSp = Syntenys.rbegin( ); | |
440 } | |
441 | |
442 //-- Add a new cluster to the current synteny | |
443 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) | |
444 if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) | |
445 CurrSp->clusters.pop_back( ); // hack to remove empties | |
446 Aclu.wasFused = false; | |
447 Aclu.dirB = DirB; | |
448 CurrSp->clusters.push_back (Aclu); | |
449 } | |
450 else if ( PrevLine == HEADER_LINE ) | |
451 { | |
452 //-- Add a new cluster to the current synteny | |
453 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) | |
454 if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) | |
455 CurrSp->clusters.pop_back( ); | |
456 Aclu.wasFused = false; | |
457 Aclu.dirB = DirB; | |
458 CurrSp->clusters.push_back (Aclu); | |
459 } | |
460 | |
461 //-- Add a new match to the current cluster | |
462 if ( len > 1 ) | |
463 { | |
464 Amat.sA = sA; | |
465 Amat.sB = sB; | |
466 Amat.len = len; | |
467 CurrSp->clusters.rbegin( )->matches.push_back (Amat); | |
468 } | |
469 | |
470 PrevLine = MATCH_LINE; | |
471 } | |
472 } | |
473 | |
474 //-- Process the left-over syntenys | |
475 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) | |
476 if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) | |
477 CurrSp->clusters.pop_back( ); | |
478 | |
479 processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); | |
480 fclose (QryFile); | |
481 | |
482 //-- Free the reference sequences | |
483 for ( i = 0; i < As; i ++ ) | |
484 { | |
485 free (Af[i].Id); | |
486 free (Af[i].seq); | |
487 } | |
488 free (Af); | |
489 | |
490 return EXIT_SUCCESS; | |
491 } | |
492 | |
493 | |
494 | |
495 | |
496 void addNewAlignment | |
497 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp, | |
498 vector<Match>::iterator Mp) | |
499 | |
500 // Create and add a new alignment object based on the current match | |
501 // and cluster information pointed to by Cp and Mp. | |
502 | |
503 { | |
504 Alignment Align; | |
505 | |
506 //-- Initialize the data | |
507 Align.sA = Mp->sA; | |
508 Align.sB = Mp->sB; | |
509 Align.eA = Mp->sA + Mp->len - 1; | |
510 Align.eB = Mp->sB + Mp->len - 1; | |
511 Align.dirB = Cp->dirB; | |
512 Align.delta.clear( ); | |
513 Align.deltaApos = 0; | |
514 | |
515 //-- Add the alignment object | |
516 Alignments.push_back (Align); | |
517 | |
518 return; | |
519 } | |
520 | |
521 | |
522 | |
523 | |
524 bool extendBackward | |
525 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp, | |
526 vector<Alignment>::iterator TargetAp, const char * A, const char * B) | |
527 | |
528 // Extend an alignment backwards off of the current alignment object. | |
529 // The current alignment object must be freshly created and consist | |
530 // only of an exact match (i.e. the delta vector MUST be empty). | |
531 // If the TargetAp alignment object is reached by the extension, it will | |
532 // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is | |
533 // NULL the function will extend as far as possible. It is a strange | |
534 // and dangerous function because it can delete CurrAp, so edit with | |
535 // caution. Returns true if TargetAp was reached and merged, else false | |
536 // Designed only as a subroutine for extendClusters, should be used | |
537 // nowhere else. | |
538 | |
539 { | |
540 bool target_reached = false; | |
541 bool overflow_flag = false; | |
542 bool double_flag = false; | |
543 | |
544 vector<long int>::iterator Dp; | |
545 | |
546 unsigned int m_o; | |
547 long int targetA, targetB; | |
548 | |
549 m_o = BACKWARD_SEARCH; | |
550 | |
551 //-- Set the target coordinates | |
552 if ( TargetAp != Alignments.end( ) ) | |
553 { | |
554 targetA = TargetAp->eA; | |
555 targetB = TargetAp->eB; | |
556 } | |
557 else | |
558 { | |
559 targetA = 1; | |
560 targetB = 1; | |
561 m_o |= OPTIMAL_BIT; | |
562 } | |
563 | |
564 //-- If alignment is too long, bring with bounds and set overflow_flag true | |
565 if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH ) | |
566 { | |
567 targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1; | |
568 overflow_flag = true; | |
569 m_o |= OPTIMAL_BIT; | |
570 } | |
571 if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH ) | |
572 { | |
573 targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1; | |
574 if ( overflow_flag ) | |
575 double_flag = true; | |
576 else | |
577 overflow_flag = true; | |
578 m_o |= OPTIMAL_BIT; | |
579 } | |
580 | |
581 | |
582 if ( TO_SEQEND && !double_flag ) | |
583 m_o |= SEQEND_BIT; | |
584 | |
585 | |
586 //-- Attempt to reach the target | |
587 target_reached = alignSearch (A, CurrAp->sA, targetA, | |
588 B, CurrAp->sB, targetB, m_o); | |
589 | |
590 if ( overflow_flag || TargetAp == Alignments.end( ) ) | |
591 target_reached = false; | |
592 | |
593 if ( target_reached ) | |
594 { | |
595 //-- Merge the two alignment objects | |
596 extendForward (TargetAp, A, CurrAp->sA, | |
597 B, CurrAp->sB, FORCED_FORWARD_ALIGN); | |
598 TargetAp->eA = CurrAp->eA; | |
599 TargetAp->eB = CurrAp->eB; | |
600 Alignments.pop_back( ); | |
601 } | |
602 else | |
603 { | |
604 alignTarget (A, targetA, CurrAp->sA, | |
605 B, targetB, CurrAp->sB, | |
606 CurrAp->delta, FORCED_FORWARD_ALIGN); | |
607 CurrAp->sA = targetA; | |
608 CurrAp->sB = targetB; | |
609 | |
610 //-- Update the deltaApos value for the alignment object | |
611 for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ ) | |
612 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; | |
613 } | |
614 | |
615 return target_reached; | |
616 } | |
617 | |
618 | |
619 | |
620 | |
621 bool extendForward | |
622 (vector<Alignment>::iterator CurrAp, const char * A, long int targetA, | |
623 const char * B, long int targetB, unsigned int m_o) | |
624 | |
625 // Extend an alignment forwards off the current alignment object until | |
626 // target or end of sequence is reached, and merge the delta values of the | |
627 // alignment object with the new delta values generated by the extension. | |
628 // Return true if the target was reached, else false | |
629 | |
630 { | |
631 long int ValA; | |
632 long int ValB; | |
633 unsigned int Di; | |
634 bool target_reached; | |
635 bool overflow_flag = false; | |
636 bool double_flag = false; | |
637 vector<long int>::iterator Dp; | |
638 | |
639 //-- Set Di to the end of the delta vector | |
640 Di = CurrAp->delta.size( ); | |
641 | |
642 //-- Calculate the distance between the start and end positions | |
643 ValA = targetA - CurrAp->eA + 1; | |
644 ValB = targetB - CurrAp->eB + 1; | |
645 | |
646 //-- If the distance is too long, shrink it and set the overflow_flag | |
647 if ( ValA > MAX_ALIGNMENT_LENGTH ) | |
648 { | |
649 targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1; | |
650 overflow_flag = true; | |
651 m_o |= OPTIMAL_BIT; | |
652 } | |
653 if ( ValB > MAX_ALIGNMENT_LENGTH ) | |
654 { | |
655 targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1; | |
656 if ( overflow_flag ) | |
657 double_flag = true; | |
658 else | |
659 overflow_flag = true; | |
660 m_o |= OPTIMAL_BIT; | |
661 } | |
662 | |
663 if ( double_flag ) | |
664 m_o &= ~SEQEND_BIT; | |
665 | |
666 target_reached = alignTarget (A, CurrAp->eA, targetA, | |
667 B, CurrAp->eB, targetB, | |
668 CurrAp->delta, m_o); | |
669 | |
670 //-- Notify user if alignment was chopped short | |
671 if ( target_reached && overflow_flag ) | |
672 target_reached = false; | |
673 | |
674 //-- Pick up delta where left off (Di) and merge with new deltas | |
675 if ( Di < CurrAp->delta.size( ) ) | |
676 { | |
677 //-- Merge the deltas | |
678 ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1; | |
679 CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA); | |
680 if ( CurrAp->delta[Di] == 0 || ValA < 0 ) | |
681 { | |
682 fprintf(stderr, | |
683 "ERROR: failed to merge alignments at position %ld\n" | |
684 " Please file a bug report\n", | |
685 CurrAp->eA); | |
686 exit (EXIT_FAILURE); | |
687 } | |
688 | |
689 //-- Update the deltaApos | |
690 for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ ) | |
691 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; | |
692 } | |
693 | |
694 //-- Set the alignment coordinates | |
695 CurrAp->eA = targetA; | |
696 CurrAp->eB = targetB; | |
697 | |
698 return target_reached; | |
699 } | |
700 | |
701 | |
702 | |
703 | |
704 void extendClusters | |
705 (vector<Cluster> & Clusters, | |
706 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) | |
707 | |
708 // Connect all the matches in every cluster between sequences A and B. | |
709 // Also, extend alignments off of the front and back of each cluster to | |
710 // expand total alignment coverage. When these extensions encounter an | |
711 // adjacent cluster, fuse the two regions to create one single | |
712 // encompassing region. This routine will create alignment objects from | |
713 // these extensions and output the resulting delta information to the | |
714 // delta output file. | |
715 | |
716 { | |
717 //-- Sort the clusters (ascending) by their start coordinate in sequence A | |
718 sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( )); | |
719 | |
720 | |
721 //-- If no delta file is requested | |
722 if ( ! DO_DELTA ) | |
723 return; | |
724 | |
725 | |
726 bool target_reached = false; // reached the adjacent match or cluster | |
727 | |
728 char * A, * B; // the sequences A and B | |
729 char * Brev = NULL; // the reverse complement of B | |
730 | |
731 unsigned int m_o; | |
732 long int targetA, targetB; // alignment extension targets in A and B | |
733 | |
734 vector<Match>::iterator Mp; // match pointer | |
735 | |
736 vector<Cluster>::iterator PrevCp; // where the extensions last left off | |
737 vector<Cluster>::iterator CurrCp; // the current cluster being extended | |
738 vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster | |
739 | |
740 vector<Alignment> Alignments; // the vector of alignment objects | |
741 vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align | |
742 vector<Alignment>::iterator TargetAp; // target align | |
743 | |
744 | |
745 //-- Extend each cluster | |
746 A = Af->seq; | |
747 PrevCp = Clusters.begin( ); | |
748 CurrCp = Clusters.begin( ); | |
749 while ( CurrCp < Clusters.end( ) ) | |
750 { | |
751 if ( DO_EXTEND ) | |
752 { | |
753 //-- Ignore if shadowed or already extended | |
754 if ( ! target_reached ) | |
755 if ( CurrCp->wasFused | |
756 || | |
757 (!DO_SHADOWS && | |
758 isShadowedCluster (CurrCp, Alignments, CurrAp)) ) | |
759 { | |
760 CurrCp->wasFused = true; | |
761 CurrCp = ++ PrevCp; | |
762 continue; | |
763 } | |
764 } | |
765 | |
766 //-- Pick the right directional sequence for B | |
767 if ( CurrCp->dirB == FORWARD_CHAR ) | |
768 B = Bf->seq; | |
769 else if ( Brev != NULL ) | |
770 B = Brev; | |
771 else | |
772 { | |
773 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); | |
774 strcpy ( Brev + 1, Bf->seq + 1 ); | |
775 Brev[0] = '\0'; | |
776 Reverse_Complement (Brev, 1, Bf->len); | |
777 B = Brev; | |
778 } | |
779 | |
780 //-- Extend each match in the cluster | |
781 for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ ) | |
782 { | |
783 //-- Try to extend the current match backwards | |
784 if ( target_reached ) | |
785 { | |
786 //-- Merge with the previous match | |
787 if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB ) | |
788 { | |
789 if ( Mp >= CurrCp->matches.end( ) - 1 ) | |
790 { | |
791 fprintf (stderr, | |
792 "ERROR: Target match does not exist, please\n" | |
793 " file a bug report\n"); | |
794 exit (EXIT_FAILURE); | |
795 } | |
796 continue; | |
797 } | |
798 CurrAp->eA += Mp->len - 1; | |
799 CurrAp->eB += Mp->len - 1; | |
800 } | |
801 else | |
802 { | |
803 //-- Create a new alignment object | |
804 addNewAlignment (Alignments, CurrCp, Mp); | |
805 CurrAp = Alignments.end( ) - 1; | |
806 | |
807 if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) ) | |
808 { | |
809 //-- Target the closest/best alignment object | |
810 TargetAp = getReverseTargetAlignment (Alignments, CurrAp); | |
811 | |
812 //-- Extend the new alignment object backwards | |
813 if ( extendBackward (Alignments, CurrAp, TargetAp, A, B) ) | |
814 CurrAp = TargetAp; | |
815 } | |
816 } | |
817 | |
818 m_o = FORWARD_ALIGN; | |
819 | |
820 //-- Try to extend the current match forwards | |
821 if ( Mp < CurrCp->matches.end( ) - 1 ) | |
822 { | |
823 //-- Target the next match in the cluster | |
824 targetA = (Mp + 1)->sA; | |
825 targetB = (Mp + 1)->sB; | |
826 | |
827 //-- Extend the current alignment object forward | |
828 target_reached = extendForward (CurrAp, A, targetA, | |
829 B, targetB, m_o); | |
830 } | |
831 else if ( DO_EXTEND ) | |
832 { | |
833 targetA = Af->len; | |
834 targetB = Bf->len; | |
835 | |
836 //-- Target the closest/best match in a future cluster | |
837 TargetCp = getForwardTargetCluster (Clusters, CurrCp, | |
838 targetA, targetB); | |
839 if ( TargetCp == Clusters.end( ) ) | |
840 { | |
841 m_o |= OPTIMAL_BIT; | |
842 if ( TO_SEQEND ) | |
843 m_o |= SEQEND_BIT; | |
844 } | |
845 | |
846 //-- Extend the current alignment object forward | |
847 target_reached = extendForward (CurrAp, A, targetA, | |
848 B, targetB, m_o); | |
849 } | |
850 } | |
851 if ( TargetCp == Clusters.end( ) ) | |
852 target_reached = false; | |
853 | |
854 CurrCp->wasFused = true; | |
855 | |
856 if ( target_reached == false ) | |
857 CurrCp = ++ PrevCp; | |
858 else | |
859 CurrCp = TargetCp; | |
860 } | |
861 | |
862 #ifdef _DEBUG_ASSERT | |
863 validateData (Alignments, Clusters, Af, Bf); | |
864 #endif | |
865 | |
866 //-- Output the alignment data to the delta file | |
867 flushAlignments (Alignments, Af, Bf, DeltaFile); | |
868 | |
869 if ( Brev != NULL ) | |
870 free (Brev); | |
871 | |
872 return; | |
873 } | |
874 | |
875 | |
876 | |
877 | |
878 void flushAlignments | |
879 (vector<Alignment> & Alignments, | |
880 const FastaRecord * Af, const FastaRecord * Bf, | |
881 FILE * DeltaFile) | |
882 | |
883 // Simply output the delta information stored in Alignments to the | |
884 // given delta file. Free the memory used by Alignments once the | |
885 // data is successfully output to the file. | |
886 | |
887 { | |
888 vector<Alignment>::iterator Ap; // alignment pointer | |
889 vector<long int>::iterator Dp; // delta pointer | |
890 | |
891 fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len); | |
892 | |
893 //-- Generate the error counts | |
894 parseDelta (Alignments, Af, Bf); | |
895 | |
896 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) | |
897 { | |
898 if ( Ap->dirB == FORWARD_CHAR ) | |
899 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", | |
900 Ap->sA, Ap->eA, | |
901 Ap->sB, Ap->eB, | |
902 Ap->Errors, Ap->SimErrors, Ap->NonAlphas); | |
903 else | |
904 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", | |
905 Ap->sA, Ap->eA, | |
906 revC (Ap->sB, Bf->len), revC (Ap->eB, Bf->len), | |
907 Ap->Errors, Ap->SimErrors, Ap->NonAlphas); | |
908 | |
909 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) | |
910 fprintf (DeltaFile, "%ld\n", *Dp); | |
911 fprintf (DeltaFile, "0\n"); | |
912 | |
913 Ap->delta.clear( ); | |
914 } | |
915 | |
916 Alignments.clear( ); | |
917 | |
918 return; | |
919 } | |
920 | |
921 | |
922 | |
923 | |
924 void flushSyntenys | |
925 (vector<Synteny> & Syntenys, FILE * ClusterFile) | |
926 | |
927 // Simply output the synteny/cluster information generated by the mgaps | |
928 // program. However, now the coordinates reference their appropriate | |
929 // reference sequence, and the reference sequecne header is added to | |
930 // the appropriate lines. Free the memory used by Syntenys once the | |
931 // data is successfully output to the file. | |
932 | |
933 { | |
934 vector<Synteny>::iterator Sp; // synteny pointer | |
935 vector<Cluster>::iterator Cp; // cluster pointer | |
936 vector<Match>::iterator Mp; // match pointer | |
937 | |
938 if ( ClusterFile ) | |
939 { | |
940 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) | |
941 { | |
942 fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id, | |
943 Sp->AfP->len, Sp->Bf.len); | |
944 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) | |
945 { | |
946 fprintf (ClusterFile, "%2d %2d\n", FORWARD_CHAR, Cp->dirB); | |
947 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) | |
948 { | |
949 if ( Cp->dirB == FORWARD_CHAR ) | |
950 fprintf (ClusterFile, "%8ld %8ld %6ld", | |
951 Mp->sA, Mp->sB, Mp->len); | |
952 else | |
953 fprintf (ClusterFile, "%8ld %8ld %6ld", | |
954 Mp->sA, revC (Mp->sB, Sp->Bf.len), Mp->len); | |
955 if ( Mp != Cp->matches.begin( ) ) | |
956 fprintf (ClusterFile, "%6ld %6ld\n", | |
957 Mp->sA - (Mp - 1)->sA - (Mp - 1)->len, | |
958 Mp->sB - (Mp - 1)->sB - (Mp - 1)->len); | |
959 else | |
960 fprintf (ClusterFile, "%6s %6s\n", "-", "-"); | |
961 } | |
962 Cp->matches.clear( ); | |
963 } | |
964 Sp->clusters.clear( ); | |
965 } | |
966 } | |
967 else | |
968 { | |
969 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) | |
970 { | |
971 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) | |
972 { | |
973 Cp->matches.clear( ); | |
974 } | |
975 Sp->clusters.clear( ); | |
976 } | |
977 } | |
978 | |
979 Syntenys.clear( ); | |
980 | |
981 return; | |
982 } | |
983 | |
984 | |
985 | |
986 | |
987 vector<Cluster>::iterator getForwardTargetCluster | |
988 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp, | |
989 long int & targetA, long int & targetB) | |
990 | |
991 // Return the cluster that is most likely to successfully join (in a | |
992 // forward direction) with the current cluster. The returned cluster | |
993 // must contain 1 or more matches that are strictly greater than the end | |
994 // of the current cluster. The targeted cluster must also be on a | |
995 // diagonal close enough to the current cluster, so that a connection | |
996 // could possibly be made by the alignment extender. Assumes clusters | |
997 // have been sorted via AscendingClusterSort. Returns targeted cluster | |
998 // and stores the target coordinates in targetA and targetB. If no | |
999 // suitable cluster was found, the function will return NULL and target | |
1000 // A and targetB will remain unchanged. | |
1001 | |
1002 { | |
1003 vector<Match>::iterator Mip; // match iteratrive pointer | |
1004 vector<Cluster>::iterator Cp; // cluster pointer | |
1005 vector<Cluster>::iterator Cip; // cluster iterative pointer | |
1006 long int eA, eB; // possible target | |
1007 long int greater, lesser; // gap sizes between two clusters | |
1008 long int sA = CurrCp->matches.rbegin( )->sA + | |
1009 CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster | |
1010 long int sB = CurrCp->matches.rbegin( )->sB + | |
1011 CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster | |
1012 | |
1013 //-- End of sequences is the default target, set distance accordingly | |
1014 long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB); | |
1015 | |
1016 //-- For all clusters greater than the current cluster (on sequence A) | |
1017 Cp = Clusters.end( ); | |
1018 for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ ) | |
1019 { | |
1020 //-- If the cluster is on the same direction | |
1021 if ( CurrCp->dirB == Cip->dirB ) | |
1022 { | |
1023 eA = Cip->matches.begin( )->sA; | |
1024 eB = Cip->matches.begin( )->sB; | |
1025 | |
1026 //-- If the cluster overlaps the current cluster, strip some matches | |
1027 if ( ( eA < sA || eB < sB ) && | |
1028 Cip->matches.rbegin( )->sA >= sA && | |
1029 Cip->matches.rbegin( )->sB >= sB ) | |
1030 { | |
1031 for ( Mip = Cip->matches.begin( ); | |
1032 Mip < Cip->matches.end( ) && ( eA < sA || eB < sB ); | |
1033 Mip ++ ) | |
1034 { | |
1035 eA = Mip->sA; | |
1036 eB = Mip->sB; | |
1037 } | |
1038 } | |
1039 | |
1040 //-- If the cluster is strictly greater than current cluster | |
1041 if ( eA >= sA && eB >= sB ) | |
1042 { | |
1043 if ( eA - sA > eB - sB ) | |
1044 { | |
1045 greater = eA - sA; | |
1046 lesser = eB - sB; | |
1047 } | |
1048 else | |
1049 { | |
1050 lesser = eA - sA; | |
1051 greater = eB - sB; | |
1052 } | |
1053 | |
1054 //-- If the cluster is close enough | |
1055 if ( greater < getBreakLen( ) || | |
1056 (lesser) * GOOD_SCORE [getMatrixType( )] + | |
1057 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 ) | |
1058 { | |
1059 Cp = Cip; | |
1060 targetA = eA; | |
1061 targetB = eB; | |
1062 break; | |
1063 } | |
1064 else if ( (greater << 1) - lesser < dist ) | |
1065 { | |
1066 Cp = Cip; | |
1067 targetA = eA; | |
1068 targetB = eB; | |
1069 dist = (greater << 1) - lesser; | |
1070 } | |
1071 } | |
1072 } | |
1073 } | |
1074 | |
1075 | |
1076 return Cp; | |
1077 } | |
1078 | |
1079 | |
1080 | |
1081 | |
1082 vector<Alignment>::iterator getReverseTargetAlignment | |
1083 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp) | |
1084 | |
1085 // Return the alignment that is most likely to successfully join (in a | |
1086 // reverse direction) with the current alignment. The returned alignment | |
1087 // must be strictly less than the current cluster and be on a diagonal | |
1088 // close enough to the current alignment, so that a connection | |
1089 // could possibly be made by the alignment extender. Assumes clusters | |
1090 // have been sorted via AscendingClusterSort and processed in order, so | |
1091 // therefore all alignments are in order by their start A coordinate. | |
1092 | |
1093 { | |
1094 vector<Alignment>::iterator Ap; // alignment pointer | |
1095 vector<Alignment>::iterator Aip; // alignment iterative pointer | |
1096 long int eA, eB; // possible targets | |
1097 long int greater, lesser; // gap sizes between the two alignments | |
1098 long int sA = CurrAp->sA; // the startA of the current alignment | |
1099 long int sB = CurrAp->sB; // the startB of the current alignment | |
1100 | |
1101 //-- Beginning of sequences is the default target, set distance accordingly | |
1102 long int dist = (sA < sB ? sA : sB); | |
1103 | |
1104 //-- For all alignments less than the current alignment (on sequence A) | |
1105 Ap = Alignments.end( ); | |
1106 for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- ) | |
1107 { | |
1108 //-- If the alignment is on the same direction | |
1109 if ( CurrAp->dirB == Aip->dirB ) | |
1110 { | |
1111 eA = Aip->eA; | |
1112 eB = Aip->eB; | |
1113 | |
1114 //-- If the alignment is strictly less than current cluster | |
1115 if ( eA <= sA && eB <= sB ) | |
1116 { | |
1117 if ( sA - eA > sB - eB ) | |
1118 { | |
1119 greater = sA - eA; | |
1120 lesser = sB - eB; | |
1121 } | |
1122 else | |
1123 { | |
1124 lesser = sA - eA; | |
1125 greater = sB - eB; | |
1126 } | |
1127 | |
1128 //-- If the cluster is close enough | |
1129 if ( greater < getBreakLen( ) || | |
1130 (lesser) * GOOD_SCORE [getMatrixType( )] + | |
1131 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 ) | |
1132 { | |
1133 Ap = Aip; | |
1134 break; | |
1135 } | |
1136 else if ( (greater << 1) - lesser < dist ) | |
1137 { | |
1138 Ap = Aip; | |
1139 dist = (greater << 1) - lesser; | |
1140 } | |
1141 } | |
1142 } | |
1143 } | |
1144 | |
1145 | |
1146 return Ap; | |
1147 } | |
1148 | |
1149 | |
1150 | |
1151 | |
1152 bool isShadowedCluster | |
1153 (vector<Cluster>::iterator CurrCp, | |
1154 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap) | |
1155 | |
1156 // Check if the current cluster is shadowed by a previously produced | |
1157 // alignment region. Return true if it is, else false. | |
1158 | |
1159 { | |
1160 vector<Alignment>::iterator Aip; // alignment pointer | |
1161 | |
1162 long int sA = CurrCp->matches.begin( )->sA; | |
1163 long int eA = CurrCp->matches.rbegin( )->sA + | |
1164 CurrCp->matches.rbegin( )->len - 1; | |
1165 long int sB = CurrCp->matches.begin( )->sB; | |
1166 long int eB = CurrCp->matches.rbegin( )->sB + | |
1167 CurrCp->matches.rbegin( )->len - 1; | |
1168 | |
1169 if ( ! Alignments.empty( ) ) // if there are alignments to use | |
1170 { | |
1171 //-- Look backwards in hope of finding a shadowing alignment | |
1172 for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- ) | |
1173 { | |
1174 //-- If in the same direction and shadowing the current cluster, break | |
1175 if ( Aip->dirB == CurrCp->dirB ) | |
1176 if ( Aip->eA >= eA && Aip->eB >= eB ) | |
1177 if ( Aip->sA <= sA && Aip->sB <= sB ) | |
1178 break; | |
1179 } | |
1180 | |
1181 //-- Return true if the loop was broken, i.e. shadow found | |
1182 if ( Aip >= Alignments.begin( ) ) | |
1183 return true; | |
1184 } | |
1185 | |
1186 //-- Return false if Alignments was empty or loop was not broken | |
1187 return false; | |
1188 } | |
1189 | |
1190 | |
1191 | |
1192 | |
1193 void parseAbort | |
1194 (const char * s) | |
1195 | |
1196 // Abort the program if there was an error in parsing file 's' | |
1197 | |
1198 { | |
1199 fprintf (stderr, | |
1200 "ERROR: Could not parse input from '%s'. \n" | |
1201 "Please check the filename and format, or file a bug report\n", s); | |
1202 exit (EXIT_FAILURE); | |
1203 } | |
1204 | |
1205 | |
1206 | |
1207 | |
1208 void parseDelta | |
1209 (vector<Alignment> & Alignments, | |
1210 const FastaRecord * Af, const FastaRecord *Bf) | |
1211 | |
1212 // Use the delta information to generate the error counts for each | |
1213 // alignment, and fill this information into the data type | |
1214 | |
1215 { | |
1216 char * A, * B, * Brev = NULL; | |
1217 char ch1, ch2; | |
1218 long int Delta; | |
1219 int Sign; | |
1220 long int i, Apos, Bpos; | |
1221 long int Remain, Total; | |
1222 long int Errors, SimErrors; | |
1223 long int NonAlphas; | |
1224 vector<Alignment>::iterator Ap; | |
1225 vector<long int>::iterator Dp; | |
1226 | |
1227 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) | |
1228 { | |
1229 A = Af->seq; | |
1230 B = Bf->seq; | |
1231 | |
1232 if ( Ap->dirB == REVERSE_CHAR ) | |
1233 { | |
1234 if ( Brev == NULL ) | |
1235 { | |
1236 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); | |
1237 strcpy ( Brev + 1, Bf->seq + 1 ); | |
1238 Brev[0] = '\0'; | |
1239 Reverse_Complement (Brev, 1, Bf->len); | |
1240 B = Brev; | |
1241 } | |
1242 B = Brev; | |
1243 } | |
1244 | |
1245 Apos = Ap->sA; | |
1246 Bpos = Ap->sB; | |
1247 | |
1248 Errors = 0; | |
1249 SimErrors = 0; | |
1250 NonAlphas = 0; | |
1251 Remain = Ap->eA - Ap->sA + 1; | |
1252 Total = Remain; | |
1253 | |
1254 //-- For all delta's in this alignment | |
1255 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) | |
1256 { | |
1257 Delta = *Dp; | |
1258 Sign = Delta > 0 ? 1 : -1; | |
1259 Delta = labs ( Delta ); | |
1260 | |
1261 //-- For all the bases before the next indel | |
1262 for ( i = 1; i < Delta; i ++ ) | |
1263 { | |
1264 ch1 = A [Apos ++]; | |
1265 ch2 = B [Bpos ++]; | |
1266 | |
1267 if ( !isalpha (ch1) ) | |
1268 { | |
1269 ch1 = STOP_CHAR; | |
1270 NonAlphas ++; | |
1271 } | |
1272 if ( !isalpha (ch2) ) | |
1273 { | |
1274 ch2 = STOP_CHAR; | |
1275 NonAlphas ++; | |
1276 } | |
1277 | |
1278 ch1 = toupper(ch1); | |
1279 ch2 = toupper(ch2); | |
1280 if ( 1 > MATCH_SCORE | |
1281 [getMatrixType( )] | |
1282 [ch1 - 'A'] | |
1283 [ch2 - 'A'] ) | |
1284 SimErrors ++; | |
1285 if ( ch1 != ch2 ) | |
1286 Errors ++; | |
1287 } | |
1288 | |
1289 //-- Process the current indel | |
1290 Remain -= i - 1; | |
1291 Errors ++; | |
1292 SimErrors ++; | |
1293 | |
1294 if ( Sign == 1 ) | |
1295 { | |
1296 Apos ++; | |
1297 Remain --; | |
1298 } | |
1299 else | |
1300 { | |
1301 Bpos ++; | |
1302 Total ++; | |
1303 } | |
1304 } | |
1305 | |
1306 //-- For all the bases after the final indel | |
1307 for ( i = 0; i < Remain; i ++ ) | |
1308 { | |
1309 //-- Score character match and update error counters | |
1310 ch1 = A [Apos ++]; | |
1311 ch2 = B [Bpos ++]; | |
1312 | |
1313 if ( !isalpha (ch1) ) | |
1314 { | |
1315 ch1 = STOP_CHAR; | |
1316 NonAlphas ++; | |
1317 } | |
1318 if ( !isalpha (ch2) ) | |
1319 { | |
1320 ch2 = STOP_CHAR; | |
1321 NonAlphas ++; | |
1322 } | |
1323 | |
1324 ch1 = toupper(ch1); | |
1325 ch2 = toupper(ch2); | |
1326 if ( 1 > MATCH_SCORE | |
1327 [getMatrixType( )] | |
1328 [ch1 - 'A'] | |
1329 [ch2 - 'A'] ) | |
1330 SimErrors ++; | |
1331 if ( ch1 != ch2 ) | |
1332 Errors ++; | |
1333 } | |
1334 | |
1335 Ap->Errors = Errors; | |
1336 Ap->SimErrors = SimErrors; | |
1337 Ap->NonAlphas = NonAlphas; | |
1338 } | |
1339 | |
1340 if ( Brev != NULL ) | |
1341 free ( Brev ); | |
1342 | |
1343 return; | |
1344 } | |
1345 | |
1346 | |
1347 | |
1348 | |
1349 void processSyntenys | |
1350 (vector<Synteny> & Syntenys, FastaRecord * Af, long int As, | |
1351 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile) | |
1352 | |
1353 // For each syntenic region with clusters, read in the B sequence and | |
1354 // extend the clusters to expand total alignment coverage. Only should | |
1355 // be called once all the clusters for the contained syntenic regions have | |
1356 // been stored in the data structure. Frees the memory used by the | |
1357 // the syntenic regions once the output of extendClusters and | |
1358 // flushSyntenys has been produced. | |
1359 | |
1360 { | |
1361 FastaRecord Bf; // the current B sequence | |
1362 Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1)); | |
1363 | |
1364 long int InitSize = INIT_SIZE; // the initial memory capacity of B | |
1365 vector<Synteny>::iterator CurrSp; // current synteny pointer | |
1366 | |
1367 //-- Initialize the B sequence storage | |
1368 Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); | |
1369 | |
1370 //-- For all the contained syntenys | |
1371 for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ ) | |
1372 { | |
1373 //-- If no clusters, ignore | |
1374 if ( CurrSp->clusters.empty( ) ) | |
1375 continue; | |
1376 | |
1377 //-- If a B sequence not seen yet, read it in | |
1378 //-- IMPORTANT: The B sequences in the synteny object are assumed to be | |
1379 // ordered as output by mgaps, if they are not in order the program | |
1380 // will fail. (All like tags must be adjacent and in the same order | |
1381 // as the query file) | |
1382 if ( CurrSp == Syntenys.begin( ) || | |
1383 strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 ) | |
1384 { | |
1385 //-- Read in the B sequence | |
1386 while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) ) | |
1387 if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 ) | |
1388 break; | |
1389 if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 ) | |
1390 parseAbort ( "Query File" ); | |
1391 Bf.len = strlen (Bf.seq + 1); | |
1392 } | |
1393 | |
1394 //-- Extend clusters and create the alignment information | |
1395 CurrSp->Bf.len = Bf.len; | |
1396 extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile); | |
1397 } | |
1398 | |
1399 //-- Create the cluster information | |
1400 flushSyntenys (Syntenys, ClusterFile); | |
1401 | |
1402 free (Bf.Id); | |
1403 free (Bf.seq); | |
1404 | |
1405 return; | |
1406 } | |
1407 | |
1408 | |
1409 | |
1410 | |
1411 inline long int revC | |
1412 (long int Coord, long int Len) | |
1413 | |
1414 // Reverse complement the given coordinate for the given length. | |
1415 | |
1416 { | |
1417 assert (Len - Coord + 1 > 0); | |
1418 return (Len - Coord + 1); | |
1419 } | |
1420 | |
1421 | |
1422 | |
1423 | |
1424 void printHelp | |
1425 (const char * s) | |
1426 | |
1427 // Display the program's help information to stderr. | |
1428 | |
1429 { | |
1430 fprintf (stderr, | |
1431 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); | |
1432 fprintf (stderr, | |
1433 "-b int set the alignment break (give-up) length to int\n" | |
1434 "-B int set the diagonal banding for extension to int\n" | |
1435 "-d output only match clusters rather than extended alignments\n" | |
1436 "-e do not extend alignments outward from clusters\n" | |
1437 "-h display help information\n" | |
1438 "-s don't remove shadowed alignments, useful for aligning a\n" | |
1439 " sequence to itself to identify repeats\n" | |
1440 "-t force alignment to ends of sequence if within -b distance\n\n"); | |
1441 fprintf (stderr, | |
1442 " Input is the output of the \"mgaps\" program from stdin, and\n" | |
1443 "the two original NUCmer sequence files passed on the command\n" | |
1444 "line. <pfx> is the prefix to be added to the front of the\n" | |
1445 "output file <pfx>.delta\n" | |
1446 " <pfx>.delta is the alignment object that catalogs the distance\n" | |
1447 "between insertions and deletions. For further information\n" | |
1448 "regarding this file, please refer to the documentation under\n" | |
1449 "the .delta output description.\n\n"); | |
1450 return; | |
1451 } | |
1452 | |
1453 | |
1454 | |
1455 | |
1456 void printUsage | |
1457 (const char * s) | |
1458 | |
1459 // Display the program's usage information to stderr. | |
1460 | |
1461 { | |
1462 fprintf (stderr, | |
1463 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s); | |
1464 fprintf (stderr, "Try '%s -h' for more information.\n", s); | |
1465 return; | |
1466 } | |
1467 | |
1468 | |
1469 | |
1470 | |
1471 void validateData | |
1472 (vector<Alignment> Alignments, vector<Cluster> Clusters, | |
1473 const FastaRecord * Af, const FastaRecord * Bf) | |
1474 | |
1475 // Self test function to check that the cluster and alignment information | |
1476 // is valid | |
1477 | |
1478 { | |
1479 char * A, * B, * Brev = NULL; | |
1480 vector<Cluster>::iterator Cp; | |
1481 vector<Match>::iterator Mp; | |
1482 vector<Alignment>::iterator Ap; | |
1483 long int x, y, i; | |
1484 char Xc, Yc; | |
1485 | |
1486 A = Af->seq; | |
1487 | |
1488 for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ ) | |
1489 { | |
1490 assert ( Cp->wasFused ); | |
1491 | |
1492 //-- Pick the right directional sequence for B | |
1493 if ( Cp->dirB == FORWARD_CHAR ) | |
1494 B = Bf->seq; | |
1495 else if ( Brev != NULL ) | |
1496 B = Brev; | |
1497 else | |
1498 { | |
1499 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); | |
1500 strcpy ( Brev + 1, Bf->seq + 1 ); | |
1501 Brev[0] = '\0'; | |
1502 Reverse_Complement (Brev, 1, Bf->len); | |
1503 B = Brev; | |
1504 } | |
1505 | |
1506 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) | |
1507 { | |
1508 //-- assert for each match in cluster, it is indeed a match | |
1509 x = Mp->sA; | |
1510 y = Mp->sB; | |
1511 for ( i = 0; i < Mp->len; i ++ ) | |
1512 assert ( A[x ++] == B[y ++] ); | |
1513 | |
1514 //-- assert for each match in cluster, it is contained in an alignment | |
1515 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) | |
1516 { | |
1517 if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB && | |
1518 Ap->eA >= Mp->sA + Mp->len - 1 && | |
1519 Ap->eB >= Mp->sB + Mp->len - 1 ) | |
1520 break; | |
1521 } | |
1522 assert ( Ap < Alignments.end( ) ); | |
1523 } | |
1524 } | |
1525 | |
1526 //-- assert alignments are optimal (quick check if first and last chars equal) | |
1527 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) | |
1528 { | |
1529 if ( Ap->dirB == REVERSE_CHAR ) | |
1530 { | |
1531 assert (Brev != NULL); | |
1532 B = Brev; | |
1533 } | |
1534 else | |
1535 B = Bf->seq; | |
1536 | |
1537 assert ( Ap->sA <= Ap->eA ); | |
1538 assert ( Ap->sB <= Ap->eB ); | |
1539 | |
1540 assert ( Ap->sA >= 1 && Ap->sA <= Af->len ); | |
1541 assert ( Ap->eA >= 1 && Ap->eA <= Af->len ); | |
1542 assert ( Ap->sB >= 1 && Ap->sB <= Bf->len ); | |
1543 assert ( Ap->eB >= 1 && Ap->eB <= Bf->len ); | |
1544 | |
1545 Xc = toupper(isalpha(A[Ap->sA]) ? A[Ap->sA] : STOP_CHAR); | |
1546 Yc = toupper(isalpha(B[Ap->sB]) ? B[Ap->sB] : STOP_CHAR); | |
1547 assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] ); | |
1548 | |
1549 Xc = toupper(isalpha(A[Ap->eA]) ? A[Ap->eA] : STOP_CHAR); | |
1550 Yc = toupper(isalpha(B[Ap->eB]) ? B[Ap->eB] : STOP_CHAR); | |
1551 assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] ); | |
1552 } | |
1553 | |
1554 if ( Brev != NULL ) | |
1555 free (Brev); | |
1556 | |
1557 return; | |
1558 } |