Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/combineMUMs.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 /************************************************* | |
2 * Module: CombineMUMs.cc | |
3 * Author: Art Delcher | |
4 * Description: | |
5 * Take clusters of MUMs (from mgaps program) and do alignments | |
6 * to combine them into matches. | |
7 * | |
8 */ | |
9 | |
10 | |
11 #include "tigrinc.hh" | |
12 #include <string> | |
13 using namespace std; | |
14 | |
15 | |
16 // Constants | |
17 | |
18 #define BRANCH_PT_MATCH_VALUE 0.272 | |
19 // Value to add for a match in finding branch points | |
20 // 1.20 was the calculated value for 6% vs 35% error discrimination | |
21 // Converting to integers didn't make it faster | |
22 #define BRANCH_PT_ERROR_VALUE -0.728 | |
23 // Value to add for a mismatch in finding branch points | |
24 // -2.19 was the calculated value for 6% vs 35% error discrimination | |
25 // Converting to integers didn't make it faster | |
26 #define DEFAULT_ERROR_FILE_NAME "witherrors.gaps" | |
27 // Name of file produced that matches the input gaps file | |
28 // but with an extra column showing the number of errors in | |
29 // each gap | |
30 #define DEFAULT_PAD 10 | |
31 // Default number of matching bases to show at beginning | |
32 // and end of alignments | |
33 #define EDIT_DIST_PROB_BOUND 1e-4 | |
34 // Probability limit to "band" edit-distance calculation | |
35 // Determines NORMAL_DISTRIB_THOLD | |
36 #define ERRORS_FOR_FREE 1 | |
37 // The number of errors that are ignored in setting probability | |
38 // bound for terminating alignment extensions in edit distance | |
39 // calculations | |
40 #define EXPANSION_FACTOR 1.4 | |
41 // Factor by which to grow memory when realloc'ing | |
42 #define GIVE_UP_LEN 200 | |
43 // Stop alignment when go this many bases past max-score value | |
44 #define INITIAL_BUFFER_SIZE 10000 | |
45 // Initial number of bytes to use for sequence strings | |
46 #define MATCH_SCORE 1.0 | |
47 // Score to add for match in finding length of alignment | |
48 #define MAX_ERROR_RATE 0.06 | |
49 // The largest error allowed in overlaps | |
50 #define MAX_FRAG_LEN 1024 | |
51 // The longest fragment allowed | |
52 #define MAX_ERRORS 500 | |
53 // Most errors in any edit distance computation | |
54 #define MAX_EXTENSION 10000 | |
55 // Maximum extension match out from a match | |
56 #define MAX_HDR_LEN 10000 | |
57 // Longest allowable fasta header line | |
58 #define MAX_MEMORY_STORE 50000 | |
59 // The most fragments allowed in memory store | |
60 #define MIN_BRANCH_END_DIST 20 | |
61 // Branch points must be at least this many bases from the | |
62 // end of the fragment to be reported | |
63 #define MIN_BRANCH_TAIL_SLOPE 0.20 | |
64 // Branch point tails must fall off from the max by at least | |
65 // this rate | |
66 #define MIN_MATCH_LEN 40 | |
67 // Number of bases in match region in order to count it | |
68 #define MISMATCH_SCORE -3.0 | |
69 // Score to add for non-match in finding length of alignment | |
70 #define NORMAL_DISTRIB_THOLD 3.62 | |
71 // Determined by EDIT_DIST_PROB_BOUND | |
72 #define REALLY_VERBOSE 0 | |
73 // If 1 prints tons of stuff | |
74 #define VERBOSE 0 | |
75 // If 1 prints lots of stuff | |
76 | |
77 | |
78 | |
79 // Type definitions | |
80 | |
81 typedef struct s_Cover_t | |
82 { | |
83 long int lo, hi; | |
84 struct s_Cover_t * next; | |
85 } Cover_t; | |
86 | |
87 | |
88 | |
89 // Globals | |
90 | |
91 float Branch_Pt_Match_Value = BRANCH_PT_MATCH_VALUE; | |
92 // Score used for matches to locate branch points, i.e., | |
93 // where alignment score peaks | |
94 float Branch_Pt_Error_Value = BRANCH_PT_ERROR_VALUE; | |
95 // Score used for mismatches to locate branch points, i.e., | |
96 int Consec_Non_ACGT = 0; | |
97 // Stop alignments when encounter at least this many non-acgt characters. | |
98 int * Edit_Array [MAX_ERRORS]; | |
99 // Use for alignment calculation. Points into Edit_Space . | |
100 int Edit_Match_Limit [MAX_ERRORS] = {0}; | |
101 // This array [e] is the minimum value of Edit_Array [e] [d] | |
102 // to be worth pursuing in edit-distance computations between guides | |
103 int Edit_Space [(MAX_ERRORS + 4) * MAX_ERRORS]; | |
104 // Memory used by alignment calculation | |
105 int Error_Bound [MAX_FRAG_LEN + 1]; | |
106 // This array [i] is the maximum number of errors allowed | |
107 // in a match between sequences of length i , which is | |
108 // i * MAXERROR_RATE . | |
109 char * Error_File_Name = DEFAULT_ERROR_FILE_NAME; | |
110 // Name of file to write gaps listing with # errors in each gap | |
111 int Fill_Ct = 0; | |
112 // Number of non-acgt bases in ref sequence | |
113 char * Gaps_File_Path = NULL; | |
114 // Name of file produced by mgaps program | |
115 char * Match_File_Path = NULL; | |
116 // Name of multifasta file of sequences to compare against the reference | |
117 // sequence | |
118 int UserScoring = FALSE; | |
119 // If TRUE, then user specified a percent ID cutoff and scoring | |
120 // is adjusted to enforce this in extensions | |
121 int Nucleotides_Only = FALSE; | |
122 // If TRUE , then only acgt's can match | |
123 bool Only_Difference_Positions = false; | |
124 // If true , then output just positions of difference instead of | |
125 // alignments between exact matches | |
126 int Output_Cover_Files = TRUE; | |
127 // If TRUE , output files showing coverage of each genome. | |
128 double Percent_ID; | |
129 // Desired identity rate of matches to be found | |
130 // Expressed as a fraction, not really a percent | |
131 char * Query = NULL; | |
132 // The query sequence | |
133 long int Query_Len; | |
134 // The length of the query sequence | |
135 char * Query_Suffix = "Query"; | |
136 // Suffix for query tag | |
137 char * Ref = NULL; | |
138 // The reference sequence | |
139 char * Ref_File_Path = NULL; | |
140 // Name of (single) fasta file of reference sequence | |
141 long int Ref_Len; | |
142 // The length of the reference sequence | |
143 long int Ref_Size; | |
144 // The size of the reference sequence buffer | |
145 char * Ref_Suffix = "Ref"; | |
146 // Suffix for reference tag | |
147 int Show_Differences = FALSE; | |
148 // If TRUE then show differences in all alignments | |
149 int Tag_From_Fasta_Line = FALSE; | |
150 // If TRUE then use fasta tag from ref & query sequences as | |
151 // 1st & 2nd column, resp., on match line to identify matches | |
152 int Verbose = 0; | |
153 // Controls printing of extra debuggin information | |
154 | |
155 | |
156 bool Global_Debug_Flag = false; | |
157 | |
158 | |
159 | |
160 // Functions | |
161 | |
162 void Add_Coverage | |
163 (Cover_t * * list, long int lo, long int hi); | |
164 int Binomial_Bound | |
165 (int, double, int, double); | |
166 void Display_Alignment | |
167 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct); | |
168 void Display_Alignment_With_Pad | |
169 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, | |
170 char * left_pad [3], char * right_pad [3], int pad_len); | |
171 void Display_Difference_Positions | |
172 (char * a, int a_start, int a_len, char * b, int b_start, int b_len, | |
173 int b_incr, int delta [], int delta_ct, const char * a_tag, | |
174 const char * b_tag); | |
175 int Extend_Backward | |
176 (long int * ref_lo, long int * query_lo); | |
177 int Extend_Forward | |
178 (long int * ref_hi, long int * query_hi); | |
179 void Initialize_Globals | |
180 (void); | |
181 FILE * Local_File_Open | |
182 (const char * filename, const char * mode); | |
183 int Max_int | |
184 (int a, int b); | |
185 int Min_int | |
186 (int a, int b); | |
187 void Parse_Command_Line | |
188 (int argc, char * argv []); | |
189 int Prefix_Edit_Dist | |
190 (char A [], int m, char T [], int n, int Error_Limit, | |
191 int * A_End, int * T_End, int * Match_To_End, | |
192 int Delta [MAX_ERRORS], int * Delta_Len, int extending); | |
193 int Read_String | |
194 (FILE * fp, char * * T, long int * Size, char header []); | |
195 void Rev_Complement | |
196 (char * s); | |
197 void Rev_Display_Alignment | |
198 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct); | |
199 int Rev_Prefix_Edit_Dist | |
200 (char A [], int m, char T [], int n, int Error_Limit, | |
201 int * A_End, int * T_End, int * Match_To_End, | |
202 int Delta [MAX_ERRORS], int * Delta_Len, int extending); | |
203 void Rev_Show_Diffs | |
204 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, | |
205 long int a_ref, long int b_ref); | |
206 void Set_Deltas | |
207 (int delta [], int * delta_len, int row, int d, int e); | |
208 static void Set_Left_Pad | |
209 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len); | |
210 static void Set_Right_Pad | |
211 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len); | |
212 void Show_Coverage | |
213 (Cover_t * list, char * filename, char * tag, char * seq); | |
214 void Show_Diffs | |
215 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, | |
216 long int a_ref, long int b_ref); | |
217 int Sign | |
218 (int a); | |
219 void Usage | |
220 (char * command); | |
221 | |
222 | |
223 | |
224 int main | |
225 (int argc, char * argv []) | |
226 | |
227 { | |
228 FILE * fp, * match_fp, * error_fp; | |
229 char ref_header [MAX_HDR_LEN]; | |
230 char header [MAX_HDR_LEN], ref_tag [MAX_HDR_LEN], query_tag [MAX_HDR_LEN]; | |
231 char line [MAX_LINE], filename [MAX_LINE]; | |
232 char * left_pad [3], * right_pad [3]; | |
233 long int ref_pos, ref_lo, ref_hi; | |
234 long int query_pos, query_lo, query_hi, query_size; | |
235 long int match_len, prior_match_len = 0, max_len; | |
236 double error = 0.0; | |
237 long int total_errors = 0; | |
238 int match_ct = 0; | |
239 double match_total = 0.0; | |
240 int is_forward = FALSE; | |
241 int first_match = TRUE; | |
242 Cover_t * ref_cover_list = NULL; | |
243 Cover_t * query_cover_list = NULL; | |
244 char * p; | |
245 int i; | |
246 | |
247 Parse_Command_Line (argc, argv); | |
248 | |
249 if(UserScoring){ | |
250 // percentID = mismatch penalty / (match bonus + mismatch penalty) | |
251 // or, more compactly, p = mm / (m+mm) | |
252 // Since we require that m+mm = 1, m=1-mm | |
253 // and so we get the trivial mm = p | |
254 // | |
255 // This means that Branch_Pt_Error_Value = -p | |
256 // and then Branch_Pt_Match_Value = 1-p | |
257 // | |
258 // Make it so: | |
259 Branch_Pt_Error_Value = - Percent_ID; | |
260 Branch_Pt_Match_Value = 1.0 - Percent_ID; | |
261 } | |
262 | |
263 Initialize_Globals (); | |
264 | |
265 p = strrchr (Ref_File_Path, '/'); | |
266 if (p == NULL) | |
267 strcpy (ref_tag, Ref_File_Path); | |
268 else | |
269 strcpy (ref_tag, p + 1); | |
270 p = strrchr (ref_tag, '.'); | |
271 if (p != NULL) | |
272 (* p) = '\0'; | |
273 strcat (ref_tag, Ref_Suffix); | |
274 | |
275 p = strrchr (Match_File_Path, '/'); | |
276 if (p == NULL) | |
277 strcpy (query_tag, Match_File_Path); | |
278 else | |
279 strcpy (query_tag, p + 1); | |
280 p = strrchr (query_tag, '.'); | |
281 if (p != NULL) | |
282 (* p) = '\0'; | |
283 strcat (query_tag, Query_Suffix); | |
284 | |
285 fp = Local_File_Open (Ref_File_Path, "r"); | |
286 Ref_Size = 100000; | |
287 Ref = (char *) Safe_malloc (Ref_Size); | |
288 | |
289 assert (Read_String (fp, & Ref, & Ref_Size, ref_header)); | |
290 if (Tag_From_Fasta_Line) | |
291 strcpy (ref_tag, strtok (ref_header, " \t\n>")); | |
292 fclose (fp); | |
293 | |
294 Ref_Len = strlen (Ref + 1); | |
295 Ref = (char *) Safe_realloc (Ref, 1 + Ref_Len); | |
296 for (i = 1; i <= Ref_Len; i ++) | |
297 switch (Ref [i]) | |
298 { | |
299 case 'a' : | |
300 case 'c' : | |
301 case 'g' : | |
302 case 't' : | |
303 break; | |
304 default : | |
305 if (Nucleotides_Only) | |
306 Ref [i] = '2'; | |
307 Fill_Ct ++; | |
308 } | |
309 | |
310 for (i = 0; i < 3; i ++) | |
311 { | |
312 left_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1); | |
313 right_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1); | |
314 } | |
315 | |
316 fp = Local_File_Open (Gaps_File_Path, "r"); | |
317 match_fp = Local_File_Open (Match_File_Path, "r"); | |
318 query_size = 100000; | |
319 Query = (char *) Safe_malloc (query_size); | |
320 | |
321 error_fp = Local_File_Open (Error_File_Name, "w"); | |
322 | |
323 while (fgets (line, MAX_LINE, fp) != NULL) | |
324 { | |
325 int line_len = strlen (line); | |
326 | |
327 if (line [0] == '>') | |
328 { | |
329 if (! first_match) | |
330 { | |
331 total_errors += Extend_Forward (& ref_hi, & query_hi); | |
332 max_len = 1 + ref_hi - ref_lo; | |
333 if (1 + abs (query_hi - query_lo) > max_len) | |
334 max_len = 1 + abs (query_hi - query_lo); | |
335 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); | |
336 if (! Only_Difference_Positions) | |
337 printf | |
338 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", | |
339 ref_lo, ref_hi, | |
340 is_forward ? query_lo : 1 + Query_Len - query_lo, | |
341 is_forward ? query_hi : 1 + Query_Len - query_hi, | |
342 total_errors, max_len, 100.0 * error); | |
343 | |
344 match_ct ++; | |
345 match_total += 1 + ref_hi - ref_lo; | |
346 total_errors = 0; | |
347 Add_Coverage (& ref_cover_list, ref_lo, ref_hi); | |
348 if (is_forward) | |
349 Add_Coverage (& query_cover_list, query_lo, query_hi); | |
350 else | |
351 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, | |
352 1 + Query_Len - query_lo); | |
353 } | |
354 if (Tag_From_Fasta_Line) | |
355 { | |
356 char buff [MAX_LINE]; | |
357 char * p; | |
358 | |
359 strcpy (buff, line + 1); | |
360 p = strtok (buff, " >\t\n"); | |
361 if (strlen (p) > 0) | |
362 strcpy (query_tag, p); | |
363 else | |
364 { | |
365 fprintf (stderr, "No tag on line %s", line); | |
366 strcpy (query_tag, "???"); | |
367 } | |
368 } | |
369 is_forward = ! is_forward; | |
370 if (is_forward) | |
371 { | |
372 assert (Read_String (match_fp, & Query, & query_size, header)); | |
373 Query_Len = strlen (Query + 1); | |
374 if (Nucleotides_Only) | |
375 { | |
376 for (i = 1; i <= Query_Len; i ++) | |
377 switch (Query [i]) | |
378 { | |
379 case 'a' : | |
380 case 'c' : | |
381 case 'g' : | |
382 case 't' : | |
383 break; | |
384 default : | |
385 Query [i] = '1'; | |
386 } | |
387 } | |
388 } | |
389 else | |
390 Rev_Complement (Query + 1); | |
391 first_match = TRUE; | |
392 printf ("%s", line); | |
393 fprintf (error_fp, "%s", line); | |
394 } | |
395 else if (line [0] == '#') | |
396 { | |
397 if (! first_match) | |
398 { | |
399 total_errors += Extend_Forward (& ref_hi, & query_hi); | |
400 max_len = 1 + ref_hi - ref_lo; | |
401 if (1 + abs (query_hi - query_lo) > max_len) | |
402 max_len = 1 + abs (query_hi - query_lo); | |
403 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); | |
404 if (! Only_Difference_Positions) | |
405 printf | |
406 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", | |
407 ref_lo, ref_hi, | |
408 is_forward ? query_lo : 1 + Query_Len - query_lo, | |
409 is_forward ? query_hi : 1 + Query_Len - query_hi, | |
410 total_errors, max_len, 100.0 * error); | |
411 | |
412 match_ct ++; | |
413 match_total += 1 + ref_hi - ref_lo; | |
414 total_errors = 0; | |
415 Add_Coverage (& ref_cover_list, ref_lo, ref_hi); | |
416 if (is_forward) | |
417 Add_Coverage (& query_cover_list, query_lo, query_hi); | |
418 else | |
419 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, | |
420 1 + Query_Len - query_lo); | |
421 } | |
422 first_match = TRUE; | |
423 printf ("%s", line); | |
424 fprintf (error_fp, "%s", line); | |
425 } | |
426 else | |
427 { | |
428 assert (sscanf (line, "%ld %ld %ld", | |
429 & ref_pos, & query_pos, & match_len) == 3); | |
430 if (first_match) | |
431 { | |
432 ref_lo = ref_pos; | |
433 query_lo = query_pos; | |
434 total_errors += Extend_Backward (& ref_lo, & query_lo); | |
435 if (! Only_Difference_Positions) | |
436 printf ("%s", line); | |
437 if (line_len > 0) | |
438 line [line_len - 1] = '\0'; | |
439 fprintf (error_fp, "%s %7s\n", line, "-"); | |
440 } | |
441 else | |
442 { | |
443 int errors; | |
444 int a_len, b_len; | |
445 int a_end, b_end, match_to_end; | |
446 int delta [MAX_ERRORS], delta_len; | |
447 | |
448 a_len = ref_pos - ref_hi - 1; | |
449 b_len = query_pos - query_hi - 1; | |
450 | |
451 errors = Prefix_Edit_Dist | |
452 (Ref + ref_hi + 1, a_len, | |
453 Query + query_hi + 1, b_len, | |
454 MAX_ERRORS - 1, & a_end, & b_end, | |
455 & match_to_end, delta, & delta_len, FALSE); | |
456 if (Show_Differences) | |
457 Show_Diffs (Ref + ref_hi + 1, a_end, | |
458 Query + query_hi + 1, b_end, | |
459 delta, delta_len, ref_hi + 1, query_hi + 1); | |
460 | |
461 if (! Only_Difference_Positions) | |
462 printf ("%s", line); | |
463 | |
464 Set_Left_Pad (left_pad, ref_hi, query_hi, prior_match_len, | |
465 DEFAULT_PAD); | |
466 Set_Right_Pad (right_pad, ref_pos, query_pos, match_len, | |
467 DEFAULT_PAD); | |
468 | |
469 if (a_end == 0 && b_end == 0) | |
470 { | |
471 if (! Only_Difference_Positions) | |
472 printf (" No extension\n"); | |
473 | |
474 if (line_len > 0) | |
475 line [line_len - 1] = '\0'; | |
476 fprintf (error_fp, "%s %7s\n", line, "-"); | |
477 } | |
478 else if (Only_Difference_Positions) | |
479 { | |
480 int q_start, b_incr; | |
481 | |
482 if (is_forward) | |
483 { | |
484 q_start = query_hi + 1; | |
485 b_incr = 1; | |
486 } | |
487 else | |
488 { | |
489 q_start = Query_Len - query_hi; | |
490 b_incr = -1; | |
491 } | |
492 Display_Difference_Positions | |
493 (Ref + ref_hi + 1, ref_hi + 1, a_end, | |
494 Query + query_hi + 1, q_start, b_end, | |
495 b_incr, delta, delta_len, ref_tag, | |
496 query_tag); | |
497 } | |
498 else | |
499 { | |
500 printf (" Errors = %d\n", errors); | |
501 if (! Only_Difference_Positions) | |
502 Display_Alignment_With_Pad | |
503 (Ref + ref_hi + 1, a_end, | |
504 Query + query_hi + 1, b_end, | |
505 delta, delta_len, left_pad, right_pad, | |
506 DEFAULT_PAD); | |
507 if (line_len > 0) | |
508 line [line_len - 1] = '\0'; | |
509 fprintf (error_fp, "%s %7d\n", line, errors); | |
510 } | |
511 | |
512 total_errors += errors; | |
513 | |
514 if (! match_to_end) | |
515 { | |
516 ref_hi += a_end; | |
517 query_hi += b_end; | |
518 max_len = 1 + ref_hi - ref_lo; | |
519 if (1 + abs (query_hi - query_lo) > max_len) | |
520 max_len = 1 + abs (query_hi - query_lo); | |
521 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); | |
522 if (! Only_Difference_Positions) | |
523 printf ( | |
524 "Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", | |
525 ref_lo, ref_hi, | |
526 is_forward ? query_lo : 1 + Query_Len - query_lo, | |
527 is_forward ? query_hi : 1 + Query_Len - query_hi, | |
528 total_errors, max_len, 100.0 * error); | |
529 | |
530 match_ct ++; | |
531 match_total += 1 + ref_hi - ref_lo; | |
532 total_errors = 0; | |
533 Add_Coverage (& ref_cover_list, ref_lo, ref_hi); | |
534 if (is_forward) | |
535 Add_Coverage (& query_cover_list, query_lo, query_hi); | |
536 else | |
537 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, | |
538 1 + Query_Len - query_lo); | |
539 ref_lo = ref_pos; | |
540 query_lo = query_pos; | |
541 total_errors += Extend_Backward (& ref_lo, & query_lo); | |
542 } | |
543 } | |
544 | |
545 ref_hi = ref_pos + match_len - 1; | |
546 query_hi = query_pos + match_len - 1; | |
547 prior_match_len = match_len; | |
548 first_match = FALSE; | |
549 } | |
550 } | |
551 | |
552 if (! first_match) | |
553 { | |
554 total_errors += Extend_Forward (& ref_hi, & query_hi); | |
555 max_len = 1 + ref_hi - ref_lo; | |
556 if (1 + abs (query_hi - query_lo) > max_len) | |
557 max_len = 1 + abs (query_hi - query_lo); | |
558 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); | |
559 if (! Only_Difference_Positions) | |
560 printf | |
561 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", | |
562 ref_lo, ref_hi, | |
563 is_forward ? query_lo : 1 + Query_Len - query_lo, | |
564 is_forward ? query_hi : 1 + Query_Len - query_hi, | |
565 total_errors, max_len, 100.0 * error); | |
566 | |
567 match_ct ++; | |
568 match_total += 1 + ref_hi - ref_lo; | |
569 Add_Coverage (& ref_cover_list, ref_lo, ref_hi); | |
570 if (is_forward) | |
571 Add_Coverage (& query_cover_list, query_lo, query_hi); | |
572 else | |
573 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, | |
574 1 + Query_Len - query_lo); | |
575 } | |
576 | |
577 fprintf (stderr, " Ref len = %ld\n", Ref_Len); | |
578 fprintf (stderr, " acgt's = %ld\n", Ref_Len - Fill_Ct); | |
579 fprintf (stderr, " Non acgt's = %d\n", Fill_Ct); | |
580 fprintf (stderr, " Number of matches = %d\n", match_ct); | |
581 fprintf (stderr, "Sum of match bases = %.0f\n", match_total); | |
582 fprintf (stderr, " Avg match bases = %.0f\n", | |
583 match_ct == 0 ? 0.0 : match_total / match_ct); | |
584 | |
585 fclose (error_fp); | |
586 | |
587 if (Output_Cover_Files) | |
588 { | |
589 strcpy (filename, ref_tag); | |
590 strcat (filename, ".cover"); | |
591 Show_Coverage (ref_cover_list, filename, ref_tag, Ref); | |
592 | |
593 strcpy (filename, query_tag); | |
594 strcat (filename, ".cover"); | |
595 Rev_Complement (Query + 1); | |
596 Show_Coverage (query_cover_list, filename, query_tag, Query); | |
597 } | |
598 | |
599 return 0; | |
600 } | |
601 | |
602 | |
603 | |
604 void Add_Coverage | |
605 (Cover_t * * list, long int lo, long int hi) | |
606 | |
607 // Add lo .. hi to list of regions covered in (* list) . | |
608 // Combine nodes when appropriate. | |
609 | |
610 { | |
611 Cover_t * new_node, * p, * prev = NULL; | |
612 | |
613 if ((* list) == NULL || hi + 1 < (* list) -> lo) | |
614 { | |
615 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t)); | |
616 new_node -> lo = lo; | |
617 new_node -> hi = hi; | |
618 new_node -> next = (* list); | |
619 (* list) = new_node; | |
620 return; | |
621 } | |
622 | |
623 for (p = (* list); p != NULL && lo - 1 > p -> hi; p = p -> next) | |
624 prev = p; | |
625 | |
626 if (p == NULL || hi + 1 < p -> lo) | |
627 { // insert between or on end | |
628 assert (prev != NULL); | |
629 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t)); | |
630 new_node -> lo = lo; | |
631 new_node -> hi = hi; | |
632 new_node -> next = p; | |
633 prev -> next = new_node; | |
634 return; | |
635 } | |
636 | |
637 if (lo < p -> lo) | |
638 p -> lo = lo; | |
639 while (p -> next != NULL && hi + 1 >= p -> next -> lo) | |
640 { | |
641 Cover_t * save; | |
642 | |
643 p -> hi = p -> next -> hi; | |
644 save = p -> next; | |
645 p -> next = save -> next; | |
646 free (save); | |
647 } | |
648 if (hi > p -> hi) | |
649 p -> hi = hi; | |
650 | |
651 return; | |
652 } | |
653 | |
654 | |
655 | |
656 int Binomial_Bound | |
657 (int e, double p, int Start, double Limit) | |
658 | |
659 // Return the smallest n >= Start s.t. | |
660 // prob [>= e errors in n binomial trials (p = error prob)] | |
661 // > Limit | |
662 | |
663 { | |
664 double Normal_Z, Mu_Power, Factorial, Poisson_Coeff; | |
665 double q, Sum, P_Power, Q_Power, X; | |
666 int k, n, Bin_Coeff, Ct; | |
667 | |
668 q = 1.0 - p; | |
669 if (Start < e) | |
670 Start = e; | |
671 | |
672 for (n = Start; n < MAX_FRAG_LEN; n ++) | |
673 { | |
674 if (n <= 35) | |
675 { | |
676 Sum = 0.0; | |
677 Bin_Coeff = 1; | |
678 Ct = 0; | |
679 P_Power = 1.0; | |
680 Q_Power = pow (q, (double) n); | |
681 | |
682 for (k = 0; k < e && 1.0 - Sum > Limit; k ++) | |
683 { | |
684 X = Bin_Coeff * P_Power * Q_Power; | |
685 Sum += X; | |
686 Bin_Coeff *= n - Ct; | |
687 Bin_Coeff /= ++ Ct; | |
688 P_Power *= p; | |
689 Q_Power /= q; | |
690 } | |
691 if (1.0 - Sum > Limit) | |
692 return n; | |
693 } | |
694 else | |
695 { | |
696 Normal_Z = (e - 0.5 - n * p) / sqrt ((double)(n * p * q)); | |
697 if (Normal_Z <= NORMAL_DISTRIB_THOLD) | |
698 return n; | |
699 Sum = 0.0; | |
700 Mu_Power = 1.0; | |
701 Factorial = 1.0; | |
702 Poisson_Coeff = exp (- (double) n * p); | |
703 for (k = 0; k < e; k ++) | |
704 { | |
705 Sum += Mu_Power * Poisson_Coeff / Factorial; | |
706 Mu_Power *= n * p; | |
707 Factorial *= k + 1; | |
708 } | |
709 if (1.0 - Sum > Limit) | |
710 return n; | |
711 } | |
712 } | |
713 | |
714 return MAX_FRAG_LEN; | |
715 } | |
716 | |
717 | |
718 | |
719 #define DISPLAY_WIDTH 60 | |
720 | |
721 void Display_Alignment | |
722 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct) | |
723 | |
724 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] | |
725 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] . | |
726 | |
727 { | |
728 int i, j, k, m, top_len, bottom_len; | |
729 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; | |
730 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; | |
731 | |
732 i = j = top_len = bottom_len = 0; | |
733 for (k = 0; k < delta_ct; k ++) | |
734 { | |
735 for (m = 1; m < abs (delta [k]); m ++) | |
736 { | |
737 top [top_len ++] = a [i ++]; | |
738 j ++; | |
739 } | |
740 if (delta [k] < 0) | |
741 { | |
742 top [top_len ++] = '-'; | |
743 j ++; | |
744 } | |
745 else | |
746 { | |
747 top [top_len ++] = a [i ++]; | |
748 } | |
749 } | |
750 while (i < a_len && j < b_len) | |
751 { | |
752 top [top_len ++] = a [i ++]; | |
753 j ++; | |
754 } | |
755 top [top_len] = '\0'; | |
756 | |
757 | |
758 i = j = 0; | |
759 for (k = 0; k < delta_ct; k ++) | |
760 { | |
761 for (m = 1; m < abs (delta [k]); m ++) | |
762 { | |
763 bottom [bottom_len ++] = b [j ++]; | |
764 i ++; | |
765 } | |
766 if (delta [k] > 0) | |
767 { | |
768 bottom [bottom_len ++] = '-'; | |
769 i ++; | |
770 } | |
771 else | |
772 { | |
773 bottom [bottom_len ++] = b [j ++]; | |
774 } | |
775 } | |
776 while (j < b_len && i < a_len) | |
777 { | |
778 bottom [bottom_len ++] = b [j ++]; | |
779 i ++; | |
780 } | |
781 bottom [bottom_len] = '\0'; | |
782 | |
783 | |
784 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) | |
785 { | |
786 printf ("A: "); | |
787 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) | |
788 putchar (top [i + j]); | |
789 putchar ('\n'); | |
790 printf ("B: "); | |
791 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) | |
792 putchar (bottom [i + j]); | |
793 putchar ('\n'); | |
794 printf (" "); | |
795 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; | |
796 j ++) | |
797 if (top [i + j] != ' ' && bottom [i + j] != ' ' | |
798 && top [i + j] != bottom [i + j]) | |
799 putchar ('^'); | |
800 else | |
801 putchar (' '); | |
802 putchar ('\n'); | |
803 } | |
804 | |
805 return; | |
806 } | |
807 | |
808 | |
809 | |
810 void Display_Alignment_With_Pad | |
811 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, | |
812 char * left_pad [3], char * right_pad [3], int pad_len) | |
813 | |
814 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] | |
815 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] . | |
816 // Attach pad strings in left_pad to the beginning of the alignment | |
817 // and the strings in right_pad to the end of the alignment. | |
818 // pad_len is the width of the pad strings | |
819 | |
820 { | |
821 int i, j, k, m, top_len, bottom_len; | |
822 #if 0 | |
823 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; | |
824 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; | |
825 #else | |
826 string top, bottom; | |
827 #endif | |
828 | |
829 for (k = 0; k < pad_len; k ++) | |
830 { | |
831 top . push_back (left_pad [0] [k]); | |
832 bottom . push_back (left_pad [1] [k]); | |
833 } | |
834 | |
835 i = j = 0; | |
836 for (k = 0; k < delta_ct; k ++) | |
837 { | |
838 for (m = 1; m < abs (delta [k]); m ++) | |
839 { | |
840 top . push_back (a [i ++]); | |
841 j ++; | |
842 } | |
843 if (delta [k] < 0) | |
844 { | |
845 top . push_back ('-'); | |
846 j ++; | |
847 } | |
848 else | |
849 { | |
850 top . push_back (a [i ++]); | |
851 } | |
852 } | |
853 while (i < a_len && j < b_len) | |
854 { | |
855 top . push_back (a [i ++]); | |
856 j ++; | |
857 } | |
858 | |
859 i = j = 0; | |
860 for (k = 0; k < delta_ct; k ++) | |
861 { | |
862 for (m = 1; m < abs (delta [k]); m ++) | |
863 { | |
864 bottom . push_back (b [j ++]); | |
865 i ++; | |
866 } | |
867 if (delta [k] > 0) | |
868 { | |
869 bottom . push_back ('-'); | |
870 i ++; | |
871 } | |
872 else | |
873 { | |
874 bottom . push_back (b [j ++]); | |
875 } | |
876 } | |
877 while (j < b_len && i < a_len) | |
878 { | |
879 bottom . push_back (b [j ++]); | |
880 i ++; | |
881 } | |
882 | |
883 for (k = 0; k < pad_len; k ++) | |
884 { | |
885 top . push_back (right_pad [0] [k]); | |
886 bottom . push_back (right_pad [1] [k]); | |
887 } | |
888 | |
889 top_len = top . length (); | |
890 bottom_len = bottom . length (); | |
891 | |
892 if (top_len != bottom_len) | |
893 { | |
894 fprintf (stderr, "ERROR: Alignment length mismatch top = %d bottom = %d\n", | |
895 top_len, bottom_len); | |
896 exit (EXIT_FAILURE); | |
897 } | |
898 | |
899 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) | |
900 { | |
901 printf ("A: "); | |
902 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) | |
903 putchar (top [i + j]); | |
904 putchar ('\n'); | |
905 printf ("B: "); | |
906 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) | |
907 putchar (bottom [i + j]); | |
908 putchar ('\n'); | |
909 printf (" "); | |
910 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; | |
911 j ++) | |
912 if (i + j < pad_len) | |
913 putchar (left_pad [2] [i + j]); | |
914 else if (top_len - i - j <= pad_len) | |
915 putchar (right_pad [2] [i + j - (top_len - pad_len)]); | |
916 else if (top [i + j] != ' ' && bottom [i + j] != ' ' | |
917 && top [i + j] != bottom [i + j]) | |
918 putchar ('^'); | |
919 else | |
920 putchar (' '); | |
921 putchar ('\n'); | |
922 } | |
923 | |
924 return; | |
925 } | |
926 | |
927 | |
928 | |
929 void Display_Difference_Positions | |
930 (char * a, int a_start, int a_len, char * b, int b_start, int b_len, | |
931 int b_incr, int delta [], int delta_ct, const char * a_tag, | |
932 const char * b_tag) | |
933 | |
934 // Show (to stdout ) the positions indicated in delta [0 .. (delta_ct - 1)] | |
935 // between strings a [0 .. (a_len - 1)] and | |
936 // b [0 .. (b_len - 1))] . The positions of the starts of the strings | |
937 // are at a_start and b_start , respectively, and b_incr indicates | |
938 // if the b postions increase or decrease. b_incr must be 1 or -1. | |
939 // Use a_tag and b_tag to label the positions. | |
940 | |
941 { | |
942 int i, j, k, m; | |
943 | |
944 assert (b_incr == 1 || b_incr == -1); | |
945 | |
946 i = j = 0; | |
947 for (k = 0; k < delta_ct; k ++) | |
948 { | |
949 for (m = 1; m < abs (delta [k]); m ++) | |
950 { | |
951 if (a [i] != b [j]) | |
952 printf ("%-15s %-15s %8d %8d %c %c\n", | |
953 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]); | |
954 i ++; | |
955 j ++; | |
956 } | |
957 if (delta [k] < 0) | |
958 { | |
959 printf ("%-15s %-15s %8d %8d %c %c\n", | |
960 a_tag, b_tag, a_start + i, b_start + j * b_incr, '-', b [j]); | |
961 j ++; | |
962 } | |
963 else | |
964 { | |
965 if (b_incr > 0) | |
966 printf ("%-15s %-15s %8d %8d %c %c\n", | |
967 a_tag, b_tag, a_start + i, b_start + j, a [i], '-'); | |
968 else | |
969 printf ("%-15s %-15s %8d %8d %c %c\n", | |
970 a_tag, b_tag, a_start + i, b_start - j + 1, a [i], '-'); | |
971 i ++; | |
972 } | |
973 } | |
974 while (i < a_len && j < b_len) | |
975 { | |
976 if (a [i] != b [j]) | |
977 printf ("%-15s %-15s %8d %8d %c %c\n", | |
978 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]); | |
979 i ++; | |
980 j ++; | |
981 } | |
982 | |
983 return; | |
984 } | |
985 | |
986 | |
987 | |
988 int Extend_Backward | |
989 (long int * ref_lo, long int * query_lo) | |
990 | |
991 // Do edit distance off end of match beginning at (* ref_lo) and | |
992 // (* query_lo) in the reference and query sequences, resp., | |
993 // in the reverse direction. | |
994 // Return the number of errors in the extension and set (* ref_hi) | |
995 // and (* query_hi) to the end of the extension. | |
996 | |
997 { | |
998 int ct = 0, errors, sum = 0; | |
999 int a_len, b_len; | |
1000 int a_end, b_end, match_to_end; | |
1001 int delta [MAX_ERRORS], delta_len; | |
1002 | |
1003 do | |
1004 { | |
1005 a_len = (* ref_lo) - 1; | |
1006 if (a_len > MAX_EXTENSION) | |
1007 a_len = MAX_EXTENSION; | |
1008 b_len = (* query_lo) - 1; | |
1009 if (b_len > MAX_EXTENSION) | |
1010 b_len = MAX_EXTENSION; | |
1011 | |
1012 errors = Rev_Prefix_Edit_Dist | |
1013 (Ref + (* ref_lo) - 1, a_len, | |
1014 Query + (* query_lo) - 1, b_len, | |
1015 MAX_ERRORS - 1, & a_end, & b_end, | |
1016 & match_to_end, delta, & delta_len, TRUE); | |
1017 if (Show_Differences) | |
1018 Rev_Show_Diffs | |
1019 (Ref + (* ref_lo - 1), a_end, | |
1020 Query + (* query_lo - 1), b_end, | |
1021 delta, delta_len, (* ref_lo) - 1, (* query_lo) - 1); | |
1022 if (Verbose > 0) | |
1023 { | |
1024 printf ("revextend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n", | |
1025 ++ ct, (* ref_lo) - 1, (* query_lo) - 1, a_len, b_len, | |
1026 errors, match_to_end ? 'T' : 'F', a_end, b_end); | |
1027 Rev_Display_Alignment | |
1028 (Ref + (* ref_lo) - 1, a_end, Query + (* query_lo) - 1, b_end, | |
1029 delta, delta_len); | |
1030 } | |
1031 | |
1032 (* ref_lo) -= a_end; | |
1033 (* query_lo) -= b_end; | |
1034 sum += errors; | |
1035 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION); | |
1036 | |
1037 return sum; | |
1038 } | |
1039 | |
1040 | |
1041 | |
1042 int Extend_Forward | |
1043 (long int * ref_hi, long int * query_hi) | |
1044 | |
1045 // Do edit distance off end of match ending at (* ref_hi) and | |
1046 // (* query_hi) in the reference and query sequences, resp. | |
1047 // Return the number of errors in the extension and set (* ref_hi) | |
1048 // and (* query_hi) to the end of the extension. | |
1049 | |
1050 { | |
1051 int ct = 0, errors, sum = 0; | |
1052 int a_end, b_end, match_to_end; | |
1053 int a_len, b_len; | |
1054 int delta [MAX_ERRORS], delta_len; | |
1055 | |
1056 do | |
1057 { | |
1058 a_len = Ref_Len - (* ref_hi); | |
1059 if (a_len > MAX_EXTENSION) | |
1060 a_len = MAX_EXTENSION; | |
1061 b_len = Query_Len - (* query_hi); | |
1062 if (b_len > MAX_EXTENSION) | |
1063 b_len = MAX_EXTENSION; | |
1064 | |
1065 errors = Prefix_Edit_Dist | |
1066 (Ref + (* ref_hi) + 1, a_len, | |
1067 Query + (* query_hi) + 1, b_len, | |
1068 MAX_ERRORS - 1, & a_end, & b_end, | |
1069 & match_to_end, delta, & delta_len, TRUE); | |
1070 if (Show_Differences) | |
1071 Show_Diffs (Ref + (* ref_hi) + 1, a_end, | |
1072 Query + (* query_hi) + 1, b_end, | |
1073 delta, delta_len, (* ref_hi) + 1, (* query_hi) + 1); | |
1074 if (Verbose > 0) | |
1075 { | |
1076 printf ("extend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n", | |
1077 ++ ct, (* ref_hi) + 1, (* query_hi) + 1, a_len, b_len, | |
1078 errors, match_to_end ? 'T' : 'F', a_end, b_end); | |
1079 Display_Alignment (Ref + (* ref_hi) + 1, a_end, | |
1080 Query + (* query_hi) + 1, b_end, | |
1081 delta, delta_len); | |
1082 } | |
1083 | |
1084 (* ref_hi) += a_end; | |
1085 (* query_hi) += b_end; | |
1086 sum += errors; | |
1087 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION); | |
1088 | |
1089 return sum; | |
1090 } | |
1091 | |
1092 | |
1093 void Initialize_Globals | |
1094 (void) | |
1095 | |
1096 // Initialize global variables used in this program | |
1097 | |
1098 { | |
1099 int i, offset, del; | |
1100 int e, start; | |
1101 | |
1102 offset = 2; | |
1103 del = 6; | |
1104 for (i = 0; i < MAX_ERRORS; i ++) | |
1105 { | |
1106 Edit_Array [i] = Edit_Space + offset; | |
1107 offset += del; | |
1108 del += 2; | |
1109 } | |
1110 | |
1111 | |
1112 for (i = 0; i <= ERRORS_FOR_FREE; i ++) | |
1113 Edit_Match_Limit [i] = 0; | |
1114 | |
1115 start = 1; | |
1116 for (e = ERRORS_FOR_FREE + 1; e < MAX_ERRORS; e ++) | |
1117 { | |
1118 start = Binomial_Bound (e - ERRORS_FOR_FREE, MAX_ERROR_RATE, | |
1119 start, EDIT_DIST_PROB_BOUND); | |
1120 Edit_Match_Limit [e] = start - 1; | |
1121 assert (Edit_Match_Limit [e] >= Edit_Match_Limit [e - 1]); | |
1122 } | |
1123 | |
1124 for (i = 0; i <= MAX_FRAG_LEN; i ++) | |
1125 Error_Bound [i] = Max_int (10, 1 + (int) (2 * i * MAX_ERROR_RATE)); | |
1126 | |
1127 return; | |
1128 } | |
1129 | |
1130 | |
1131 | |
1132 FILE * Local_File_Open | |
1133 (const char * filename, const char * mode) | |
1134 | |
1135 /* Open Filename in Mode and return a pointer to its control | |
1136 * block. If fail, print a message and exit. */ | |
1137 | |
1138 { | |
1139 FILE * fp; | |
1140 | |
1141 fp = fopen (filename, mode); | |
1142 if (fp == NULL) | |
1143 { | |
1144 fprintf (stderr, "ERROR %d: Could not open file %s \n", | |
1145 errno, filename); | |
1146 perror (strerror (errno)); | |
1147 exit (EXIT_FAILURE); | |
1148 } | |
1149 | |
1150 return fp; | |
1151 } | |
1152 | |
1153 | |
1154 | |
1155 int Max_int | |
1156 (int a, int b) | |
1157 | |
1158 // Return the larger of a and b . | |
1159 | |
1160 { | |
1161 if (a < b) | |
1162 return b; | |
1163 | |
1164 return a; | |
1165 } | |
1166 | |
1167 | |
1168 | |
1169 int Min_int | |
1170 (int a, int b) | |
1171 | |
1172 // Return the smaller of a and b . | |
1173 | |
1174 { | |
1175 if (a < b) | |
1176 return a; | |
1177 | |
1178 return b; | |
1179 } | |
1180 | |
1181 | |
1182 | |
1183 void Parse_Command_Line | |
1184 (int argc, char * argv []) | |
1185 | |
1186 // Get options and parameters from command line with argc | |
1187 // arguments in argv [0 .. (argc - 1)] . | |
1188 | |
1189 { | |
1190 int ch, errflg = FALSE; | |
1191 | |
1192 optarg = NULL; | |
1193 | |
1194 while (! errflg | |
1195 && ((ch = getopt (argc, argv, "De:nN:q:r:Stv:xW:")) != EOF)) | |
1196 switch (ch) | |
1197 { | |
1198 case 'D' : | |
1199 Only_Difference_Positions = true; | |
1200 break; | |
1201 | |
1202 case 'e' : | |
1203 Percent_ID = 1.0 - atof (optarg); | |
1204 UserScoring = TRUE; | |
1205 break; | |
1206 | |
1207 case 'n' : | |
1208 Nucleotides_Only = TRUE; | |
1209 break; | |
1210 | |
1211 case 'N' : | |
1212 Consec_Non_ACGT = strtol (optarg, NULL, 10); | |
1213 break; | |
1214 | |
1215 case 'q' : | |
1216 Query_Suffix = optarg; | |
1217 break; | |
1218 | |
1219 case 'r' : | |
1220 Ref_Suffix = optarg; | |
1221 break; | |
1222 | |
1223 case 'S' : | |
1224 Show_Differences = TRUE; | |
1225 break; | |
1226 | |
1227 case 't' : | |
1228 Tag_From_Fasta_Line = TRUE; | |
1229 break; | |
1230 | |
1231 case 'v' : | |
1232 Verbose = strtol (optarg, NULL, 10); | |
1233 break; | |
1234 | |
1235 case 'x' : | |
1236 Output_Cover_Files = FALSE; | |
1237 break; | |
1238 | |
1239 case 'W' : | |
1240 Error_File_Name = optarg; | |
1241 break; | |
1242 | |
1243 case '?' : | |
1244 fprintf (stderr, "Unrecognized option -%c\n", optopt); | |
1245 | |
1246 default : | |
1247 errflg = TRUE; | |
1248 } | |
1249 | |
1250 if (errflg || optind != argc - 3) | |
1251 { | |
1252 Usage (argv [0]); | |
1253 exit (EXIT_FAILURE); | |
1254 } | |
1255 | |
1256 Ref_File_Path = argv [optind ++]; | |
1257 | |
1258 Match_File_Path = argv [optind ++]; | |
1259 | |
1260 Gaps_File_Path = argv [optind ++]; | |
1261 | |
1262 return; | |
1263 } | |
1264 | |
1265 | |
1266 | |
1267 int Prefix_Edit_Dist | |
1268 (char A [], int m, char T [], int n, int Error_Limit, | |
1269 int * A_End, int * T_End, int * Match_To_End, | |
1270 int Delta [MAX_ERRORS], int * Delta_Len, int extending) | |
1271 | |
1272 // Return the minimum number of changes (inserts, deletes, replacements) | |
1273 // needed to match string A [0 .. (m-1)] with a prefix of string | |
1274 // T [0 .. (n-1)] if it's not more than Error_Limit . | |
1275 // Put delta description of alignment in Delta and set | |
1276 // (* Delta_Len) to the number of entries there if it's a complete | |
1277 // match. | |
1278 // Set A_End and T_End to the rightmost positions where the | |
1279 // alignment ended in A and T , respectively. | |
1280 // Set Match_To_End true if the match extended to the end | |
1281 // of at least one string; otherwise, set it false to indicate | |
1282 // a branch point. | |
1283 // extending indicates whether the match is for a fixed region (FALSE) | |
1284 // or is extending off the end of a match as far as possible (TRUE) | |
1285 | |
1286 { | |
1287 double Score, Max_Score; | |
1288 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0; | |
1289 int Best_d, Best_e, Longest, Row; | |
1290 int Left, Right; | |
1291 int d, e, i, j, shorter; | |
1292 | |
1293 // assert (m <= n); | |
1294 Best_d = Best_e = Longest = 0; | |
1295 (* Delta_Len) = 0; | |
1296 | |
1297 if (Consec_Non_ACGT > 0) | |
1298 { | |
1299 int ct; | |
1300 | |
1301 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++) | |
1302 { | |
1303 switch (A [i]) | |
1304 { | |
1305 case 'a' : | |
1306 case 'c' : | |
1307 case 'g' : | |
1308 case 't' : | |
1309 ct = 0; | |
1310 break; | |
1311 default : | |
1312 ct ++; | |
1313 } | |
1314 } | |
1315 if (ct >= Consec_Non_ACGT) | |
1316 { | |
1317 m = i - ct; | |
1318 extending = TRUE; | |
1319 } | |
1320 | |
1321 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++) | |
1322 { | |
1323 switch (T [i]) | |
1324 { | |
1325 case 'a' : | |
1326 case 'c' : | |
1327 case 'g' : | |
1328 case 't' : | |
1329 ct = 0; | |
1330 break; | |
1331 default : | |
1332 ct ++; | |
1333 } | |
1334 } | |
1335 if (ct >= Consec_Non_ACGT) | |
1336 { | |
1337 n = i - ct; | |
1338 extending = TRUE; | |
1339 } | |
1340 } | |
1341 | |
1342 shorter = Min_int (m, n); | |
1343 for (Row = 0; Row < shorter && A [Row] == T [Row]; Row ++) | |
1344 ; | |
1345 | |
1346 Edit_Array [0] [0] = Row; | |
1347 | |
1348 if (Row == m && Row == n) // Exact match | |
1349 { | |
1350 (* Delta_Len) = 0; | |
1351 (* A_End) = (* T_End) = Row; | |
1352 (* Match_To_End) = ! extending; | |
1353 return 0; | |
1354 } | |
1355 | |
1356 Left = Right = 0; | |
1357 Max_Score = Row * Branch_Pt_Match_Value; | |
1358 Max_Score_Len = Row; | |
1359 Max_Score_Best_d = 0; | |
1360 Max_Score_Best_e = 0; | |
1361 | |
1362 for (e = 1; e <= Error_Limit; e ++) | |
1363 { | |
1364 int cutoff; | |
1365 | |
1366 Left = Max_int (Left - 1, -e); | |
1367 Right = Min_int (Right + 1, e); | |
1368 Edit_Array [e - 1] [Left] = -2; | |
1369 Edit_Array [e - 1] [Left - 1] = -2; | |
1370 Edit_Array [e - 1] [Right] = -2; | |
1371 Edit_Array [e - 1] [Right + 1] = -2; | |
1372 | |
1373 for (d = Left; d <= Right; d ++) | |
1374 { | |
1375 Row = 1 + Edit_Array [e - 1] [d]; | |
1376 if ((j = Edit_Array [e - 1] [d - 1]) > Row) | |
1377 Row = j; | |
1378 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row) | |
1379 Row = j; | |
1380 while (Row < m && Row + d < n | |
1381 && A [Row] == T [Row + d]) | |
1382 Row ++; | |
1383 | |
1384 Edit_Array [e] [d] = Row; | |
1385 | |
1386 if (Row == m && Row + d == n) | |
1387 { | |
1388 if (extending) | |
1389 { | |
1390 for (j = Left; j <= d; j ++) | |
1391 if (Edit_Array [e] [j] > Longest) | |
1392 { | |
1393 Best_d = j; | |
1394 Longest = Edit_Array [e] [j]; | |
1395 } | |
1396 Score = Longest * Branch_Pt_Match_Value - e; | |
1397 if (Score < Max_Score) | |
1398 { | |
1399 (* A_End) = Max_Score_Len; | |
1400 (* T_End) = Max_Score_Len + Max_Score_Best_d; | |
1401 Set_Deltas (Delta, Delta_Len, Max_Score_Len, | |
1402 Max_Score_Best_d, Max_Score_Best_e); | |
1403 (* Match_To_End) = FALSE; | |
1404 return Max_Score_Best_e; | |
1405 } | |
1406 } | |
1407 | |
1408 (* A_End) = Row; // One past last align position | |
1409 (* T_End) = Row + d; | |
1410 Set_Deltas (Delta, Delta_Len, Row, d, e); | |
1411 (* Match_To_End) = ! extending; | |
1412 return e; | |
1413 } | |
1414 } | |
1415 | |
1416 cutoff = Longest - GIVE_UP_LEN; | |
1417 while (Left <= Right && Edit_Array [e] [Left] < cutoff) | |
1418 Left ++; | |
1419 if (Left > Right) | |
1420 break; | |
1421 while (Right > Left && Edit_Array [e] [Right] < cutoff) | |
1422 Right --; | |
1423 | |
1424 assert (Left <= Right); | |
1425 | |
1426 for (d = Left; d <= Right; d ++) | |
1427 if (Edit_Array [e] [d] > Longest) | |
1428 { | |
1429 Best_d = d; | |
1430 Best_e = e; | |
1431 Longest = Edit_Array [e] [d]; | |
1432 } | |
1433 | |
1434 Score = Longest * Branch_Pt_Match_Value - e; | |
1435 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0 | |
1436 if (Score > Max_Score) | |
1437 { | |
1438 Max_Score = Score; | |
1439 Max_Score_Len = Longest; | |
1440 Max_Score_Best_d = Best_d; | |
1441 Max_Score_Best_e = Best_e; | |
1442 } | |
1443 | |
1444 if (Longest - Max_Score_Len >= GIVE_UP_LEN) | |
1445 break; | |
1446 } | |
1447 | |
1448 (* A_End) = Max_Score_Len; | |
1449 (* T_End) = Max_Score_Len + Max_Score_Best_d; | |
1450 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e); | |
1451 (* Match_To_End) = FALSE; | |
1452 | |
1453 return Max_Score_Best_e; | |
1454 } | |
1455 | |
1456 | |
1457 | |
1458 int Read_String | |
1459 (FILE * fp, char * * T, long int * Size, char header []) | |
1460 | |
1461 /* Read next string from fp (assuming FASTA format) into (* T) [1 ..] | |
1462 * which has Size characters. Allocate extra memory if needed | |
1463 * and adjust Size accordingly. Return TRUE if successful, FALSE | |
1464 * otherwise (e.g., EOF). Set header to the contents of the FASTA | |
1465 * header line. */ | |
1466 | |
1467 { | |
1468 char Line [MAX_LINE]; | |
1469 long int Len; | |
1470 int Ch, Ct; | |
1471 | |
1472 if (feof (fp)) | |
1473 return FALSE; | |
1474 | |
1475 while ((Ch = fgetc (fp)) != EOF && Ch != '>') | |
1476 ; | |
1477 | |
1478 if (Ch != '>') | |
1479 return FALSE; | |
1480 | |
1481 fgets (Line, MAX_LINE, fp); | |
1482 Len = strlen (Line); | |
1483 assert (Len > 0 && Line [Len - 1] == '\n'); | |
1484 Line [Len - 1] = '\0'; | |
1485 strcpy (header, Line); | |
1486 | |
1487 | |
1488 Ct = 0; | |
1489 Len = 0; | |
1490 while ((Ch = fgetc (fp)) != EOF && Ch != '>') | |
1491 { | |
1492 if (isspace (Ch)) | |
1493 continue; | |
1494 | |
1495 Ct ++; | |
1496 | |
1497 if (Ct >= (* Size) - 1) | |
1498 { | |
1499 (* Size) = (long int) ((* Size) * EXPANSION_FACTOR); | |
1500 (* T) = (char *) Safe_realloc ((* T), (* Size)); | |
1501 } | |
1502 Ch = tolower (Ch); | |
1503 if (! isalpha (Ch)) | |
1504 { | |
1505 fprintf (stderr, "Unexpected character `%c\' in string %s\n", | |
1506 Ch, header); | |
1507 Ch = 'x'; | |
1508 } | |
1509 (* T) [++ Len] = Ch; | |
1510 } | |
1511 | |
1512 (* T) [Len + 1] = '\0'; | |
1513 if (Ch == '>') | |
1514 ungetc (Ch, fp); | |
1515 | |
1516 return TRUE; | |
1517 } | |
1518 | |
1519 | |
1520 | |
1521 void Rev_Complement | |
1522 (char * s) | |
1523 | |
1524 /* Set string s to its DNA reverse complement. */ | |
1525 | |
1526 { | |
1527 char ch; | |
1528 int i, j, len; | |
1529 | |
1530 len = strlen (s); | |
1531 | |
1532 for (i = 0, j = len - 1; i < j; i ++, j --) | |
1533 { | |
1534 ch = Complement (s [i]); | |
1535 s [i] = Complement (s [j]); | |
1536 s [j] = ch; | |
1537 } | |
1538 | |
1539 if (i == j) | |
1540 s [i] = Complement (s [i]); | |
1541 | |
1542 return; | |
1543 } | |
1544 | |
1545 | |
1546 | |
1547 void Rev_Display_Alignment | |
1548 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct) | |
1549 | |
1550 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] | |
1551 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] | |
1552 // in the reverse direction. | |
1553 | |
1554 { | |
1555 int i, j, k, m, top_len, bottom_len; | |
1556 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; | |
1557 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; | |
1558 | |
1559 i = j = top_len = bottom_len = 0; | |
1560 for (k = 0; k < delta_ct; k ++) | |
1561 { | |
1562 for (m = 1; m < abs (delta [k]); m ++) | |
1563 { | |
1564 top [top_len ++] = a [- i ++]; | |
1565 j ++; | |
1566 } | |
1567 if (delta [k] < 0) | |
1568 { | |
1569 top [top_len ++] = '-'; | |
1570 j ++; | |
1571 } | |
1572 else | |
1573 { | |
1574 top [top_len ++] = a [- i ++]; | |
1575 } | |
1576 } | |
1577 while (i < a_len && j < b_len) | |
1578 { | |
1579 top [top_len ++] = a [- i ++]; | |
1580 j ++; | |
1581 } | |
1582 top [top_len] = '\0'; | |
1583 | |
1584 | |
1585 i = j = 0; | |
1586 for (k = 0; k < delta_ct; k ++) | |
1587 { | |
1588 for (m = 1; m < abs (delta [k]); m ++) | |
1589 { | |
1590 bottom [bottom_len ++] = b [- j ++]; | |
1591 i ++; | |
1592 } | |
1593 if (delta [k] > 0) | |
1594 { | |
1595 bottom [bottom_len ++] = '-'; | |
1596 i ++; | |
1597 } | |
1598 else | |
1599 { | |
1600 bottom [bottom_len ++] = b [- j ++]; | |
1601 } | |
1602 } | |
1603 while (j < b_len && i < a_len) | |
1604 { | |
1605 bottom [bottom_len ++] = b [- j ++]; | |
1606 i ++; | |
1607 } | |
1608 bottom [bottom_len] = '\0'; | |
1609 | |
1610 | |
1611 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) | |
1612 { | |
1613 putchar ('\n'); | |
1614 printf ("A: "); | |
1615 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) | |
1616 putchar (top [i + j]); | |
1617 putchar ('\n'); | |
1618 printf ("B: "); | |
1619 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) | |
1620 putchar (bottom [i + j]); | |
1621 putchar ('\n'); | |
1622 printf (" "); | |
1623 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; | |
1624 j ++) | |
1625 if (top [i + j] != ' ' && bottom [i + j] != ' ' | |
1626 && top [i + j] != bottom [i + j]) | |
1627 putchar ('^'); | |
1628 else | |
1629 putchar (' '); | |
1630 putchar ('\n'); | |
1631 } | |
1632 | |
1633 return; | |
1634 } | |
1635 | |
1636 | |
1637 | |
1638 int Rev_Prefix_Edit_Dist | |
1639 (char A [], int m, char T [], int n, int Error_Limit, | |
1640 int * A_End, int * T_End, int * Match_To_End, | |
1641 int Delta [MAX_ERRORS], int * Delta_Len, int extending) | |
1642 | |
1643 // Return the minimum number of changes (inserts, deletes, replacements) | |
1644 // needed to match string A [0 .. -(m-1)] with a prefix of string | |
1645 // T [0 .. -(n-1)] if it's not more than Error_Limit . | |
1646 // Note: Match is reverse direction, right to left. | |
1647 // Put delta description of alignment in Delta and set | |
1648 // (* Delta_Len) to the number of entries there if it's a complete | |
1649 // match. | |
1650 // Set A_End and T_End to the rightmost positions where the | |
1651 // alignment ended in A and T , respectively. | |
1652 // Set Match_To_End true if the match extended to the end | |
1653 // of at least one string; otherwise, set it false to indicate | |
1654 // a branch point. | |
1655 // extending indicates whether the match is for a fixed region (FALSE) | |
1656 // or is extending off the end of a match as far as possible (TRUE) | |
1657 | |
1658 { | |
1659 double Score, Max_Score; | |
1660 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0; | |
1661 int Best_d, Best_e, Longest, Row; | |
1662 int Left, Right; | |
1663 int d, e, i, j, shorter; | |
1664 | |
1665 // assert (m <= n); | |
1666 Best_d = Best_e = Longest = 0; | |
1667 (* Delta_Len) = 0; | |
1668 | |
1669 if (Consec_Non_ACGT > 0) | |
1670 { | |
1671 int ct; | |
1672 | |
1673 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++) | |
1674 { | |
1675 switch (A [- i]) | |
1676 { | |
1677 case 'a' : | |
1678 case 'c' : | |
1679 case 'g' : | |
1680 case 't' : | |
1681 ct = 0; | |
1682 break; | |
1683 default : | |
1684 ct ++; | |
1685 } | |
1686 } | |
1687 if (ct >= Consec_Non_ACGT) | |
1688 { | |
1689 m = i - ct; | |
1690 extending = TRUE; | |
1691 } | |
1692 | |
1693 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++) | |
1694 { | |
1695 switch (T [- i]) | |
1696 { | |
1697 case 'a' : | |
1698 case 'c' : | |
1699 case 'g' : | |
1700 case 't' : | |
1701 ct = 0; | |
1702 break; | |
1703 default : | |
1704 ct ++; | |
1705 } | |
1706 } | |
1707 if (ct >= Consec_Non_ACGT) | |
1708 { | |
1709 n = i - ct; | |
1710 extending = TRUE; | |
1711 } | |
1712 } | |
1713 | |
1714 shorter = Min_int (m, n); | |
1715 for (Row = 0; Row < shorter && A [- Row] == T [- Row]; Row ++) | |
1716 ; | |
1717 | |
1718 Edit_Array [0] [0] = Row; | |
1719 | |
1720 if (Row == m && Row == n) // Exact match | |
1721 { | |
1722 (* Delta_Len) = 0; | |
1723 (* A_End) = (* T_End) = Row; | |
1724 (* Match_To_End) = ! extending; | |
1725 return 0; | |
1726 } | |
1727 | |
1728 Left = Right = 0; | |
1729 Max_Score = Row * Branch_Pt_Match_Value; | |
1730 Max_Score_Len = Row; | |
1731 Max_Score_Best_d = 0; | |
1732 Max_Score_Best_e = 0; | |
1733 | |
1734 for (e = 1; e <= Error_Limit; e ++) | |
1735 { | |
1736 int cutoff; | |
1737 | |
1738 Left = Max_int (Left - 1, -e); | |
1739 Right = Min_int (Right + 1, e); | |
1740 Edit_Array [e - 1] [Left] = -2; | |
1741 Edit_Array [e - 1] [Left - 1] = -2; | |
1742 Edit_Array [e - 1] [Right] = -2; | |
1743 Edit_Array [e - 1] [Right + 1] = -2; | |
1744 | |
1745 for (d = Left; d <= Right; d ++) | |
1746 { | |
1747 Row = 1 + Edit_Array [e - 1] [d]; | |
1748 if ((j = Edit_Array [e - 1] [d - 1]) > Row) | |
1749 Row = j; | |
1750 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row) | |
1751 Row = j; | |
1752 while (Row < m && Row + d < n | |
1753 && A [- Row] == T [- Row - d]) | |
1754 Row ++; | |
1755 | |
1756 Edit_Array [e] [d] = Row; | |
1757 | |
1758 if (Row == m && Row + d == n) | |
1759 { | |
1760 if (extending) | |
1761 { | |
1762 for (j = Left; j <= d; j ++) | |
1763 if (Edit_Array [e] [j] > Longest) | |
1764 { | |
1765 Best_d = j; | |
1766 Longest = Edit_Array [e] [j]; | |
1767 } | |
1768 Score = Longest * Branch_Pt_Match_Value - e; | |
1769 if (Score < Max_Score) | |
1770 { | |
1771 (* A_End) = Max_Score_Len; | |
1772 (* T_End) = Max_Score_Len + Max_Score_Best_d; | |
1773 Set_Deltas (Delta, Delta_Len, Max_Score_Len, | |
1774 Max_Score_Best_d, Max_Score_Best_e); | |
1775 (* Match_To_End) = FALSE; | |
1776 return Max_Score_Best_e; | |
1777 } | |
1778 } | |
1779 | |
1780 (* A_End) = Row; // One past last align position | |
1781 (* T_End) = Row + d; | |
1782 Set_Deltas (Delta, Delta_Len, Row, d, e); | |
1783 (* Match_To_End) = ! extending; | |
1784 return e; | |
1785 } | |
1786 } | |
1787 | |
1788 cutoff = Longest - GIVE_UP_LEN; | |
1789 while (Left <= Right && Edit_Array [e] [Left] < cutoff) | |
1790 Left ++; | |
1791 if (Left > Right) | |
1792 break; | |
1793 while (Right > Left && Edit_Array [e] [Right] < cutoff) | |
1794 Right --; | |
1795 assert (Left <= Right); | |
1796 | |
1797 for (d = Left; d <= Right; d ++) | |
1798 if (Edit_Array [e] [d] > Longest) | |
1799 { | |
1800 Best_d = d; | |
1801 Best_e = e; | |
1802 Longest = Edit_Array [e] [d]; | |
1803 } | |
1804 | |
1805 Score = Longest * Branch_Pt_Match_Value - e; | |
1806 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0 | |
1807 if (Score > Max_Score) | |
1808 { | |
1809 Max_Score = Score; | |
1810 Max_Score_Len = Longest; | |
1811 Max_Score_Best_d = Best_d; | |
1812 Max_Score_Best_e = Best_e; | |
1813 } | |
1814 | |
1815 | |
1816 if (Longest - Max_Score_Len >= GIVE_UP_LEN) | |
1817 break; | |
1818 } | |
1819 | |
1820 (* A_End) = Max_Score_Len; | |
1821 (* T_End) = Max_Score_Len + Max_Score_Best_d; | |
1822 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e); | |
1823 (* Match_To_End) = FALSE; | |
1824 | |
1825 return Max_Score_Best_e; | |
1826 } | |
1827 | |
1828 | |
1829 | |
1830 void Rev_Show_Diffs | |
1831 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, | |
1832 long int a_ref, long int b_ref) | |
1833 | |
1834 // Show (to stdout ) the differences | |
1835 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] | |
1836 // encoded in delta [0 .. (delta_ct - 1)] | |
1837 // in the reverse direction. a_ref is the position of the beginning | |
1838 // of string a . b_ref is the offset to the start of | |
1839 // string b . | |
1840 | |
1841 { | |
1842 int i, j, k, m; | |
1843 | |
1844 i = j = 0; | |
1845 for (k = 0; k < delta_ct; k ++) | |
1846 { | |
1847 for (m = 1; m < abs (delta [k]); m ++) | |
1848 { | |
1849 if (a [- i] != b [- j]) | |
1850 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]); | |
1851 i ++; | |
1852 j ++; | |
1853 } | |
1854 if (delta [k] < 0) | |
1855 { | |
1856 printf ("%8ld %8ld: ins %c\n", a_ref - i, b_ref - j, b [- j]); | |
1857 j ++; | |
1858 } | |
1859 else | |
1860 { | |
1861 printf ("%8ld %8ld: del %c\n", a_ref - i, b_ref - j, a [- i]); | |
1862 i ++; | |
1863 } | |
1864 } | |
1865 while (i < a_len && j < b_len) | |
1866 { | |
1867 if (a [- i] != b [- j]) | |
1868 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]); | |
1869 i ++; | |
1870 j ++; | |
1871 } | |
1872 | |
1873 return; | |
1874 } | |
1875 | |
1876 | |
1877 | |
1878 void Set_Deltas | |
1879 (int delta [], int * delta_len, int row, int d, int e) | |
1880 | |
1881 // Set delta to values that show the alignment recorded in | |
1882 // global Edit_Array . Set (* delta_len) to the number of | |
1883 // entries set. row is the length of the match in the first | |
1884 // string. d is the offset at the end of the match between | |
1885 // the first and second string. e is the number of errors. | |
1886 | |
1887 { | |
1888 int delta_stack [MAX_ERRORS]; | |
1889 int from, last, max; | |
1890 int i, j, k; | |
1891 | |
1892 last = row; | |
1893 (* delta_len) = 0; | |
1894 | |
1895 for (k = e; k > 0; k --) | |
1896 { | |
1897 from = d; | |
1898 max = 1 + Edit_Array [k - 1] [d]; | |
1899 if ((j = Edit_Array [k - 1] [d - 1]) > max) | |
1900 { | |
1901 from = d - 1; | |
1902 max = j; | |
1903 } | |
1904 if ((j = 1 + Edit_Array [k - 1] [d + 1]) > max) | |
1905 { | |
1906 from = d + 1; | |
1907 max = j; | |
1908 } | |
1909 if (from == d - 1) | |
1910 { | |
1911 delta_stack [(* delta_len) ++] = max - last - 1; | |
1912 d --; | |
1913 last = Edit_Array [k - 1] [from]; | |
1914 } | |
1915 else if (from == d + 1) | |
1916 { | |
1917 delta_stack [(* delta_len) ++] = last - (max - 1); | |
1918 d ++; | |
1919 last = Edit_Array [k - 1] [from]; | |
1920 } | |
1921 } | |
1922 delta_stack [(* delta_len) ++] = last + 1; | |
1923 | |
1924 k = 0; | |
1925 for (i = (* delta_len) - 1; i > 0; i --) | |
1926 delta [k ++] | |
1927 = abs (delta_stack [i]) * Sign (delta_stack [i - 1]); | |
1928 (* delta_len) --; | |
1929 | |
1930 return; | |
1931 } | |
1932 | |
1933 | |
1934 | |
1935 static void Set_Left_Pad | |
1936 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len) | |
1937 | |
1938 // Set pad to a triple of strings, each of length pad_len , | |
1939 // representing a MUM of length mum_len ending at r_hi in Ref | |
1940 // and q_hi in Query . Each string in pad must have been allocated | |
1941 // at least pad_len + 1 bytes of memory. The purpose is to serve as | |
1942 // the left-end padding for an alignment with this MUM as its left | |
1943 // boundary. | |
1944 | |
1945 { | |
1946 int i; | |
1947 | |
1948 for (i = 0; i < 3; i ++) | |
1949 pad [i] [pad_len] = '\0'; | |
1950 | |
1951 for (i = 0; i < pad_len; i ++) | |
1952 { | |
1953 pad [0] [pad_len - i - 1] = r_hi - i > 0 ? Ref [r_hi - i] : '.'; | |
1954 pad [1] [pad_len - i - 1] = q_hi - i > 0 ? Query [q_hi - i] : '.'; | |
1955 pad [2] [pad_len - i - 1] = i < mum_len ? '=' : ' '; | |
1956 } | |
1957 | |
1958 return; | |
1959 } | |
1960 | |
1961 | |
1962 | |
1963 static void Set_Right_Pad | |
1964 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len) | |
1965 | |
1966 // Set pad to a triple of strings, each of length pad_len , | |
1967 // representing a MUM of length mum_len beginning at r_lo in Ref | |
1968 // and q_lo in Query . Each string in pad must have been allocated | |
1969 // at least pad_len + 1 bytes of memory. The purpose is to serve as | |
1970 // the right-end padding for an alignment with this MUM as its right | |
1971 // boundary. | |
1972 | |
1973 { | |
1974 int i; | |
1975 | |
1976 for (i = 0; i < 3; i ++) | |
1977 pad [i] [pad_len] = '\0'; | |
1978 | |
1979 for (i = 0; i < pad_len; i ++) | |
1980 { | |
1981 pad [0] [i] = r_lo + i <= Ref_Len ? Ref [r_lo + i] : '.'; | |
1982 pad [1] [i] = q_lo + i <= Query_Len ? Query [q_lo + i] : '.'; | |
1983 pad [2] [i] = i < mum_len ? '=' : ' '; | |
1984 } | |
1985 | |
1986 return; | |
1987 } | |
1988 | |
1989 | |
1990 | |
1991 void Show_Coverage | |
1992 (Cover_t * list, char * filename, char * tag, char * seq) | |
1993 | |
1994 // Print to stderr summary stats of regions in list . | |
1995 // Send to filename a list of region info suitable for celagram . | |
1996 // Use tag to print the headings. Regions refer to seq . | |
1997 | |
1998 { | |
1999 FILE * fp; | |
2000 Cover_t * p; | |
2001 int ct = 0; | |
2002 long int i, prev_hi = 0, n_ct, len, seq_len; | |
2003 long int cov_ns = 0, gap_ns = 0, total_ns; | |
2004 double cov_total = 0.0, gap_total = 0.0; | |
2005 | |
2006 fp = Local_File_Open (filename, "w"); | |
2007 fprintf (fp, "%-9s %9s %9s %8s %8s %6s\n", | |
2008 "Region", "Start", "End", "Len", "N's", "%N's"); | |
2009 | |
2010 for (p = list; p != NULL; p = p -> next) | |
2011 { | |
2012 n_ct = 0; | |
2013 for (i = prev_hi + 1; i < p -> lo; i ++) | |
2014 if (strchr ("acgt", seq [i]) == NULL) | |
2015 n_ct ++; | |
2016 len = p -> lo - prev_hi - 1; | |
2017 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", | |
2018 ct, prev_hi + 1, p -> lo - 1, len, n_ct, | |
2019 len == 0 ? 0.0 : 100.0 * n_ct / len); | |
2020 gap_total += len; | |
2021 gap_ns += n_ct; | |
2022 | |
2023 ct ++; | |
2024 | |
2025 n_ct = 0; | |
2026 for (i = p -> lo; i <= p -> hi; i ++) | |
2027 if (strchr ("acgt", seq [i]) == NULL) | |
2028 n_ct ++; | |
2029 len = 1 + p -> hi - p -> lo; | |
2030 fprintf (fp, "cov%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", | |
2031 ct, p -> lo, p -> hi, len, n_ct, | |
2032 len == 0 ? 0.0 : 100.0 * n_ct / len); | |
2033 cov_total += len; | |
2034 cov_ns += n_ct; | |
2035 | |
2036 prev_hi = p -> hi; | |
2037 } | |
2038 | |
2039 n_ct = 0; | |
2040 seq_len = strlen (seq + 1); | |
2041 for (i = prev_hi + 1; i <= seq_len; i ++) | |
2042 if (strchr ("acgt", seq [i]) == NULL) | |
2043 n_ct ++; | |
2044 len = seq_len - prev_hi; | |
2045 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", | |
2046 ct, prev_hi + 1, seq_len, len, n_ct, | |
2047 len == 0 ? 0.0 : 100.0 * n_ct / len); | |
2048 gap_total += len; | |
2049 gap_ns += n_ct; | |
2050 | |
2051 total_ns = cov_ns + gap_ns; | |
2052 | |
2053 fclose (fp); | |
2054 | |
2055 fprintf (stderr, "\n%s Sequence Coverage:\n", tag); | |
2056 fprintf (stderr, " Sequence length = %ld\n", seq_len); | |
2057 fprintf (stderr, " acgt's = %ld\n", seq_len - total_ns); | |
2058 fprintf (stderr, " Non acgt's = %ld\n", total_ns); | |
2059 fprintf (stderr, " Number of regions = %d\n", ct); | |
2060 fprintf (stderr, " Matched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n", | |
2061 cov_total, | |
2062 seq_len == 0.0 ? 0.0 : 100.0 * cov_total / seq_len, | |
2063 seq_len - total_ns == 0.0 ? 0.0 : | |
2064 100.0 * (cov_total - cov_ns) / (seq_len - total_ns)); | |
2065 fprintf (stderr, " Avg match len = %.0f\n", | |
2066 ct == 0 ? 0.0 : cov_total / ct); | |
2067 fprintf (stderr, " N's in matches = %ld (%.1f%%)\n", | |
2068 cov_ns, | |
2069 cov_total == 0.0 ? 0.0 : 100.0 * cov_ns / cov_total); | |
2070 fprintf (stderr, " Unmatched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n", | |
2071 gap_total, | |
2072 seq_len == 0.0 ? 0.0 : 100.0 * gap_total / seq_len, | |
2073 seq_len - total_ns == 0.0 ? 0.0 : | |
2074 100.0 * (gap_total - gap_ns) / (seq_len - total_ns)); | |
2075 fprintf (stderr, " Avg gap len = %.0f\n", | |
2076 gap_total / (1.0 + ct)); | |
2077 fprintf (stderr, " N's in gaps = %ld (%.1f%%)\n", | |
2078 gap_ns, | |
2079 gap_total == 0.0 ? 0.0 : 100.0 * gap_ns / gap_total); | |
2080 | |
2081 return; | |
2082 } | |
2083 | |
2084 | |
2085 | |
2086 void Show_Diffs | |
2087 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, | |
2088 long int a_ref, long int b_ref) | |
2089 | |
2090 // Show (to stdout ) the differences | |
2091 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] | |
2092 // encoded in delta [0 .. (delta_ct - 1)] . a_ref is the offset to | |
2093 // the start of string a . b_ref is the offset to the start of | |
2094 // string b . | |
2095 | |
2096 { | |
2097 int i, j, k, m; | |
2098 | |
2099 i = j = 0; | |
2100 for (k = 0; k < delta_ct; k ++) | |
2101 { | |
2102 for (m = 1; m < abs (delta [k]); m ++) | |
2103 { | |
2104 if (a [i] != b [j]) | |
2105 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]); | |
2106 i ++; | |
2107 j ++; | |
2108 } | |
2109 if (delta [k] < 0) | |
2110 { | |
2111 printf ("%8ld %8ld: ins %c\n", a_ref + i - 1, b_ref + j, b [j]); | |
2112 j ++; | |
2113 } | |
2114 else | |
2115 { | |
2116 printf ("%8ld %8ld: del %c\n", a_ref + i, b_ref + j - 1, a [i]); | |
2117 i ++; | |
2118 } | |
2119 } | |
2120 while (i < a_len && j < b_len) | |
2121 { | |
2122 if (a [i] != b [j]) | |
2123 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]); | |
2124 i ++; | |
2125 j ++; | |
2126 } | |
2127 | |
2128 return; | |
2129 } | |
2130 | |
2131 | |
2132 | |
2133 int Sign | |
2134 (int a) | |
2135 | |
2136 // Return the algebraic sign of a . | |
2137 | |
2138 { | |
2139 if (a > 0) | |
2140 return 1; | |
2141 else if (a < 0) | |
2142 return -1; | |
2143 | |
2144 return 0; | |
2145 } | |
2146 | |
2147 | |
2148 | |
2149 void Usage | |
2150 (char * command) | |
2151 | |
2152 // Print to stderr description of options and command line for | |
2153 // this program. command is the command that was used to | |
2154 // invoke it. | |
2155 | |
2156 { | |
2157 fprintf (stderr, | |
2158 "USAGE: %s <RefSequence> <MatchSequences> <GapsFile>\n" | |
2159 "\n" | |
2160 "Combines MUMs in <GapsFile> by extending matches off\n" | |
2161 "ends and between MUMs. <RefSequence> is a fasta file\n" | |
2162 "of the reference sequence. <MatchSequences> is a\n" | |
2163 "multi-fasta file of the sequences matched against the\n" | |
2164 "reference\n" | |
2165 "\n" | |
2166 "Options:\n" | |
2167 "-D Only output to stdout the difference positions\n" | |
2168 " and characters\n" | |
2169 "-n Allow matches only between nucleotides, i.e., ACGTs\n" | |
2170 "-N num Break matches at <num> or more consecutive non-ACGTs \n" | |
2171 "-q tag Used to label query match\n" | |
2172 "-r tag Used to label reference match\n" | |
2173 "-S Output all differences in strings\n" | |
2174 "-t Label query matches with query fasta header\n" | |
2175 "-v num Set verbose level for extra output\n" | |
2176 "-W file Reset the default output filename witherrors.gaps\n" | |
2177 "-x Don't output .cover files\n" | |
2178 "-e Set error-rate cutoff to e (e.g. 0.02 is two percent)\n", | |
2179 | |
2180 command); | |
2181 | |
2182 return; | |
2183 } |