comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/combineMUMs.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 /*************************************************
2 * Module: CombineMUMs.cc
3 * Author: Art Delcher
4 * Description:
5 * Take clusters of MUMs (from mgaps program) and do alignments
6 * to combine them into matches.
7 *
8 */
9
10
11 #include "tigrinc.hh"
12 #include <string>
13 using namespace std;
14
15
16 // Constants
17
18 #define BRANCH_PT_MATCH_VALUE 0.272
19 // Value to add for a match in finding branch points
20 // 1.20 was the calculated value for 6% vs 35% error discrimination
21 // Converting to integers didn't make it faster
22 #define BRANCH_PT_ERROR_VALUE -0.728
23 // Value to add for a mismatch in finding branch points
24 // -2.19 was the calculated value for 6% vs 35% error discrimination
25 // Converting to integers didn't make it faster
26 #define DEFAULT_ERROR_FILE_NAME "witherrors.gaps"
27 // Name of file produced that matches the input gaps file
28 // but with an extra column showing the number of errors in
29 // each gap
30 #define DEFAULT_PAD 10
31 // Default number of matching bases to show at beginning
32 // and end of alignments
33 #define EDIT_DIST_PROB_BOUND 1e-4
34 // Probability limit to "band" edit-distance calculation
35 // Determines NORMAL_DISTRIB_THOLD
36 #define ERRORS_FOR_FREE 1
37 // The number of errors that are ignored in setting probability
38 // bound for terminating alignment extensions in edit distance
39 // calculations
40 #define EXPANSION_FACTOR 1.4
41 // Factor by which to grow memory when realloc'ing
42 #define GIVE_UP_LEN 200
43 // Stop alignment when go this many bases past max-score value
44 #define INITIAL_BUFFER_SIZE 10000
45 // Initial number of bytes to use for sequence strings
46 #define MATCH_SCORE 1.0
47 // Score to add for match in finding length of alignment
48 #define MAX_ERROR_RATE 0.06
49 // The largest error allowed in overlaps
50 #define MAX_FRAG_LEN 1024
51 // The longest fragment allowed
52 #define MAX_ERRORS 500
53 // Most errors in any edit distance computation
54 #define MAX_EXTENSION 10000
55 // Maximum extension match out from a match
56 #define MAX_HDR_LEN 10000
57 // Longest allowable fasta header line
58 #define MAX_MEMORY_STORE 50000
59 // The most fragments allowed in memory store
60 #define MIN_BRANCH_END_DIST 20
61 // Branch points must be at least this many bases from the
62 // end of the fragment to be reported
63 #define MIN_BRANCH_TAIL_SLOPE 0.20
64 // Branch point tails must fall off from the max by at least
65 // this rate
66 #define MIN_MATCH_LEN 40
67 // Number of bases in match region in order to count it
68 #define MISMATCH_SCORE -3.0
69 // Score to add for non-match in finding length of alignment
70 #define NORMAL_DISTRIB_THOLD 3.62
71 // Determined by EDIT_DIST_PROB_BOUND
72 #define REALLY_VERBOSE 0
73 // If 1 prints tons of stuff
74 #define VERBOSE 0
75 // If 1 prints lots of stuff
76
77
78
79 // Type definitions
80
81 typedef struct s_Cover_t
82 {
83 long int lo, hi;
84 struct s_Cover_t * next;
85 } Cover_t;
86
87
88
89 // Globals
90
91 float Branch_Pt_Match_Value = BRANCH_PT_MATCH_VALUE;
92 // Score used for matches to locate branch points, i.e.,
93 // where alignment score peaks
94 float Branch_Pt_Error_Value = BRANCH_PT_ERROR_VALUE;
95 // Score used for mismatches to locate branch points, i.e.,
96 int Consec_Non_ACGT = 0;
97 // Stop alignments when encounter at least this many non-acgt characters.
98 int * Edit_Array [MAX_ERRORS];
99 // Use for alignment calculation. Points into Edit_Space .
100 int Edit_Match_Limit [MAX_ERRORS] = {0};
101 // This array [e] is the minimum value of Edit_Array [e] [d]
102 // to be worth pursuing in edit-distance computations between guides
103 int Edit_Space [(MAX_ERRORS + 4) * MAX_ERRORS];
104 // Memory used by alignment calculation
105 int Error_Bound [MAX_FRAG_LEN + 1];
106 // This array [i] is the maximum number of errors allowed
107 // in a match between sequences of length i , which is
108 // i * MAXERROR_RATE .
109 char * Error_File_Name = DEFAULT_ERROR_FILE_NAME;
110 // Name of file to write gaps listing with # errors in each gap
111 int Fill_Ct = 0;
112 // Number of non-acgt bases in ref sequence
113 char * Gaps_File_Path = NULL;
114 // Name of file produced by mgaps program
115 char * Match_File_Path = NULL;
116 // Name of multifasta file of sequences to compare against the reference
117 // sequence
118 int UserScoring = FALSE;
119 // If TRUE, then user specified a percent ID cutoff and scoring
120 // is adjusted to enforce this in extensions
121 int Nucleotides_Only = FALSE;
122 // If TRUE , then only acgt's can match
123 bool Only_Difference_Positions = false;
124 // If true , then output just positions of difference instead of
125 // alignments between exact matches
126 int Output_Cover_Files = TRUE;
127 // If TRUE , output files showing coverage of each genome.
128 double Percent_ID;
129 // Desired identity rate of matches to be found
130 // Expressed as a fraction, not really a percent
131 char * Query = NULL;
132 // The query sequence
133 long int Query_Len;
134 // The length of the query sequence
135 char * Query_Suffix = "Query";
136 // Suffix for query tag
137 char * Ref = NULL;
138 // The reference sequence
139 char * Ref_File_Path = NULL;
140 // Name of (single) fasta file of reference sequence
141 long int Ref_Len;
142 // The length of the reference sequence
143 long int Ref_Size;
144 // The size of the reference sequence buffer
145 char * Ref_Suffix = "Ref";
146 // Suffix for reference tag
147 int Show_Differences = FALSE;
148 // If TRUE then show differences in all alignments
149 int Tag_From_Fasta_Line = FALSE;
150 // If TRUE then use fasta tag from ref & query sequences as
151 // 1st & 2nd column, resp., on match line to identify matches
152 int Verbose = 0;
153 // Controls printing of extra debuggin information
154
155
156 bool Global_Debug_Flag = false;
157
158
159
160 // Functions
161
162 void Add_Coverage
163 (Cover_t * * list, long int lo, long int hi);
164 int Binomial_Bound
165 (int, double, int, double);
166 void Display_Alignment
167 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
168 void Display_Alignment_With_Pad
169 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
170 char * left_pad [3], char * right_pad [3], int pad_len);
171 void Display_Difference_Positions
172 (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
173 int b_incr, int delta [], int delta_ct, const char * a_tag,
174 const char * b_tag);
175 int Extend_Backward
176 (long int * ref_lo, long int * query_lo);
177 int Extend_Forward
178 (long int * ref_hi, long int * query_hi);
179 void Initialize_Globals
180 (void);
181 FILE * Local_File_Open
182 (const char * filename, const char * mode);
183 int Max_int
184 (int a, int b);
185 int Min_int
186 (int a, int b);
187 void Parse_Command_Line
188 (int argc, char * argv []);
189 int Prefix_Edit_Dist
190 (char A [], int m, char T [], int n, int Error_Limit,
191 int * A_End, int * T_End, int * Match_To_End,
192 int Delta [MAX_ERRORS], int * Delta_Len, int extending);
193 int Read_String
194 (FILE * fp, char * * T, long int * Size, char header []);
195 void Rev_Complement
196 (char * s);
197 void Rev_Display_Alignment
198 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
199 int Rev_Prefix_Edit_Dist
200 (char A [], int m, char T [], int n, int Error_Limit,
201 int * A_End, int * T_End, int * Match_To_End,
202 int Delta [MAX_ERRORS], int * Delta_Len, int extending);
203 void Rev_Show_Diffs
204 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
205 long int a_ref, long int b_ref);
206 void Set_Deltas
207 (int delta [], int * delta_len, int row, int d, int e);
208 static void Set_Left_Pad
209 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len);
210 static void Set_Right_Pad
211 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len);
212 void Show_Coverage
213 (Cover_t * list, char * filename, char * tag, char * seq);
214 void Show_Diffs
215 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
216 long int a_ref, long int b_ref);
217 int Sign
218 (int a);
219 void Usage
220 (char * command);
221
222
223
224 int main
225 (int argc, char * argv [])
226
227 {
228 FILE * fp, * match_fp, * error_fp;
229 char ref_header [MAX_HDR_LEN];
230 char header [MAX_HDR_LEN], ref_tag [MAX_HDR_LEN], query_tag [MAX_HDR_LEN];
231 char line [MAX_LINE], filename [MAX_LINE];
232 char * left_pad [3], * right_pad [3];
233 long int ref_pos, ref_lo, ref_hi;
234 long int query_pos, query_lo, query_hi, query_size;
235 long int match_len, prior_match_len = 0, max_len;
236 double error = 0.0;
237 long int total_errors = 0;
238 int match_ct = 0;
239 double match_total = 0.0;
240 int is_forward = FALSE;
241 int first_match = TRUE;
242 Cover_t * ref_cover_list = NULL;
243 Cover_t * query_cover_list = NULL;
244 char * p;
245 int i;
246
247 Parse_Command_Line (argc, argv);
248
249 if(UserScoring){
250 // percentID = mismatch penalty / (match bonus + mismatch penalty)
251 // or, more compactly, p = mm / (m+mm)
252 // Since we require that m+mm = 1, m=1-mm
253 // and so we get the trivial mm = p
254 //
255 // This means that Branch_Pt_Error_Value = -p
256 // and then Branch_Pt_Match_Value = 1-p
257 //
258 // Make it so:
259 Branch_Pt_Error_Value = - Percent_ID;
260 Branch_Pt_Match_Value = 1.0 - Percent_ID;
261 }
262
263 Initialize_Globals ();
264
265 p = strrchr (Ref_File_Path, '/');
266 if (p == NULL)
267 strcpy (ref_tag, Ref_File_Path);
268 else
269 strcpy (ref_tag, p + 1);
270 p = strrchr (ref_tag, '.');
271 if (p != NULL)
272 (* p) = '\0';
273 strcat (ref_tag, Ref_Suffix);
274
275 p = strrchr (Match_File_Path, '/');
276 if (p == NULL)
277 strcpy (query_tag, Match_File_Path);
278 else
279 strcpy (query_tag, p + 1);
280 p = strrchr (query_tag, '.');
281 if (p != NULL)
282 (* p) = '\0';
283 strcat (query_tag, Query_Suffix);
284
285 fp = Local_File_Open (Ref_File_Path, "r");
286 Ref_Size = 100000;
287 Ref = (char *) Safe_malloc (Ref_Size);
288
289 assert (Read_String (fp, & Ref, & Ref_Size, ref_header));
290 if (Tag_From_Fasta_Line)
291 strcpy (ref_tag, strtok (ref_header, " \t\n>"));
292 fclose (fp);
293
294 Ref_Len = strlen (Ref + 1);
295 Ref = (char *) Safe_realloc (Ref, 1 + Ref_Len);
296 for (i = 1; i <= Ref_Len; i ++)
297 switch (Ref [i])
298 {
299 case 'a' :
300 case 'c' :
301 case 'g' :
302 case 't' :
303 break;
304 default :
305 if (Nucleotides_Only)
306 Ref [i] = '2';
307 Fill_Ct ++;
308 }
309
310 for (i = 0; i < 3; i ++)
311 {
312 left_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
313 right_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
314 }
315
316 fp = Local_File_Open (Gaps_File_Path, "r");
317 match_fp = Local_File_Open (Match_File_Path, "r");
318 query_size = 100000;
319 Query = (char *) Safe_malloc (query_size);
320
321 error_fp = Local_File_Open (Error_File_Name, "w");
322
323 while (fgets (line, MAX_LINE, fp) != NULL)
324 {
325 int line_len = strlen (line);
326
327 if (line [0] == '>')
328 {
329 if (! first_match)
330 {
331 total_errors += Extend_Forward (& ref_hi, & query_hi);
332 max_len = 1 + ref_hi - ref_lo;
333 if (1 + abs (query_hi - query_lo) > max_len)
334 max_len = 1 + abs (query_hi - query_lo);
335 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
336 if (! Only_Difference_Positions)
337 printf
338 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
339 ref_lo, ref_hi,
340 is_forward ? query_lo : 1 + Query_Len - query_lo,
341 is_forward ? query_hi : 1 + Query_Len - query_hi,
342 total_errors, max_len, 100.0 * error);
343
344 match_ct ++;
345 match_total += 1 + ref_hi - ref_lo;
346 total_errors = 0;
347 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
348 if (is_forward)
349 Add_Coverage (& query_cover_list, query_lo, query_hi);
350 else
351 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
352 1 + Query_Len - query_lo);
353 }
354 if (Tag_From_Fasta_Line)
355 {
356 char buff [MAX_LINE];
357 char * p;
358
359 strcpy (buff, line + 1);
360 p = strtok (buff, " >\t\n");
361 if (strlen (p) > 0)
362 strcpy (query_tag, p);
363 else
364 {
365 fprintf (stderr, "No tag on line %s", line);
366 strcpy (query_tag, "???");
367 }
368 }
369 is_forward = ! is_forward;
370 if (is_forward)
371 {
372 assert (Read_String (match_fp, & Query, & query_size, header));
373 Query_Len = strlen (Query + 1);
374 if (Nucleotides_Only)
375 {
376 for (i = 1; i <= Query_Len; i ++)
377 switch (Query [i])
378 {
379 case 'a' :
380 case 'c' :
381 case 'g' :
382 case 't' :
383 break;
384 default :
385 Query [i] = '1';
386 }
387 }
388 }
389 else
390 Rev_Complement (Query + 1);
391 first_match = TRUE;
392 printf ("%s", line);
393 fprintf (error_fp, "%s", line);
394 }
395 else if (line [0] == '#')
396 {
397 if (! first_match)
398 {
399 total_errors += Extend_Forward (& ref_hi, & query_hi);
400 max_len = 1 + ref_hi - ref_lo;
401 if (1 + abs (query_hi - query_lo) > max_len)
402 max_len = 1 + abs (query_hi - query_lo);
403 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
404 if (! Only_Difference_Positions)
405 printf
406 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
407 ref_lo, ref_hi,
408 is_forward ? query_lo : 1 + Query_Len - query_lo,
409 is_forward ? query_hi : 1 + Query_Len - query_hi,
410 total_errors, max_len, 100.0 * error);
411
412 match_ct ++;
413 match_total += 1 + ref_hi - ref_lo;
414 total_errors = 0;
415 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
416 if (is_forward)
417 Add_Coverage (& query_cover_list, query_lo, query_hi);
418 else
419 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
420 1 + Query_Len - query_lo);
421 }
422 first_match = TRUE;
423 printf ("%s", line);
424 fprintf (error_fp, "%s", line);
425 }
426 else
427 {
428 assert (sscanf (line, "%ld %ld %ld",
429 & ref_pos, & query_pos, & match_len) == 3);
430 if (first_match)
431 {
432 ref_lo = ref_pos;
433 query_lo = query_pos;
434 total_errors += Extend_Backward (& ref_lo, & query_lo);
435 if (! Only_Difference_Positions)
436 printf ("%s", line);
437 if (line_len > 0)
438 line [line_len - 1] = '\0';
439 fprintf (error_fp, "%s %7s\n", line, "-");
440 }
441 else
442 {
443 int errors;
444 int a_len, b_len;
445 int a_end, b_end, match_to_end;
446 int delta [MAX_ERRORS], delta_len;
447
448 a_len = ref_pos - ref_hi - 1;
449 b_len = query_pos - query_hi - 1;
450
451 errors = Prefix_Edit_Dist
452 (Ref + ref_hi + 1, a_len,
453 Query + query_hi + 1, b_len,
454 MAX_ERRORS - 1, & a_end, & b_end,
455 & match_to_end, delta, & delta_len, FALSE);
456 if (Show_Differences)
457 Show_Diffs (Ref + ref_hi + 1, a_end,
458 Query + query_hi + 1, b_end,
459 delta, delta_len, ref_hi + 1, query_hi + 1);
460
461 if (! Only_Difference_Positions)
462 printf ("%s", line);
463
464 Set_Left_Pad (left_pad, ref_hi, query_hi, prior_match_len,
465 DEFAULT_PAD);
466 Set_Right_Pad (right_pad, ref_pos, query_pos, match_len,
467 DEFAULT_PAD);
468
469 if (a_end == 0 && b_end == 0)
470 {
471 if (! Only_Difference_Positions)
472 printf (" No extension\n");
473
474 if (line_len > 0)
475 line [line_len - 1] = '\0';
476 fprintf (error_fp, "%s %7s\n", line, "-");
477 }
478 else if (Only_Difference_Positions)
479 {
480 int q_start, b_incr;
481
482 if (is_forward)
483 {
484 q_start = query_hi + 1;
485 b_incr = 1;
486 }
487 else
488 {
489 q_start = Query_Len - query_hi;
490 b_incr = -1;
491 }
492 Display_Difference_Positions
493 (Ref + ref_hi + 1, ref_hi + 1, a_end,
494 Query + query_hi + 1, q_start, b_end,
495 b_incr, delta, delta_len, ref_tag,
496 query_tag);
497 }
498 else
499 {
500 printf (" Errors = %d\n", errors);
501 if (! Only_Difference_Positions)
502 Display_Alignment_With_Pad
503 (Ref + ref_hi + 1, a_end,
504 Query + query_hi + 1, b_end,
505 delta, delta_len, left_pad, right_pad,
506 DEFAULT_PAD);
507 if (line_len > 0)
508 line [line_len - 1] = '\0';
509 fprintf (error_fp, "%s %7d\n", line, errors);
510 }
511
512 total_errors += errors;
513
514 if (! match_to_end)
515 {
516 ref_hi += a_end;
517 query_hi += b_end;
518 max_len = 1 + ref_hi - ref_lo;
519 if (1 + abs (query_hi - query_lo) > max_len)
520 max_len = 1 + abs (query_hi - query_lo);
521 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
522 if (! Only_Difference_Positions)
523 printf (
524 "Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
525 ref_lo, ref_hi,
526 is_forward ? query_lo : 1 + Query_Len - query_lo,
527 is_forward ? query_hi : 1 + Query_Len - query_hi,
528 total_errors, max_len, 100.0 * error);
529
530 match_ct ++;
531 match_total += 1 + ref_hi - ref_lo;
532 total_errors = 0;
533 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
534 if (is_forward)
535 Add_Coverage (& query_cover_list, query_lo, query_hi);
536 else
537 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
538 1 + Query_Len - query_lo);
539 ref_lo = ref_pos;
540 query_lo = query_pos;
541 total_errors += Extend_Backward (& ref_lo, & query_lo);
542 }
543 }
544
545 ref_hi = ref_pos + match_len - 1;
546 query_hi = query_pos + match_len - 1;
547 prior_match_len = match_len;
548 first_match = FALSE;
549 }
550 }
551
552 if (! first_match)
553 {
554 total_errors += Extend_Forward (& ref_hi, & query_hi);
555 max_len = 1 + ref_hi - ref_lo;
556 if (1 + abs (query_hi - query_lo) > max_len)
557 max_len = 1 + abs (query_hi - query_lo);
558 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
559 if (! Only_Difference_Positions)
560 printf
561 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
562 ref_lo, ref_hi,
563 is_forward ? query_lo : 1 + Query_Len - query_lo,
564 is_forward ? query_hi : 1 + Query_Len - query_hi,
565 total_errors, max_len, 100.0 * error);
566
567 match_ct ++;
568 match_total += 1 + ref_hi - ref_lo;
569 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
570 if (is_forward)
571 Add_Coverage (& query_cover_list, query_lo, query_hi);
572 else
573 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
574 1 + Query_Len - query_lo);
575 }
576
577 fprintf (stderr, " Ref len = %ld\n", Ref_Len);
578 fprintf (stderr, " acgt's = %ld\n", Ref_Len - Fill_Ct);
579 fprintf (stderr, " Non acgt's = %d\n", Fill_Ct);
580 fprintf (stderr, " Number of matches = %d\n", match_ct);
581 fprintf (stderr, "Sum of match bases = %.0f\n", match_total);
582 fprintf (stderr, " Avg match bases = %.0f\n",
583 match_ct == 0 ? 0.0 : match_total / match_ct);
584
585 fclose (error_fp);
586
587 if (Output_Cover_Files)
588 {
589 strcpy (filename, ref_tag);
590 strcat (filename, ".cover");
591 Show_Coverage (ref_cover_list, filename, ref_tag, Ref);
592
593 strcpy (filename, query_tag);
594 strcat (filename, ".cover");
595 Rev_Complement (Query + 1);
596 Show_Coverage (query_cover_list, filename, query_tag, Query);
597 }
598
599 return 0;
600 }
601
602
603
604 void Add_Coverage
605 (Cover_t * * list, long int lo, long int hi)
606
607 // Add lo .. hi to list of regions covered in (* list) .
608 // Combine nodes when appropriate.
609
610 {
611 Cover_t * new_node, * p, * prev = NULL;
612
613 if ((* list) == NULL || hi + 1 < (* list) -> lo)
614 {
615 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
616 new_node -> lo = lo;
617 new_node -> hi = hi;
618 new_node -> next = (* list);
619 (* list) = new_node;
620 return;
621 }
622
623 for (p = (* list); p != NULL && lo - 1 > p -> hi; p = p -> next)
624 prev = p;
625
626 if (p == NULL || hi + 1 < p -> lo)
627 { // insert between or on end
628 assert (prev != NULL);
629 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
630 new_node -> lo = lo;
631 new_node -> hi = hi;
632 new_node -> next = p;
633 prev -> next = new_node;
634 return;
635 }
636
637 if (lo < p -> lo)
638 p -> lo = lo;
639 while (p -> next != NULL && hi + 1 >= p -> next -> lo)
640 {
641 Cover_t * save;
642
643 p -> hi = p -> next -> hi;
644 save = p -> next;
645 p -> next = save -> next;
646 free (save);
647 }
648 if (hi > p -> hi)
649 p -> hi = hi;
650
651 return;
652 }
653
654
655
656 int Binomial_Bound
657 (int e, double p, int Start, double Limit)
658
659 // Return the smallest n >= Start s.t.
660 // prob [>= e errors in n binomial trials (p = error prob)]
661 // > Limit
662
663 {
664 double Normal_Z, Mu_Power, Factorial, Poisson_Coeff;
665 double q, Sum, P_Power, Q_Power, X;
666 int k, n, Bin_Coeff, Ct;
667
668 q = 1.0 - p;
669 if (Start < e)
670 Start = e;
671
672 for (n = Start; n < MAX_FRAG_LEN; n ++)
673 {
674 if (n <= 35)
675 {
676 Sum = 0.0;
677 Bin_Coeff = 1;
678 Ct = 0;
679 P_Power = 1.0;
680 Q_Power = pow (q, (double) n);
681
682 for (k = 0; k < e && 1.0 - Sum > Limit; k ++)
683 {
684 X = Bin_Coeff * P_Power * Q_Power;
685 Sum += X;
686 Bin_Coeff *= n - Ct;
687 Bin_Coeff /= ++ Ct;
688 P_Power *= p;
689 Q_Power /= q;
690 }
691 if (1.0 - Sum > Limit)
692 return n;
693 }
694 else
695 {
696 Normal_Z = (e - 0.5 - n * p) / sqrt ((double)(n * p * q));
697 if (Normal_Z <= NORMAL_DISTRIB_THOLD)
698 return n;
699 Sum = 0.0;
700 Mu_Power = 1.0;
701 Factorial = 1.0;
702 Poisson_Coeff = exp (- (double) n * p);
703 for (k = 0; k < e; k ++)
704 {
705 Sum += Mu_Power * Poisson_Coeff / Factorial;
706 Mu_Power *= n * p;
707 Factorial *= k + 1;
708 }
709 if (1.0 - Sum > Limit)
710 return n;
711 }
712 }
713
714 return MAX_FRAG_LEN;
715 }
716
717
718
719 #define DISPLAY_WIDTH 60
720
721 void Display_Alignment
722 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
723
724 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
725 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
726
727 {
728 int i, j, k, m, top_len, bottom_len;
729 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
730 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
731
732 i = j = top_len = bottom_len = 0;
733 for (k = 0; k < delta_ct; k ++)
734 {
735 for (m = 1; m < abs (delta [k]); m ++)
736 {
737 top [top_len ++] = a [i ++];
738 j ++;
739 }
740 if (delta [k] < 0)
741 {
742 top [top_len ++] = '-';
743 j ++;
744 }
745 else
746 {
747 top [top_len ++] = a [i ++];
748 }
749 }
750 while (i < a_len && j < b_len)
751 {
752 top [top_len ++] = a [i ++];
753 j ++;
754 }
755 top [top_len] = '\0';
756
757
758 i = j = 0;
759 for (k = 0; k < delta_ct; k ++)
760 {
761 for (m = 1; m < abs (delta [k]); m ++)
762 {
763 bottom [bottom_len ++] = b [j ++];
764 i ++;
765 }
766 if (delta [k] > 0)
767 {
768 bottom [bottom_len ++] = '-';
769 i ++;
770 }
771 else
772 {
773 bottom [bottom_len ++] = b [j ++];
774 }
775 }
776 while (j < b_len && i < a_len)
777 {
778 bottom [bottom_len ++] = b [j ++];
779 i ++;
780 }
781 bottom [bottom_len] = '\0';
782
783
784 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
785 {
786 printf ("A: ");
787 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
788 putchar (top [i + j]);
789 putchar ('\n');
790 printf ("B: ");
791 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
792 putchar (bottom [i + j]);
793 putchar ('\n');
794 printf (" ");
795 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
796 j ++)
797 if (top [i + j] != ' ' && bottom [i + j] != ' '
798 && top [i + j] != bottom [i + j])
799 putchar ('^');
800 else
801 putchar (' ');
802 putchar ('\n');
803 }
804
805 return;
806 }
807
808
809
810 void Display_Alignment_With_Pad
811 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
812 char * left_pad [3], char * right_pad [3], int pad_len)
813
814 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
815 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
816 // Attach pad strings in left_pad to the beginning of the alignment
817 // and the strings in right_pad to the end of the alignment.
818 // pad_len is the width of the pad strings
819
820 {
821 int i, j, k, m, top_len, bottom_len;
822 #if 0
823 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
824 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
825 #else
826 string top, bottom;
827 #endif
828
829 for (k = 0; k < pad_len; k ++)
830 {
831 top . push_back (left_pad [0] [k]);
832 bottom . push_back (left_pad [1] [k]);
833 }
834
835 i = j = 0;
836 for (k = 0; k < delta_ct; k ++)
837 {
838 for (m = 1; m < abs (delta [k]); m ++)
839 {
840 top . push_back (a [i ++]);
841 j ++;
842 }
843 if (delta [k] < 0)
844 {
845 top . push_back ('-');
846 j ++;
847 }
848 else
849 {
850 top . push_back (a [i ++]);
851 }
852 }
853 while (i < a_len && j < b_len)
854 {
855 top . push_back (a [i ++]);
856 j ++;
857 }
858
859 i = j = 0;
860 for (k = 0; k < delta_ct; k ++)
861 {
862 for (m = 1; m < abs (delta [k]); m ++)
863 {
864 bottom . push_back (b [j ++]);
865 i ++;
866 }
867 if (delta [k] > 0)
868 {
869 bottom . push_back ('-');
870 i ++;
871 }
872 else
873 {
874 bottom . push_back (b [j ++]);
875 }
876 }
877 while (j < b_len && i < a_len)
878 {
879 bottom . push_back (b [j ++]);
880 i ++;
881 }
882
883 for (k = 0; k < pad_len; k ++)
884 {
885 top . push_back (right_pad [0] [k]);
886 bottom . push_back (right_pad [1] [k]);
887 }
888
889 top_len = top . length ();
890 bottom_len = bottom . length ();
891
892 if (top_len != bottom_len)
893 {
894 fprintf (stderr, "ERROR: Alignment length mismatch top = %d bottom = %d\n",
895 top_len, bottom_len);
896 exit (EXIT_FAILURE);
897 }
898
899 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
900 {
901 printf ("A: ");
902 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
903 putchar (top [i + j]);
904 putchar ('\n');
905 printf ("B: ");
906 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
907 putchar (bottom [i + j]);
908 putchar ('\n');
909 printf (" ");
910 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
911 j ++)
912 if (i + j < pad_len)
913 putchar (left_pad [2] [i + j]);
914 else if (top_len - i - j <= pad_len)
915 putchar (right_pad [2] [i + j - (top_len - pad_len)]);
916 else if (top [i + j] != ' ' && bottom [i + j] != ' '
917 && top [i + j] != bottom [i + j])
918 putchar ('^');
919 else
920 putchar (' ');
921 putchar ('\n');
922 }
923
924 return;
925 }
926
927
928
929 void Display_Difference_Positions
930 (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
931 int b_incr, int delta [], int delta_ct, const char * a_tag,
932 const char * b_tag)
933
934 // Show (to stdout ) the positions indicated in delta [0 .. (delta_ct - 1)]
935 // between strings a [0 .. (a_len - 1)] and
936 // b [0 .. (b_len - 1))] . The positions of the starts of the strings
937 // are at a_start and b_start , respectively, and b_incr indicates
938 // if the b postions increase or decrease. b_incr must be 1 or -1.
939 // Use a_tag and b_tag to label the positions.
940
941 {
942 int i, j, k, m;
943
944 assert (b_incr == 1 || b_incr == -1);
945
946 i = j = 0;
947 for (k = 0; k < delta_ct; k ++)
948 {
949 for (m = 1; m < abs (delta [k]); m ++)
950 {
951 if (a [i] != b [j])
952 printf ("%-15s %-15s %8d %8d %c %c\n",
953 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
954 i ++;
955 j ++;
956 }
957 if (delta [k] < 0)
958 {
959 printf ("%-15s %-15s %8d %8d %c %c\n",
960 a_tag, b_tag, a_start + i, b_start + j * b_incr, '-', b [j]);
961 j ++;
962 }
963 else
964 {
965 if (b_incr > 0)
966 printf ("%-15s %-15s %8d %8d %c %c\n",
967 a_tag, b_tag, a_start + i, b_start + j, a [i], '-');
968 else
969 printf ("%-15s %-15s %8d %8d %c %c\n",
970 a_tag, b_tag, a_start + i, b_start - j + 1, a [i], '-');
971 i ++;
972 }
973 }
974 while (i < a_len && j < b_len)
975 {
976 if (a [i] != b [j])
977 printf ("%-15s %-15s %8d %8d %c %c\n",
978 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
979 i ++;
980 j ++;
981 }
982
983 return;
984 }
985
986
987
988 int Extend_Backward
989 (long int * ref_lo, long int * query_lo)
990
991 // Do edit distance off end of match beginning at (* ref_lo) and
992 // (* query_lo) in the reference and query sequences, resp.,
993 // in the reverse direction.
994 // Return the number of errors in the extension and set (* ref_hi)
995 // and (* query_hi) to the end of the extension.
996
997 {
998 int ct = 0, errors, sum = 0;
999 int a_len, b_len;
1000 int a_end, b_end, match_to_end;
1001 int delta [MAX_ERRORS], delta_len;
1002
1003 do
1004 {
1005 a_len = (* ref_lo) - 1;
1006 if (a_len > MAX_EXTENSION)
1007 a_len = MAX_EXTENSION;
1008 b_len = (* query_lo) - 1;
1009 if (b_len > MAX_EXTENSION)
1010 b_len = MAX_EXTENSION;
1011
1012 errors = Rev_Prefix_Edit_Dist
1013 (Ref + (* ref_lo) - 1, a_len,
1014 Query + (* query_lo) - 1, b_len,
1015 MAX_ERRORS - 1, & a_end, & b_end,
1016 & match_to_end, delta, & delta_len, TRUE);
1017 if (Show_Differences)
1018 Rev_Show_Diffs
1019 (Ref + (* ref_lo - 1), a_end,
1020 Query + (* query_lo - 1), b_end,
1021 delta, delta_len, (* ref_lo) - 1, (* query_lo) - 1);
1022 if (Verbose > 0)
1023 {
1024 printf ("revextend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n",
1025 ++ ct, (* ref_lo) - 1, (* query_lo) - 1, a_len, b_len,
1026 errors, match_to_end ? 'T' : 'F', a_end, b_end);
1027 Rev_Display_Alignment
1028 (Ref + (* ref_lo) - 1, a_end, Query + (* query_lo) - 1, b_end,
1029 delta, delta_len);
1030 }
1031
1032 (* ref_lo) -= a_end;
1033 (* query_lo) -= b_end;
1034 sum += errors;
1035 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
1036
1037 return sum;
1038 }
1039
1040
1041
1042 int Extend_Forward
1043 (long int * ref_hi, long int * query_hi)
1044
1045 // Do edit distance off end of match ending at (* ref_hi) and
1046 // (* query_hi) in the reference and query sequences, resp.
1047 // Return the number of errors in the extension and set (* ref_hi)
1048 // and (* query_hi) to the end of the extension.
1049
1050 {
1051 int ct = 0, errors, sum = 0;
1052 int a_end, b_end, match_to_end;
1053 int a_len, b_len;
1054 int delta [MAX_ERRORS], delta_len;
1055
1056 do
1057 {
1058 a_len = Ref_Len - (* ref_hi);
1059 if (a_len > MAX_EXTENSION)
1060 a_len = MAX_EXTENSION;
1061 b_len = Query_Len - (* query_hi);
1062 if (b_len > MAX_EXTENSION)
1063 b_len = MAX_EXTENSION;
1064
1065 errors = Prefix_Edit_Dist
1066 (Ref + (* ref_hi) + 1, a_len,
1067 Query + (* query_hi) + 1, b_len,
1068 MAX_ERRORS - 1, & a_end, & b_end,
1069 & match_to_end, delta, & delta_len, TRUE);
1070 if (Show_Differences)
1071 Show_Diffs (Ref + (* ref_hi) + 1, a_end,
1072 Query + (* query_hi) + 1, b_end,
1073 delta, delta_len, (* ref_hi) + 1, (* query_hi) + 1);
1074 if (Verbose > 0)
1075 {
1076 printf ("extend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n",
1077 ++ ct, (* ref_hi) + 1, (* query_hi) + 1, a_len, b_len,
1078 errors, match_to_end ? 'T' : 'F', a_end, b_end);
1079 Display_Alignment (Ref + (* ref_hi) + 1, a_end,
1080 Query + (* query_hi) + 1, b_end,
1081 delta, delta_len);
1082 }
1083
1084 (* ref_hi) += a_end;
1085 (* query_hi) += b_end;
1086 sum += errors;
1087 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
1088
1089 return sum;
1090 }
1091
1092
1093 void Initialize_Globals
1094 (void)
1095
1096 // Initialize global variables used in this program
1097
1098 {
1099 int i, offset, del;
1100 int e, start;
1101
1102 offset = 2;
1103 del = 6;
1104 for (i = 0; i < MAX_ERRORS; i ++)
1105 {
1106 Edit_Array [i] = Edit_Space + offset;
1107 offset += del;
1108 del += 2;
1109 }
1110
1111
1112 for (i = 0; i <= ERRORS_FOR_FREE; i ++)
1113 Edit_Match_Limit [i] = 0;
1114
1115 start = 1;
1116 for (e = ERRORS_FOR_FREE + 1; e < MAX_ERRORS; e ++)
1117 {
1118 start = Binomial_Bound (e - ERRORS_FOR_FREE, MAX_ERROR_RATE,
1119 start, EDIT_DIST_PROB_BOUND);
1120 Edit_Match_Limit [e] = start - 1;
1121 assert (Edit_Match_Limit [e] >= Edit_Match_Limit [e - 1]);
1122 }
1123
1124 for (i = 0; i <= MAX_FRAG_LEN; i ++)
1125 Error_Bound [i] = Max_int (10, 1 + (int) (2 * i * MAX_ERROR_RATE));
1126
1127 return;
1128 }
1129
1130
1131
1132 FILE * Local_File_Open
1133 (const char * filename, const char * mode)
1134
1135 /* Open Filename in Mode and return a pointer to its control
1136 * block. If fail, print a message and exit. */
1137
1138 {
1139 FILE * fp;
1140
1141 fp = fopen (filename, mode);
1142 if (fp == NULL)
1143 {
1144 fprintf (stderr, "ERROR %d: Could not open file %s \n",
1145 errno, filename);
1146 perror (strerror (errno));
1147 exit (EXIT_FAILURE);
1148 }
1149
1150 return fp;
1151 }
1152
1153
1154
1155 int Max_int
1156 (int a, int b)
1157
1158 // Return the larger of a and b .
1159
1160 {
1161 if (a < b)
1162 return b;
1163
1164 return a;
1165 }
1166
1167
1168
1169 int Min_int
1170 (int a, int b)
1171
1172 // Return the smaller of a and b .
1173
1174 {
1175 if (a < b)
1176 return a;
1177
1178 return b;
1179 }
1180
1181
1182
1183 void Parse_Command_Line
1184 (int argc, char * argv [])
1185
1186 // Get options and parameters from command line with argc
1187 // arguments in argv [0 .. (argc - 1)] .
1188
1189 {
1190 int ch, errflg = FALSE;
1191
1192 optarg = NULL;
1193
1194 while (! errflg
1195 && ((ch = getopt (argc, argv, "De:nN:q:r:Stv:xW:")) != EOF))
1196 switch (ch)
1197 {
1198 case 'D' :
1199 Only_Difference_Positions = true;
1200 break;
1201
1202 case 'e' :
1203 Percent_ID = 1.0 - atof (optarg);
1204 UserScoring = TRUE;
1205 break;
1206
1207 case 'n' :
1208 Nucleotides_Only = TRUE;
1209 break;
1210
1211 case 'N' :
1212 Consec_Non_ACGT = strtol (optarg, NULL, 10);
1213 break;
1214
1215 case 'q' :
1216 Query_Suffix = optarg;
1217 break;
1218
1219 case 'r' :
1220 Ref_Suffix = optarg;
1221 break;
1222
1223 case 'S' :
1224 Show_Differences = TRUE;
1225 break;
1226
1227 case 't' :
1228 Tag_From_Fasta_Line = TRUE;
1229 break;
1230
1231 case 'v' :
1232 Verbose = strtol (optarg, NULL, 10);
1233 break;
1234
1235 case 'x' :
1236 Output_Cover_Files = FALSE;
1237 break;
1238
1239 case 'W' :
1240 Error_File_Name = optarg;
1241 break;
1242
1243 case '?' :
1244 fprintf (stderr, "Unrecognized option -%c\n", optopt);
1245
1246 default :
1247 errflg = TRUE;
1248 }
1249
1250 if (errflg || optind != argc - 3)
1251 {
1252 Usage (argv [0]);
1253 exit (EXIT_FAILURE);
1254 }
1255
1256 Ref_File_Path = argv [optind ++];
1257
1258 Match_File_Path = argv [optind ++];
1259
1260 Gaps_File_Path = argv [optind ++];
1261
1262 return;
1263 }
1264
1265
1266
1267 int Prefix_Edit_Dist
1268 (char A [], int m, char T [], int n, int Error_Limit,
1269 int * A_End, int * T_End, int * Match_To_End,
1270 int Delta [MAX_ERRORS], int * Delta_Len, int extending)
1271
1272 // Return the minimum number of changes (inserts, deletes, replacements)
1273 // needed to match string A [0 .. (m-1)] with a prefix of string
1274 // T [0 .. (n-1)] if it's not more than Error_Limit .
1275 // Put delta description of alignment in Delta and set
1276 // (* Delta_Len) to the number of entries there if it's a complete
1277 // match.
1278 // Set A_End and T_End to the rightmost positions where the
1279 // alignment ended in A and T , respectively.
1280 // Set Match_To_End true if the match extended to the end
1281 // of at least one string; otherwise, set it false to indicate
1282 // a branch point.
1283 // extending indicates whether the match is for a fixed region (FALSE)
1284 // or is extending off the end of a match as far as possible (TRUE)
1285
1286 {
1287 double Score, Max_Score;
1288 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
1289 int Best_d, Best_e, Longest, Row;
1290 int Left, Right;
1291 int d, e, i, j, shorter;
1292
1293 // assert (m <= n);
1294 Best_d = Best_e = Longest = 0;
1295 (* Delta_Len) = 0;
1296
1297 if (Consec_Non_ACGT > 0)
1298 {
1299 int ct;
1300
1301 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++)
1302 {
1303 switch (A [i])
1304 {
1305 case 'a' :
1306 case 'c' :
1307 case 'g' :
1308 case 't' :
1309 ct = 0;
1310 break;
1311 default :
1312 ct ++;
1313 }
1314 }
1315 if (ct >= Consec_Non_ACGT)
1316 {
1317 m = i - ct;
1318 extending = TRUE;
1319 }
1320
1321 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++)
1322 {
1323 switch (T [i])
1324 {
1325 case 'a' :
1326 case 'c' :
1327 case 'g' :
1328 case 't' :
1329 ct = 0;
1330 break;
1331 default :
1332 ct ++;
1333 }
1334 }
1335 if (ct >= Consec_Non_ACGT)
1336 {
1337 n = i - ct;
1338 extending = TRUE;
1339 }
1340 }
1341
1342 shorter = Min_int (m, n);
1343 for (Row = 0; Row < shorter && A [Row] == T [Row]; Row ++)
1344 ;
1345
1346 Edit_Array [0] [0] = Row;
1347
1348 if (Row == m && Row == n) // Exact match
1349 {
1350 (* Delta_Len) = 0;
1351 (* A_End) = (* T_End) = Row;
1352 (* Match_To_End) = ! extending;
1353 return 0;
1354 }
1355
1356 Left = Right = 0;
1357 Max_Score = Row * Branch_Pt_Match_Value;
1358 Max_Score_Len = Row;
1359 Max_Score_Best_d = 0;
1360 Max_Score_Best_e = 0;
1361
1362 for (e = 1; e <= Error_Limit; e ++)
1363 {
1364 int cutoff;
1365
1366 Left = Max_int (Left - 1, -e);
1367 Right = Min_int (Right + 1, e);
1368 Edit_Array [e - 1] [Left] = -2;
1369 Edit_Array [e - 1] [Left - 1] = -2;
1370 Edit_Array [e - 1] [Right] = -2;
1371 Edit_Array [e - 1] [Right + 1] = -2;
1372
1373 for (d = Left; d <= Right; d ++)
1374 {
1375 Row = 1 + Edit_Array [e - 1] [d];
1376 if ((j = Edit_Array [e - 1] [d - 1]) > Row)
1377 Row = j;
1378 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
1379 Row = j;
1380 while (Row < m && Row + d < n
1381 && A [Row] == T [Row + d])
1382 Row ++;
1383
1384 Edit_Array [e] [d] = Row;
1385
1386 if (Row == m && Row + d == n)
1387 {
1388 if (extending)
1389 {
1390 for (j = Left; j <= d; j ++)
1391 if (Edit_Array [e] [j] > Longest)
1392 {
1393 Best_d = j;
1394 Longest = Edit_Array [e] [j];
1395 }
1396 Score = Longest * Branch_Pt_Match_Value - e;
1397 if (Score < Max_Score)
1398 {
1399 (* A_End) = Max_Score_Len;
1400 (* T_End) = Max_Score_Len + Max_Score_Best_d;
1401 Set_Deltas (Delta, Delta_Len, Max_Score_Len,
1402 Max_Score_Best_d, Max_Score_Best_e);
1403 (* Match_To_End) = FALSE;
1404 return Max_Score_Best_e;
1405 }
1406 }
1407
1408 (* A_End) = Row; // One past last align position
1409 (* T_End) = Row + d;
1410 Set_Deltas (Delta, Delta_Len, Row, d, e);
1411 (* Match_To_End) = ! extending;
1412 return e;
1413 }
1414 }
1415
1416 cutoff = Longest - GIVE_UP_LEN;
1417 while (Left <= Right && Edit_Array [e] [Left] < cutoff)
1418 Left ++;
1419 if (Left > Right)
1420 break;
1421 while (Right > Left && Edit_Array [e] [Right] < cutoff)
1422 Right --;
1423
1424 assert (Left <= Right);
1425
1426 for (d = Left; d <= Right; d ++)
1427 if (Edit_Array [e] [d] > Longest)
1428 {
1429 Best_d = d;
1430 Best_e = e;
1431 Longest = Edit_Array [e] [d];
1432 }
1433
1434 Score = Longest * Branch_Pt_Match_Value - e;
1435 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
1436 if (Score > Max_Score)
1437 {
1438 Max_Score = Score;
1439 Max_Score_Len = Longest;
1440 Max_Score_Best_d = Best_d;
1441 Max_Score_Best_e = Best_e;
1442 }
1443
1444 if (Longest - Max_Score_Len >= GIVE_UP_LEN)
1445 break;
1446 }
1447
1448 (* A_End) = Max_Score_Len;
1449 (* T_End) = Max_Score_Len + Max_Score_Best_d;
1450 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
1451 (* Match_To_End) = FALSE;
1452
1453 return Max_Score_Best_e;
1454 }
1455
1456
1457
1458 int Read_String
1459 (FILE * fp, char * * T, long int * Size, char header [])
1460
1461 /* Read next string from fp (assuming FASTA format) into (* T) [1 ..]
1462 * which has Size characters. Allocate extra memory if needed
1463 * and adjust Size accordingly. Return TRUE if successful, FALSE
1464 * otherwise (e.g., EOF). Set header to the contents of the FASTA
1465 * header line. */
1466
1467 {
1468 char Line [MAX_LINE];
1469 long int Len;
1470 int Ch, Ct;
1471
1472 if (feof (fp))
1473 return FALSE;
1474
1475 while ((Ch = fgetc (fp)) != EOF && Ch != '>')
1476 ;
1477
1478 if (Ch != '>')
1479 return FALSE;
1480
1481 fgets (Line, MAX_LINE, fp);
1482 Len = strlen (Line);
1483 assert (Len > 0 && Line [Len - 1] == '\n');
1484 Line [Len - 1] = '\0';
1485 strcpy (header, Line);
1486
1487
1488 Ct = 0;
1489 Len = 0;
1490 while ((Ch = fgetc (fp)) != EOF && Ch != '>')
1491 {
1492 if (isspace (Ch))
1493 continue;
1494
1495 Ct ++;
1496
1497 if (Ct >= (* Size) - 1)
1498 {
1499 (* Size) = (long int) ((* Size) * EXPANSION_FACTOR);
1500 (* T) = (char *) Safe_realloc ((* T), (* Size));
1501 }
1502 Ch = tolower (Ch);
1503 if (! isalpha (Ch))
1504 {
1505 fprintf (stderr, "Unexpected character `%c\' in string %s\n",
1506 Ch, header);
1507 Ch = 'x';
1508 }
1509 (* T) [++ Len] = Ch;
1510 }
1511
1512 (* T) [Len + 1] = '\0';
1513 if (Ch == '>')
1514 ungetc (Ch, fp);
1515
1516 return TRUE;
1517 }
1518
1519
1520
1521 void Rev_Complement
1522 (char * s)
1523
1524 /* Set string s to its DNA reverse complement. */
1525
1526 {
1527 char ch;
1528 int i, j, len;
1529
1530 len = strlen (s);
1531
1532 for (i = 0, j = len - 1; i < j; i ++, j --)
1533 {
1534 ch = Complement (s [i]);
1535 s [i] = Complement (s [j]);
1536 s [j] = ch;
1537 }
1538
1539 if (i == j)
1540 s [i] = Complement (s [i]);
1541
1542 return;
1543 }
1544
1545
1546
1547 void Rev_Display_Alignment
1548 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
1549
1550 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
1551 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
1552 // in the reverse direction.
1553
1554 {
1555 int i, j, k, m, top_len, bottom_len;
1556 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
1557 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
1558
1559 i = j = top_len = bottom_len = 0;
1560 for (k = 0; k < delta_ct; k ++)
1561 {
1562 for (m = 1; m < abs (delta [k]); m ++)
1563 {
1564 top [top_len ++] = a [- i ++];
1565 j ++;
1566 }
1567 if (delta [k] < 0)
1568 {
1569 top [top_len ++] = '-';
1570 j ++;
1571 }
1572 else
1573 {
1574 top [top_len ++] = a [- i ++];
1575 }
1576 }
1577 while (i < a_len && j < b_len)
1578 {
1579 top [top_len ++] = a [- i ++];
1580 j ++;
1581 }
1582 top [top_len] = '\0';
1583
1584
1585 i = j = 0;
1586 for (k = 0; k < delta_ct; k ++)
1587 {
1588 for (m = 1; m < abs (delta [k]); m ++)
1589 {
1590 bottom [bottom_len ++] = b [- j ++];
1591 i ++;
1592 }
1593 if (delta [k] > 0)
1594 {
1595 bottom [bottom_len ++] = '-';
1596 i ++;
1597 }
1598 else
1599 {
1600 bottom [bottom_len ++] = b [- j ++];
1601 }
1602 }
1603 while (j < b_len && i < a_len)
1604 {
1605 bottom [bottom_len ++] = b [- j ++];
1606 i ++;
1607 }
1608 bottom [bottom_len] = '\0';
1609
1610
1611 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
1612 {
1613 putchar ('\n');
1614 printf ("A: ");
1615 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
1616 putchar (top [i + j]);
1617 putchar ('\n');
1618 printf ("B: ");
1619 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
1620 putchar (bottom [i + j]);
1621 putchar ('\n');
1622 printf (" ");
1623 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
1624 j ++)
1625 if (top [i + j] != ' ' && bottom [i + j] != ' '
1626 && top [i + j] != bottom [i + j])
1627 putchar ('^');
1628 else
1629 putchar (' ');
1630 putchar ('\n');
1631 }
1632
1633 return;
1634 }
1635
1636
1637
1638 int Rev_Prefix_Edit_Dist
1639 (char A [], int m, char T [], int n, int Error_Limit,
1640 int * A_End, int * T_End, int * Match_To_End,
1641 int Delta [MAX_ERRORS], int * Delta_Len, int extending)
1642
1643 // Return the minimum number of changes (inserts, deletes, replacements)
1644 // needed to match string A [0 .. -(m-1)] with a prefix of string
1645 // T [0 .. -(n-1)] if it's not more than Error_Limit .
1646 // Note: Match is reverse direction, right to left.
1647 // Put delta description of alignment in Delta and set
1648 // (* Delta_Len) to the number of entries there if it's a complete
1649 // match.
1650 // Set A_End and T_End to the rightmost positions where the
1651 // alignment ended in A and T , respectively.
1652 // Set Match_To_End true if the match extended to the end
1653 // of at least one string; otherwise, set it false to indicate
1654 // a branch point.
1655 // extending indicates whether the match is for a fixed region (FALSE)
1656 // or is extending off the end of a match as far as possible (TRUE)
1657
1658 {
1659 double Score, Max_Score;
1660 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
1661 int Best_d, Best_e, Longest, Row;
1662 int Left, Right;
1663 int d, e, i, j, shorter;
1664
1665 // assert (m <= n);
1666 Best_d = Best_e = Longest = 0;
1667 (* Delta_Len) = 0;
1668
1669 if (Consec_Non_ACGT > 0)
1670 {
1671 int ct;
1672
1673 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++)
1674 {
1675 switch (A [- i])
1676 {
1677 case 'a' :
1678 case 'c' :
1679 case 'g' :
1680 case 't' :
1681 ct = 0;
1682 break;
1683 default :
1684 ct ++;
1685 }
1686 }
1687 if (ct >= Consec_Non_ACGT)
1688 {
1689 m = i - ct;
1690 extending = TRUE;
1691 }
1692
1693 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++)
1694 {
1695 switch (T [- i])
1696 {
1697 case 'a' :
1698 case 'c' :
1699 case 'g' :
1700 case 't' :
1701 ct = 0;
1702 break;
1703 default :
1704 ct ++;
1705 }
1706 }
1707 if (ct >= Consec_Non_ACGT)
1708 {
1709 n = i - ct;
1710 extending = TRUE;
1711 }
1712 }
1713
1714 shorter = Min_int (m, n);
1715 for (Row = 0; Row < shorter && A [- Row] == T [- Row]; Row ++)
1716 ;
1717
1718 Edit_Array [0] [0] = Row;
1719
1720 if (Row == m && Row == n) // Exact match
1721 {
1722 (* Delta_Len) = 0;
1723 (* A_End) = (* T_End) = Row;
1724 (* Match_To_End) = ! extending;
1725 return 0;
1726 }
1727
1728 Left = Right = 0;
1729 Max_Score = Row * Branch_Pt_Match_Value;
1730 Max_Score_Len = Row;
1731 Max_Score_Best_d = 0;
1732 Max_Score_Best_e = 0;
1733
1734 for (e = 1; e <= Error_Limit; e ++)
1735 {
1736 int cutoff;
1737
1738 Left = Max_int (Left - 1, -e);
1739 Right = Min_int (Right + 1, e);
1740 Edit_Array [e - 1] [Left] = -2;
1741 Edit_Array [e - 1] [Left - 1] = -2;
1742 Edit_Array [e - 1] [Right] = -2;
1743 Edit_Array [e - 1] [Right + 1] = -2;
1744
1745 for (d = Left; d <= Right; d ++)
1746 {
1747 Row = 1 + Edit_Array [e - 1] [d];
1748 if ((j = Edit_Array [e - 1] [d - 1]) > Row)
1749 Row = j;
1750 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
1751 Row = j;
1752 while (Row < m && Row + d < n
1753 && A [- Row] == T [- Row - d])
1754 Row ++;
1755
1756 Edit_Array [e] [d] = Row;
1757
1758 if (Row == m && Row + d == n)
1759 {
1760 if (extending)
1761 {
1762 for (j = Left; j <= d; j ++)
1763 if (Edit_Array [e] [j] > Longest)
1764 {
1765 Best_d = j;
1766 Longest = Edit_Array [e] [j];
1767 }
1768 Score = Longest * Branch_Pt_Match_Value - e;
1769 if (Score < Max_Score)
1770 {
1771 (* A_End) = Max_Score_Len;
1772 (* T_End) = Max_Score_Len + Max_Score_Best_d;
1773 Set_Deltas (Delta, Delta_Len, Max_Score_Len,
1774 Max_Score_Best_d, Max_Score_Best_e);
1775 (* Match_To_End) = FALSE;
1776 return Max_Score_Best_e;
1777 }
1778 }
1779
1780 (* A_End) = Row; // One past last align position
1781 (* T_End) = Row + d;
1782 Set_Deltas (Delta, Delta_Len, Row, d, e);
1783 (* Match_To_End) = ! extending;
1784 return e;
1785 }
1786 }
1787
1788 cutoff = Longest - GIVE_UP_LEN;
1789 while (Left <= Right && Edit_Array [e] [Left] < cutoff)
1790 Left ++;
1791 if (Left > Right)
1792 break;
1793 while (Right > Left && Edit_Array [e] [Right] < cutoff)
1794 Right --;
1795 assert (Left <= Right);
1796
1797 for (d = Left; d <= Right; d ++)
1798 if (Edit_Array [e] [d] > Longest)
1799 {
1800 Best_d = d;
1801 Best_e = e;
1802 Longest = Edit_Array [e] [d];
1803 }
1804
1805 Score = Longest * Branch_Pt_Match_Value - e;
1806 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
1807 if (Score > Max_Score)
1808 {
1809 Max_Score = Score;
1810 Max_Score_Len = Longest;
1811 Max_Score_Best_d = Best_d;
1812 Max_Score_Best_e = Best_e;
1813 }
1814
1815
1816 if (Longest - Max_Score_Len >= GIVE_UP_LEN)
1817 break;
1818 }
1819
1820 (* A_End) = Max_Score_Len;
1821 (* T_End) = Max_Score_Len + Max_Score_Best_d;
1822 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
1823 (* Match_To_End) = FALSE;
1824
1825 return Max_Score_Best_e;
1826 }
1827
1828
1829
1830 void Rev_Show_Diffs
1831 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
1832 long int a_ref, long int b_ref)
1833
1834 // Show (to stdout ) the differences
1835 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
1836 // encoded in delta [0 .. (delta_ct - 1)]
1837 // in the reverse direction. a_ref is the position of the beginning
1838 // of string a . b_ref is the offset to the start of
1839 // string b .
1840
1841 {
1842 int i, j, k, m;
1843
1844 i = j = 0;
1845 for (k = 0; k < delta_ct; k ++)
1846 {
1847 for (m = 1; m < abs (delta [k]); m ++)
1848 {
1849 if (a [- i] != b [- j])
1850 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
1851 i ++;
1852 j ++;
1853 }
1854 if (delta [k] < 0)
1855 {
1856 printf ("%8ld %8ld: ins %c\n", a_ref - i, b_ref - j, b [- j]);
1857 j ++;
1858 }
1859 else
1860 {
1861 printf ("%8ld %8ld: del %c\n", a_ref - i, b_ref - j, a [- i]);
1862 i ++;
1863 }
1864 }
1865 while (i < a_len && j < b_len)
1866 {
1867 if (a [- i] != b [- j])
1868 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
1869 i ++;
1870 j ++;
1871 }
1872
1873 return;
1874 }
1875
1876
1877
1878 void Set_Deltas
1879 (int delta [], int * delta_len, int row, int d, int e)
1880
1881 // Set delta to values that show the alignment recorded in
1882 // global Edit_Array . Set (* delta_len) to the number of
1883 // entries set. row is the length of the match in the first
1884 // string. d is the offset at the end of the match between
1885 // the first and second string. e is the number of errors.
1886
1887 {
1888 int delta_stack [MAX_ERRORS];
1889 int from, last, max;
1890 int i, j, k;
1891
1892 last = row;
1893 (* delta_len) = 0;
1894
1895 for (k = e; k > 0; k --)
1896 {
1897 from = d;
1898 max = 1 + Edit_Array [k - 1] [d];
1899 if ((j = Edit_Array [k - 1] [d - 1]) > max)
1900 {
1901 from = d - 1;
1902 max = j;
1903 }
1904 if ((j = 1 + Edit_Array [k - 1] [d + 1]) > max)
1905 {
1906 from = d + 1;
1907 max = j;
1908 }
1909 if (from == d - 1)
1910 {
1911 delta_stack [(* delta_len) ++] = max - last - 1;
1912 d --;
1913 last = Edit_Array [k - 1] [from];
1914 }
1915 else if (from == d + 1)
1916 {
1917 delta_stack [(* delta_len) ++] = last - (max - 1);
1918 d ++;
1919 last = Edit_Array [k - 1] [from];
1920 }
1921 }
1922 delta_stack [(* delta_len) ++] = last + 1;
1923
1924 k = 0;
1925 for (i = (* delta_len) - 1; i > 0; i --)
1926 delta [k ++]
1927 = abs (delta_stack [i]) * Sign (delta_stack [i - 1]);
1928 (* delta_len) --;
1929
1930 return;
1931 }
1932
1933
1934
1935 static void Set_Left_Pad
1936 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len)
1937
1938 // Set pad to a triple of strings, each of length pad_len ,
1939 // representing a MUM of length mum_len ending at r_hi in Ref
1940 // and q_hi in Query . Each string in pad must have been allocated
1941 // at least pad_len + 1 bytes of memory. The purpose is to serve as
1942 // the left-end padding for an alignment with this MUM as its left
1943 // boundary.
1944
1945 {
1946 int i;
1947
1948 for (i = 0; i < 3; i ++)
1949 pad [i] [pad_len] = '\0';
1950
1951 for (i = 0; i < pad_len; i ++)
1952 {
1953 pad [0] [pad_len - i - 1] = r_hi - i > 0 ? Ref [r_hi - i] : '.';
1954 pad [1] [pad_len - i - 1] = q_hi - i > 0 ? Query [q_hi - i] : '.';
1955 pad [2] [pad_len - i - 1] = i < mum_len ? '=' : ' ';
1956 }
1957
1958 return;
1959 }
1960
1961
1962
1963 static void Set_Right_Pad
1964 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len)
1965
1966 // Set pad to a triple of strings, each of length pad_len ,
1967 // representing a MUM of length mum_len beginning at r_lo in Ref
1968 // and q_lo in Query . Each string in pad must have been allocated
1969 // at least pad_len + 1 bytes of memory. The purpose is to serve as
1970 // the right-end padding for an alignment with this MUM as its right
1971 // boundary.
1972
1973 {
1974 int i;
1975
1976 for (i = 0; i < 3; i ++)
1977 pad [i] [pad_len] = '\0';
1978
1979 for (i = 0; i < pad_len; i ++)
1980 {
1981 pad [0] [i] = r_lo + i <= Ref_Len ? Ref [r_lo + i] : '.';
1982 pad [1] [i] = q_lo + i <= Query_Len ? Query [q_lo + i] : '.';
1983 pad [2] [i] = i < mum_len ? '=' : ' ';
1984 }
1985
1986 return;
1987 }
1988
1989
1990
1991 void Show_Coverage
1992 (Cover_t * list, char * filename, char * tag, char * seq)
1993
1994 // Print to stderr summary stats of regions in list .
1995 // Send to filename a list of region info suitable for celagram .
1996 // Use tag to print the headings. Regions refer to seq .
1997
1998 {
1999 FILE * fp;
2000 Cover_t * p;
2001 int ct = 0;
2002 long int i, prev_hi = 0, n_ct, len, seq_len;
2003 long int cov_ns = 0, gap_ns = 0, total_ns;
2004 double cov_total = 0.0, gap_total = 0.0;
2005
2006 fp = Local_File_Open (filename, "w");
2007 fprintf (fp, "%-9s %9s %9s %8s %8s %6s\n",
2008 "Region", "Start", "End", "Len", "N's", "%N's");
2009
2010 for (p = list; p != NULL; p = p -> next)
2011 {
2012 n_ct = 0;
2013 for (i = prev_hi + 1; i < p -> lo; i ++)
2014 if (strchr ("acgt", seq [i]) == NULL)
2015 n_ct ++;
2016 len = p -> lo - prev_hi - 1;
2017 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
2018 ct, prev_hi + 1, p -> lo - 1, len, n_ct,
2019 len == 0 ? 0.0 : 100.0 * n_ct / len);
2020 gap_total += len;
2021 gap_ns += n_ct;
2022
2023 ct ++;
2024
2025 n_ct = 0;
2026 for (i = p -> lo; i <= p -> hi; i ++)
2027 if (strchr ("acgt", seq [i]) == NULL)
2028 n_ct ++;
2029 len = 1 + p -> hi - p -> lo;
2030 fprintf (fp, "cov%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
2031 ct, p -> lo, p -> hi, len, n_ct,
2032 len == 0 ? 0.0 : 100.0 * n_ct / len);
2033 cov_total += len;
2034 cov_ns += n_ct;
2035
2036 prev_hi = p -> hi;
2037 }
2038
2039 n_ct = 0;
2040 seq_len = strlen (seq + 1);
2041 for (i = prev_hi + 1; i <= seq_len; i ++)
2042 if (strchr ("acgt", seq [i]) == NULL)
2043 n_ct ++;
2044 len = seq_len - prev_hi;
2045 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
2046 ct, prev_hi + 1, seq_len, len, n_ct,
2047 len == 0 ? 0.0 : 100.0 * n_ct / len);
2048 gap_total += len;
2049 gap_ns += n_ct;
2050
2051 total_ns = cov_ns + gap_ns;
2052
2053 fclose (fp);
2054
2055 fprintf (stderr, "\n%s Sequence Coverage:\n", tag);
2056 fprintf (stderr, " Sequence length = %ld\n", seq_len);
2057 fprintf (stderr, " acgt's = %ld\n", seq_len - total_ns);
2058 fprintf (stderr, " Non acgt's = %ld\n", total_ns);
2059 fprintf (stderr, " Number of regions = %d\n", ct);
2060 fprintf (stderr, " Matched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n",
2061 cov_total,
2062 seq_len == 0.0 ? 0.0 : 100.0 * cov_total / seq_len,
2063 seq_len - total_ns == 0.0 ? 0.0 :
2064 100.0 * (cov_total - cov_ns) / (seq_len - total_ns));
2065 fprintf (stderr, " Avg match len = %.0f\n",
2066 ct == 0 ? 0.0 : cov_total / ct);
2067 fprintf (stderr, " N's in matches = %ld (%.1f%%)\n",
2068 cov_ns,
2069 cov_total == 0.0 ? 0.0 : 100.0 * cov_ns / cov_total);
2070 fprintf (stderr, " Unmatched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n",
2071 gap_total,
2072 seq_len == 0.0 ? 0.0 : 100.0 * gap_total / seq_len,
2073 seq_len - total_ns == 0.0 ? 0.0 :
2074 100.0 * (gap_total - gap_ns) / (seq_len - total_ns));
2075 fprintf (stderr, " Avg gap len = %.0f\n",
2076 gap_total / (1.0 + ct));
2077 fprintf (stderr, " N's in gaps = %ld (%.1f%%)\n",
2078 gap_ns,
2079 gap_total == 0.0 ? 0.0 : 100.0 * gap_ns / gap_total);
2080
2081 return;
2082 }
2083
2084
2085
2086 void Show_Diffs
2087 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
2088 long int a_ref, long int b_ref)
2089
2090 // Show (to stdout ) the differences
2091 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
2092 // encoded in delta [0 .. (delta_ct - 1)] . a_ref is the offset to
2093 // the start of string a . b_ref is the offset to the start of
2094 // string b .
2095
2096 {
2097 int i, j, k, m;
2098
2099 i = j = 0;
2100 for (k = 0; k < delta_ct; k ++)
2101 {
2102 for (m = 1; m < abs (delta [k]); m ++)
2103 {
2104 if (a [i] != b [j])
2105 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]);
2106 i ++;
2107 j ++;
2108 }
2109 if (delta [k] < 0)
2110 {
2111 printf ("%8ld %8ld: ins %c\n", a_ref + i - 1, b_ref + j, b [j]);
2112 j ++;
2113 }
2114 else
2115 {
2116 printf ("%8ld %8ld: del %c\n", a_ref + i, b_ref + j - 1, a [i]);
2117 i ++;
2118 }
2119 }
2120 while (i < a_len && j < b_len)
2121 {
2122 if (a [i] != b [j])
2123 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]);
2124 i ++;
2125 j ++;
2126 }
2127
2128 return;
2129 }
2130
2131
2132
2133 int Sign
2134 (int a)
2135
2136 // Return the algebraic sign of a .
2137
2138 {
2139 if (a > 0)
2140 return 1;
2141 else if (a < 0)
2142 return -1;
2143
2144 return 0;
2145 }
2146
2147
2148
2149 void Usage
2150 (char * command)
2151
2152 // Print to stderr description of options and command line for
2153 // this program. command is the command that was used to
2154 // invoke it.
2155
2156 {
2157 fprintf (stderr,
2158 "USAGE: %s <RefSequence> <MatchSequences> <GapsFile>\n"
2159 "\n"
2160 "Combines MUMs in <GapsFile> by extending matches off\n"
2161 "ends and between MUMs. <RefSequence> is a fasta file\n"
2162 "of the reference sequence. <MatchSequences> is a\n"
2163 "multi-fasta file of the sequences matched against the\n"
2164 "reference\n"
2165 "\n"
2166 "Options:\n"
2167 "-D Only output to stdout the difference positions\n"
2168 " and characters\n"
2169 "-n Allow matches only between nucleotides, i.e., ACGTs\n"
2170 "-N num Break matches at <num> or more consecutive non-ACGTs \n"
2171 "-q tag Used to label query match\n"
2172 "-r tag Used to label reference match\n"
2173 "-S Output all differences in strings\n"
2174 "-t Label query matches with query fasta header\n"
2175 "-v num Set verbose level for extra output\n"
2176 "-W file Reset the default output filename witherrors.gaps\n"
2177 "-x Don't output .cover files\n"
2178 "-e Set error-rate cutoff to e (e.g. 0.02 is two percent)\n",
2179
2180 command);
2181
2182 return;
2183 }