Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/mgaps.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 /* Programmer: A. Delcher | |
2 * File: mgaps.cc | |
3 * | |
4 * This program reads lists of unique matches between a sequence of strings | |
5 * and a reference string. For each string in the sequence, it clusters | |
6 * the matches together into groups that may represent longer, inexact | |
7 * matches. | |
8 */ | |
9 | |
10 | |
11 #include "tigrinc.hh" | |
12 | |
13 | |
14 const int DEFAULT_FIXED_SEPARATION = 5; | |
15 const long int DEFAULT_MAX_SEPARATION = 1000; | |
16 const long int DEFAULT_MIN_OUTPUT_SCORE = 200; | |
17 const double DEFAULT_SEPARATION_FACTOR = 0.05; | |
18 | |
19 | |
20 struct Match_t | |
21 { | |
22 long int Start1, Start2, Len; | |
23 long int Simple_Score; | |
24 long int Simple_From; | |
25 long int Simple_Adj; | |
26 int cluster_id : 30; | |
27 unsigned int Good : 1; | |
28 unsigned int Tentative : 1; | |
29 }; | |
30 | |
31 | |
32 inline | |
33 long int Max (long int A, long int B) | |
34 | |
35 // Return the larger of A and B . | |
36 | |
37 { | |
38 if (A < B) | |
39 return B; | |
40 else | |
41 return A; | |
42 } | |
43 | |
44 | |
45 static int Check_Labels = FALSE; | |
46 static int Fixed_Separation = DEFAULT_FIXED_SEPARATION; | |
47 static long int Max_Separation = DEFAULT_MAX_SEPARATION; | |
48 static long int Min_Output_Score = DEFAULT_MIN_OUTPUT_SCORE; | |
49 static double Separation_Factor = DEFAULT_SEPARATION_FACTOR; | |
50 static int * UF = NULL; | |
51 static int Use_Extents = FALSE; | |
52 // If TRUE use end minus start as length of cluster instead of | |
53 // sum of component lengths | |
54 | |
55 | |
56 static int By_Start2 | |
57 (const void * A, const void * B); | |
58 static int By_Cluster | |
59 (const void * A, const void * B); | |
60 static void Filter_Matches | |
61 (Match_t * A, int & N); | |
62 static int Find | |
63 (int a); | |
64 static void Parse_Command_Line | |
65 (int argc, char * argv []); | |
66 static void Process_Matches | |
67 (Match_t * A, int N, char * label); | |
68 static int Process_Cluster | |
69 (Match_t * A, int N, char * label); | |
70 static void Union | |
71 (int a, int b); | |
72 static void Usage | |
73 (char * command); | |
74 | |
75 | |
76 | |
77 | |
78 int main | |
79 (int argc, char * argv []) | |
80 | |
81 { | |
82 Match_t * A = NULL; | |
83 char line [MAX_LINE]; | |
84 char save [MAX_LINE]; | |
85 int first = TRUE; | |
86 int header_line_ct = 0; | |
87 long int S1, S2, Len; | |
88 long int N = 0, Size = 0; | |
89 | |
90 Parse_Command_Line (argc, argv); | |
91 | |
92 Size = 500; | |
93 A = (Match_t *) Safe_malloc (Size * sizeof (Match_t)); | |
94 UF = (int *) Safe_malloc (Size * sizeof (int)); | |
95 | |
96 while (fgets (line, MAX_LINE, stdin) != NULL) | |
97 { | |
98 if (line [0] == '>') | |
99 { | |
100 if (first) | |
101 first = FALSE; | |
102 else | |
103 Process_Matches (A, N, save); | |
104 N = 0; | |
105 strcpy (save, line); | |
106 if (Check_Labels && (++ header_line_ct % 2 == 0)) | |
107 assert (strstr (line, "Reverse") != NULL); | |
108 } | |
109 else if (sscanf (line, "%ld %ld %ld", & S1, & S2, & Len) == 3) | |
110 { | |
111 if (N >= Size - 1) | |
112 { | |
113 Size *= 2; | |
114 A = (Match_t *) Safe_realloc (A, Size * sizeof (Match_t)); | |
115 UF = (int *) Safe_realloc (UF, Size * sizeof (int)); | |
116 } | |
117 N ++; | |
118 A [N] . Start1 = S1; | |
119 A [N] . Start2 = S2; | |
120 A [N] . Len = Len; | |
121 A [N] . Good = FALSE; | |
122 A [N] . Tentative = FALSE; | |
123 } | |
124 } | |
125 | |
126 Process_Matches (A, N, save); | |
127 | |
128 | |
129 #if 0 | |
130 printf ("> Other matches\n"); | |
131 Prev = -1; | |
132 for (i = 0; i < N; i ++) | |
133 if (! A [i] . Good) | |
134 { | |
135 if (Prev == -1) | |
136 printf ("%8ld %8ld %6ld %7s %6s %6s\n", | |
137 A [i] . Start1, A [i] . Start2, A [i] . Len, | |
138 "none", "-", "-"); | |
139 else | |
140 { | |
141 if (A [i] . Simple_From == Prev) | |
142 Adj = A [i] . Simple_Adj; | |
143 else | |
144 Adj = 0; | |
145 printf ("%8ld %8ld %6ld", | |
146 A [i] . Start1 + Adj, A [i] . Start2 + Adj, | |
147 A [i] . Len - Adj); | |
148 if (Adj == 0) | |
149 printf (" %7s", "none"); | |
150 else | |
151 printf (" %7ld", - Adj); | |
152 if (A [i] . Simple_From == Prev) | |
153 printf (" %6ld %6ld\n", | |
154 A [i] . Start1 + Adj - A [Prev] . Start1 - A [Prev] . Len, | |
155 A [i] . Start2 + Adj - A [Prev] . Start2 - A [Prev] . Len); | |
156 else | |
157 printf (" %6s %6s\n", "-", "-"); | |
158 } | |
159 Prev = i; | |
160 } | |
161 #endif | |
162 | |
163 return 0; | |
164 } | |
165 | |
166 | |
167 | |
168 static int By_Start2 | |
169 (const void * A, const void * B) | |
170 | |
171 // Return how A and B compare if converted to Match_t | |
172 // based on Start2 value. If Start2 values are equal use | |
173 // Start1 values for comparison. | |
174 | |
175 { | |
176 Match_t * x, * y; | |
177 | |
178 x = (Match_t *) A; | |
179 y = (Match_t *) B; | |
180 | |
181 if (x -> Start2 < y -> Start2) | |
182 return -1; | |
183 else if (x -> Start2 > y -> Start2) | |
184 return 1; | |
185 else if (x -> Start1 < y -> Start1) | |
186 return -1; | |
187 else if (x -> Start1 > y -> Start1) | |
188 return 1; | |
189 else | |
190 return 0; | |
191 } | |
192 | |
193 | |
194 | |
195 static int By_Cluster | |
196 (const void * A, const void * B) | |
197 | |
198 // Return how A and B compare if converted to Match_t | |
199 // first based on cluster_id value, then by Start2 value, | |
200 // then by Start1 value. | |
201 | |
202 { | |
203 Match_t * x, * y; | |
204 | |
205 x = (Match_t *) A; | |
206 y = (Match_t *) B; | |
207 | |
208 if (x -> cluster_id < y -> cluster_id) | |
209 return -1; | |
210 else if (x -> cluster_id > y -> cluster_id) | |
211 return 1; | |
212 else if (x -> Start2 < y -> Start2) | |
213 return -1; | |
214 else if (x -> Start2 > y -> Start2) | |
215 return 1; | |
216 else if (x -> Start1 < y -> Start1) | |
217 return -1; | |
218 else if (x -> Start1 > y -> Start1) | |
219 return 1; | |
220 else | |
221 return 0; | |
222 } | |
223 | |
224 | |
225 | |
226 static void Filter_Matches | |
227 (Match_t * A, int & N) | |
228 | |
229 // Remove from A [0 .. (N - 1)] any matches that are internal to a repeat, | |
230 // e.g., if seq1 has 27 As and seq2 has 20 then the first and | |
231 // last matches will be kept, but the 6 matches in the middle will | |
232 // be eliminated. Also combine overlapping matches on the same | |
233 // diagonal. Pack all remaining matches into the front of A and | |
234 // reduce the value of N if any matches are removed. | |
235 // Matches in A *MUST* be sorted by Start2 value. | |
236 | |
237 { | |
238 int i, j; | |
239 | |
240 for (i = 0; i < N; i ++) | |
241 A [i] . Good = TRUE; | |
242 | |
243 for (i = 0; i < N - 1; i ++) | |
244 { | |
245 int i_diag, i_end; | |
246 | |
247 if (! A [i] . Good) | |
248 continue; | |
249 | |
250 i_diag = A [i] . Start2 - A [i] . Start1; | |
251 i_end = A [i] . Start2 + A [i] . Len; | |
252 | |
253 for (j = i + 1; j < N && A [j] . Start2 <= i_end; j ++) | |
254 { | |
255 int olap; | |
256 int j_diag; | |
257 | |
258 assert (A [i] . Start2 <= A [j] . Start2); | |
259 | |
260 if (! A [j] . Good) | |
261 continue; | |
262 | |
263 j_diag = A [j] . Start2 - A [j] . Start1; | |
264 if (i_diag == j_diag) | |
265 { | |
266 int j_extent; | |
267 | |
268 j_extent = A [j] . Len + A [j] . Start2 - A [i] . Start2; | |
269 if (j_extent > A [i] . Len) | |
270 { | |
271 A [i] . Len = j_extent; | |
272 i_end = A [i] . Start2 + j_extent; | |
273 } | |
274 A [j] . Good = FALSE; | |
275 } | |
276 else if (A [i] . Start1 == A [j] . Start1) | |
277 { | |
278 olap = A [i] . Start2 + A [i] . Len - A [j] . Start2; | |
279 if (A [i] . Len < A [j] . Len) | |
280 { | |
281 if (olap >= A [i] . Len / 2) | |
282 { | |
283 A [i] . Good = FALSE; | |
284 break; | |
285 } | |
286 } | |
287 else if (A [j] . Len < A [i] . Len) | |
288 { | |
289 if (olap >= A [j] . Len / 2) | |
290 { | |
291 A [j] . Good = FALSE; | |
292 } | |
293 } | |
294 else | |
295 { | |
296 if (olap >= A [i] . Len / 2) | |
297 { | |
298 A [j] . Tentative = TRUE; | |
299 if (A [i] . Tentative) | |
300 { | |
301 A [i] . Good = FALSE; | |
302 break; | |
303 } | |
304 } | |
305 } | |
306 } | |
307 else if (A [i] . Start2 == A [j] . Start2) | |
308 { | |
309 olap = A [i] . Start1 + A [i] . Len - A [j] . Start1; | |
310 if (A [i] . Len < A [j] . Len) | |
311 { | |
312 if (olap >= A [i] . Len / 2) | |
313 { | |
314 A [i] . Good = FALSE; | |
315 break; | |
316 } | |
317 } | |
318 else if (A [j] . Len < A [i] . Len) | |
319 { | |
320 if (olap >= A [j] . Len / 2) | |
321 { | |
322 A [j] . Good = FALSE; | |
323 } | |
324 } | |
325 else | |
326 { | |
327 if (olap >= A [i] . Len / 2) | |
328 { | |
329 A [j] . Tentative = TRUE; | |
330 if (A [i] . Tentative) | |
331 { | |
332 A [i] . Good = FALSE; | |
333 break; | |
334 } | |
335 } | |
336 } | |
337 } | |
338 } | |
339 } | |
340 | |
341 for (i = j = 0; i < N; i ++) | |
342 if (A [i] . Good) | |
343 { | |
344 if (i != j) | |
345 A [j] = A [i]; | |
346 j ++; | |
347 } | |
348 N = j; | |
349 | |
350 for (i = 0; i < N; i ++) | |
351 A [i] . Good = FALSE; | |
352 | |
353 return; | |
354 } | |
355 | |
356 | |
357 | |
358 static int Find | |
359 (int a) | |
360 | |
361 // Return the id of the set containing a in UF . | |
362 | |
363 { | |
364 int i, j, k; | |
365 | |
366 if (UF [a] < 0) | |
367 return a; | |
368 | |
369 for (i = a; UF [i] > 0; i = UF [i]) | |
370 ; | |
371 for (j = a; UF [j] != i; j = k) | |
372 { | |
373 k = UF [j]; | |
374 UF [j] = i; | |
375 } | |
376 | |
377 return i; | |
378 } | |
379 | |
380 | |
381 | |
382 static void Parse_Command_Line | |
383 (int argc, char * argv []) | |
384 | |
385 // Get options and parameters from command line with argc | |
386 // arguments in argv [0 .. (argc - 1)] . | |
387 | |
388 { | |
389 int ch, errflg = FALSE; | |
390 char * p; | |
391 | |
392 optarg = NULL; | |
393 | |
394 while (! errflg | |
395 && ((ch = getopt (argc, argv, "Cd:ef:l:s:")) != EOF)) | |
396 switch (ch) | |
397 { | |
398 case 'C' : | |
399 Check_Labels = TRUE; | |
400 break; | |
401 | |
402 case 'd' : | |
403 Fixed_Separation = strtol (optarg, & p, 10); | |
404 break; | |
405 | |
406 case 'e' : | |
407 Use_Extents = TRUE; | |
408 break; | |
409 | |
410 case 'f' : | |
411 Separation_Factor = strtod (optarg, & p); | |
412 break; | |
413 | |
414 case 'l' : | |
415 Min_Output_Score = strtol (optarg, & p, 10); | |
416 break; | |
417 | |
418 case 's' : | |
419 Max_Separation = strtol (optarg, & p, 10); | |
420 break; | |
421 | |
422 case '?' : | |
423 fprintf (stderr, "Unrecognized option -%c\n", optopt); | |
424 | |
425 default : | |
426 errflg = TRUE; | |
427 } | |
428 | |
429 if (errflg || optind != argc) | |
430 { | |
431 Usage (argv [0]); | |
432 exit (EXIT_FAILURE); | |
433 } | |
434 | |
435 return; | |
436 } | |
437 | |
438 | |
439 | |
440 static int Process_Cluster | |
441 (Match_t * A, int N, char * label) | |
442 | |
443 // Process the cluster of matches in A [0 .. (N - 1)] and output them | |
444 // after a line containing label . Return the number of clusters | |
445 // printed. | |
446 | |
447 { | |
448 long int adj, total, hi, lo, extent, score; | |
449 int best, prev; | |
450 int print_ct = 0; | |
451 int i, j, k; | |
452 | |
453 do | |
454 { | |
455 for (i = 0; i < N; i ++) | |
456 { | |
457 A [i] . Simple_Score = A [i] . Len; | |
458 A [i] . Simple_Adj = 0; | |
459 A [i] . Simple_From = -1; | |
460 for (j = 0; j < i; j ++) | |
461 { | |
462 long int Pen; | |
463 long int Olap, Olap1, Olap2; | |
464 | |
465 Olap1 = A [j] . Start1 + A [j] . Len - A [i] . Start1; | |
466 Olap = Max (0, Olap1); | |
467 Olap2 = A [j] . Start2 + A [j] . Len - A [i] . Start2; | |
468 Olap = Max (Olap, Olap2); | |
469 | |
470 // penalize off diagonal matches | |
471 Pen = Olap + abs ( (A [i] . Start2 - A [i] . Start1) - | |
472 (A [j] . Start2 - A [j] . Start1) ); | |
473 | |
474 if (A [j] . Simple_Score + A [i] . Len - Pen > A [i] . Simple_Score) | |
475 { | |
476 A [i] . Simple_From = j; | |
477 A [i] . Simple_Score = A [j] . Simple_Score + A [i] . Len - Pen; | |
478 A [i] . Simple_Adj = Olap; | |
479 } | |
480 } | |
481 } | |
482 | |
483 best = 0; | |
484 for (i = 1; i < N; i ++) | |
485 if (A [i] . Simple_Score > A [best] . Simple_Score) | |
486 best = i; | |
487 total = 0; | |
488 hi = LONG_MIN; | |
489 lo = LONG_MAX; | |
490 for (i = best; i >= 0; i = A [i] . Simple_From) | |
491 { | |
492 A [i] . Good = TRUE; | |
493 total += A [i] . Len; | |
494 if (A [i] . Start1 + A [i] . Len > hi) | |
495 hi = A [i] . Start1 + A [i] . Len; | |
496 if (A [i] . Start1 < lo) | |
497 lo = A [i] . Start1; | |
498 } | |
499 extent = hi - lo; | |
500 | |
501 if (Use_Extents) | |
502 score = extent; | |
503 else | |
504 score = total; | |
505 if (score >= Min_Output_Score) | |
506 { | |
507 print_ct ++; | |
508 fputs (label, stdout); | |
509 prev = -1; | |
510 for (i = 0; i < N; i ++) | |
511 if (A [i] . Good) | |
512 { | |
513 if (prev == -1) | |
514 printf ("%8ld %8ld %6ld %7s %6s %6s\n", | |
515 A [i] . Start1, A [i] . Start2, A [i] . Len, | |
516 "none", "-", "-"); | |
517 else | |
518 { | |
519 adj = A [i] . Simple_Adj; | |
520 printf ("%8ld %8ld %6ld", | |
521 A [i] . Start1 + adj, A [i] . Start2 + adj, | |
522 A [i] . Len - adj); | |
523 if (adj == 0) | |
524 printf (" %7s", "none"); | |
525 else | |
526 printf (" %7ld", - adj); | |
527 printf (" %6ld %6ld\n", | |
528 A [i] . Start1 + adj - A [prev] . Start1 - A [prev] . Len, | |
529 A [i] . Start2 + adj - A [prev] . Start2 - A [prev] . Len); | |
530 } | |
531 prev = i; | |
532 } | |
533 label = "#\n"; | |
534 } | |
535 | |
536 for (i = k = 0; i < N; i ++) | |
537 if (! A [i] . Good) | |
538 { | |
539 if (i != k) | |
540 { | |
541 A [k] = A [i]; | |
542 } | |
543 k ++; | |
544 } | |
545 N = k; | |
546 } while (N > 0); | |
547 | |
548 return print_ct; | |
549 } | |
550 | |
551 | |
552 | |
553 | |
554 static void Process_Matches | |
555 (Match_t * A, int N, char * label) | |
556 | |
557 // Process matches A [1 .. N] and output them after | |
558 // a line containing label . | |
559 | |
560 { | |
561 long int cluster_size, sep; | |
562 int print_ct = 0; | |
563 int a, b, i, j; | |
564 | |
565 if (N <= 0) | |
566 { | |
567 fputs (label, stdout); | |
568 return; | |
569 } | |
570 | |
571 // Use Union-Find to create connected-components based on | |
572 // separation and similar diagonals between matches | |
573 | |
574 for (i = 1; i <= N; i ++) | |
575 UF [i] = -1; | |
576 | |
577 qsort (A + 1, N, sizeof (Match_t), By_Start2); | |
578 | |
579 Filter_Matches (A + 1, N); | |
580 | |
581 for (i = 1; i < N; i ++) | |
582 { | |
583 long int i_end = A [i] . Start2 + A [i] . Len; | |
584 long int i_diag = A [i] . Start2 - A [i] . Start1; | |
585 | |
586 for (j = i + 1; j <= N; j ++) | |
587 { | |
588 long int diag_diff; | |
589 | |
590 sep = A [j] . Start2 - i_end; | |
591 if (sep > Max_Separation) | |
592 break; | |
593 | |
594 diag_diff = abs ((A [j] . Start2 - A [j] . Start1) - i_diag); | |
595 if (diag_diff <= Max (Fixed_Separation, long (Separation_Factor * sep))) | |
596 { | |
597 a = Find (i); | |
598 b = Find (j); | |
599 if (a != b) | |
600 Union (a, b); | |
601 } | |
602 } | |
603 } | |
604 | |
605 // Set the cluster id of each match | |
606 | |
607 for (i = 1; i <= N; i ++) | |
608 A [i] . cluster_id = Find (i); | |
609 | |
610 qsort (A + 1, N, sizeof (Match_t), By_Cluster); | |
611 | |
612 for (i = 1; i <= N; i += cluster_size) | |
613 { | |
614 | |
615 for (j = i + 1; j <= N && A [i] . cluster_id == A [j] . cluster_id; j ++) | |
616 ; | |
617 cluster_size = j - i; | |
618 print_ct += Process_Cluster (A + i, cluster_size, label); | |
619 if (print_ct > 0) | |
620 label = "#\n"; | |
621 } | |
622 | |
623 #if 0 | |
624 // Find the largest cluster | |
625 | |
626 j = 1; | |
627 for (i = 2; i <= N; i ++) | |
628 if (UF [i] < UF [j]) | |
629 j = i; | |
630 | |
631 // j is now the largest cluster | |
632 | |
633 cluster_size = - UF [j]; | |
634 | |
635 for (i = k = 1; i <= N; i ++) | |
636 if (Find (i) == j) | |
637 { | |
638 if (i != k) | |
639 { | |
640 Match_t tmp = A [i]; | |
641 | |
642 A [i] = A [k]; | |
643 A [k] = tmp; | |
644 } | |
645 k ++; | |
646 } | |
647 | |
648 assert (cluster_size == k - 1); | |
649 #endif | |
650 | |
651 if (print_ct == 0) | |
652 fputs (label, stdout); | |
653 | |
654 return; | |
655 } | |
656 | |
657 | |
658 | |
659 static void Union | |
660 (int a, int b) | |
661 | |
662 // Union the sets whose id's are a and b in UF . | |
663 | |
664 { | |
665 assert (UF [a] < 0 && UF [b] < 0); | |
666 | |
667 if (UF [a] < UF [b]) | |
668 { | |
669 UF [a] += UF [b]; | |
670 UF [b] = a; | |
671 } | |
672 else | |
673 { | |
674 UF [b] += UF [a]; | |
675 UF [a] = b; | |
676 } | |
677 | |
678 return; | |
679 } | |
680 | |
681 | |
682 | |
683 static void Usage | |
684 (char * command) | |
685 | |
686 // Print to stderr description of options and command line for | |
687 // this program. command is the command that was used to | |
688 // invoke it. | |
689 | |
690 { | |
691 fprintf (stderr, | |
692 "USAGE: %s [-d <DiagDiff>] [-f <DiagFactor>] [-l <MatchLen>]\n" | |
693 " [-s <MaxSeparation>]\n" | |
694 "\n" | |
695 "Clusters MUMs based on diagonals and separation.\n" | |
696 "Input is from stdin in format produced by mummer.\n" | |
697 "Ouput goes to stdout.\n" | |
698 "\n" | |
699 "Options:\n" | |
700 "-C Check that fasta header labels alternately have \"Reverse\"\n" | |
701 "-d num Fixed diagonal difference to join matches\n" | |
702 "-e Use extent of match (end - start) rather than sum of piece\n" | |
703 " lengths to determine length of cluster\n" | |
704 "-f num Fraction of separation for diagonal difference\n" | |
705 "-l num Minimum length of cluster match\n" | |
706 "-s num Maximum separation between matches in cluster\n", | |
707 command); | |
708 | |
709 return; | |
710 } | |
711 | |
712 | |
713 |