comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/mgaps.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 /* Programmer: A. Delcher
2 * File: mgaps.cc
3 *
4 * This program reads lists of unique matches between a sequence of strings
5 * and a reference string. For each string in the sequence, it clusters
6 * the matches together into groups that may represent longer, inexact
7 * matches.
8 */
9
10
11 #include "tigrinc.hh"
12
13
14 const int DEFAULT_FIXED_SEPARATION = 5;
15 const long int DEFAULT_MAX_SEPARATION = 1000;
16 const long int DEFAULT_MIN_OUTPUT_SCORE = 200;
17 const double DEFAULT_SEPARATION_FACTOR = 0.05;
18
19
20 struct Match_t
21 {
22 long int Start1, Start2, Len;
23 long int Simple_Score;
24 long int Simple_From;
25 long int Simple_Adj;
26 int cluster_id : 30;
27 unsigned int Good : 1;
28 unsigned int Tentative : 1;
29 };
30
31
32 inline
33 long int Max (long int A, long int B)
34
35 // Return the larger of A and B .
36
37 {
38 if (A < B)
39 return B;
40 else
41 return A;
42 }
43
44
45 static int Check_Labels = FALSE;
46 static int Fixed_Separation = DEFAULT_FIXED_SEPARATION;
47 static long int Max_Separation = DEFAULT_MAX_SEPARATION;
48 static long int Min_Output_Score = DEFAULT_MIN_OUTPUT_SCORE;
49 static double Separation_Factor = DEFAULT_SEPARATION_FACTOR;
50 static int * UF = NULL;
51 static int Use_Extents = FALSE;
52 // If TRUE use end minus start as length of cluster instead of
53 // sum of component lengths
54
55
56 static int By_Start2
57 (const void * A, const void * B);
58 static int By_Cluster
59 (const void * A, const void * B);
60 static void Filter_Matches
61 (Match_t * A, int & N);
62 static int Find
63 (int a);
64 static void Parse_Command_Line
65 (int argc, char * argv []);
66 static void Process_Matches
67 (Match_t * A, int N, char * label);
68 static int Process_Cluster
69 (Match_t * A, int N, char * label);
70 static void Union
71 (int a, int b);
72 static void Usage
73 (char * command);
74
75
76
77
78 int main
79 (int argc, char * argv [])
80
81 {
82 Match_t * A = NULL;
83 char line [MAX_LINE];
84 char save [MAX_LINE];
85 int first = TRUE;
86 int header_line_ct = 0;
87 long int S1, S2, Len;
88 long int N = 0, Size = 0;
89
90 Parse_Command_Line (argc, argv);
91
92 Size = 500;
93 A = (Match_t *) Safe_malloc (Size * sizeof (Match_t));
94 UF = (int *) Safe_malloc (Size * sizeof (int));
95
96 while (fgets (line, MAX_LINE, stdin) != NULL)
97 {
98 if (line [0] == '>')
99 {
100 if (first)
101 first = FALSE;
102 else
103 Process_Matches (A, N, save);
104 N = 0;
105 strcpy (save, line);
106 if (Check_Labels && (++ header_line_ct % 2 == 0))
107 assert (strstr (line, "Reverse") != NULL);
108 }
109 else if (sscanf (line, "%ld %ld %ld", & S1, & S2, & Len) == 3)
110 {
111 if (N >= Size - 1)
112 {
113 Size *= 2;
114 A = (Match_t *) Safe_realloc (A, Size * sizeof (Match_t));
115 UF = (int *) Safe_realloc (UF, Size * sizeof (int));
116 }
117 N ++;
118 A [N] . Start1 = S1;
119 A [N] . Start2 = S2;
120 A [N] . Len = Len;
121 A [N] . Good = FALSE;
122 A [N] . Tentative = FALSE;
123 }
124 }
125
126 Process_Matches (A, N, save);
127
128
129 #if 0
130 printf ("> Other matches\n");
131 Prev = -1;
132 for (i = 0; i < N; i ++)
133 if (! A [i] . Good)
134 {
135 if (Prev == -1)
136 printf ("%8ld %8ld %6ld %7s %6s %6s\n",
137 A [i] . Start1, A [i] . Start2, A [i] . Len,
138 "none", "-", "-");
139 else
140 {
141 if (A [i] . Simple_From == Prev)
142 Adj = A [i] . Simple_Adj;
143 else
144 Adj = 0;
145 printf ("%8ld %8ld %6ld",
146 A [i] . Start1 + Adj, A [i] . Start2 + Adj,
147 A [i] . Len - Adj);
148 if (Adj == 0)
149 printf (" %7s", "none");
150 else
151 printf (" %7ld", - Adj);
152 if (A [i] . Simple_From == Prev)
153 printf (" %6ld %6ld\n",
154 A [i] . Start1 + Adj - A [Prev] . Start1 - A [Prev] . Len,
155 A [i] . Start2 + Adj - A [Prev] . Start2 - A [Prev] . Len);
156 else
157 printf (" %6s %6s\n", "-", "-");
158 }
159 Prev = i;
160 }
161 #endif
162
163 return 0;
164 }
165
166
167
168 static int By_Start2
169 (const void * A, const void * B)
170
171 // Return how A and B compare if converted to Match_t
172 // based on Start2 value. If Start2 values are equal use
173 // Start1 values for comparison.
174
175 {
176 Match_t * x, * y;
177
178 x = (Match_t *) A;
179 y = (Match_t *) B;
180
181 if (x -> Start2 < y -> Start2)
182 return -1;
183 else if (x -> Start2 > y -> Start2)
184 return 1;
185 else if (x -> Start1 < y -> Start1)
186 return -1;
187 else if (x -> Start1 > y -> Start1)
188 return 1;
189 else
190 return 0;
191 }
192
193
194
195 static int By_Cluster
196 (const void * A, const void * B)
197
198 // Return how A and B compare if converted to Match_t
199 // first based on cluster_id value, then by Start2 value,
200 // then by Start1 value.
201
202 {
203 Match_t * x, * y;
204
205 x = (Match_t *) A;
206 y = (Match_t *) B;
207
208 if (x -> cluster_id < y -> cluster_id)
209 return -1;
210 else if (x -> cluster_id > y -> cluster_id)
211 return 1;
212 else if (x -> Start2 < y -> Start2)
213 return -1;
214 else if (x -> Start2 > y -> Start2)
215 return 1;
216 else if (x -> Start1 < y -> Start1)
217 return -1;
218 else if (x -> Start1 > y -> Start1)
219 return 1;
220 else
221 return 0;
222 }
223
224
225
226 static void Filter_Matches
227 (Match_t * A, int & N)
228
229 // Remove from A [0 .. (N - 1)] any matches that are internal to a repeat,
230 // e.g., if seq1 has 27 As and seq2 has 20 then the first and
231 // last matches will be kept, but the 6 matches in the middle will
232 // be eliminated. Also combine overlapping matches on the same
233 // diagonal. Pack all remaining matches into the front of A and
234 // reduce the value of N if any matches are removed.
235 // Matches in A *MUST* be sorted by Start2 value.
236
237 {
238 int i, j;
239
240 for (i = 0; i < N; i ++)
241 A [i] . Good = TRUE;
242
243 for (i = 0; i < N - 1; i ++)
244 {
245 int i_diag, i_end;
246
247 if (! A [i] . Good)
248 continue;
249
250 i_diag = A [i] . Start2 - A [i] . Start1;
251 i_end = A [i] . Start2 + A [i] . Len;
252
253 for (j = i + 1; j < N && A [j] . Start2 <= i_end; j ++)
254 {
255 int olap;
256 int j_diag;
257
258 assert (A [i] . Start2 <= A [j] . Start2);
259
260 if (! A [j] . Good)
261 continue;
262
263 j_diag = A [j] . Start2 - A [j] . Start1;
264 if (i_diag == j_diag)
265 {
266 int j_extent;
267
268 j_extent = A [j] . Len + A [j] . Start2 - A [i] . Start2;
269 if (j_extent > A [i] . Len)
270 {
271 A [i] . Len = j_extent;
272 i_end = A [i] . Start2 + j_extent;
273 }
274 A [j] . Good = FALSE;
275 }
276 else if (A [i] . Start1 == A [j] . Start1)
277 {
278 olap = A [i] . Start2 + A [i] . Len - A [j] . Start2;
279 if (A [i] . Len < A [j] . Len)
280 {
281 if (olap >= A [i] . Len / 2)
282 {
283 A [i] . Good = FALSE;
284 break;
285 }
286 }
287 else if (A [j] . Len < A [i] . Len)
288 {
289 if (olap >= A [j] . Len / 2)
290 {
291 A [j] . Good = FALSE;
292 }
293 }
294 else
295 {
296 if (olap >= A [i] . Len / 2)
297 {
298 A [j] . Tentative = TRUE;
299 if (A [i] . Tentative)
300 {
301 A [i] . Good = FALSE;
302 break;
303 }
304 }
305 }
306 }
307 else if (A [i] . Start2 == A [j] . Start2)
308 {
309 olap = A [i] . Start1 + A [i] . Len - A [j] . Start1;
310 if (A [i] . Len < A [j] . Len)
311 {
312 if (olap >= A [i] . Len / 2)
313 {
314 A [i] . Good = FALSE;
315 break;
316 }
317 }
318 else if (A [j] . Len < A [i] . Len)
319 {
320 if (olap >= A [j] . Len / 2)
321 {
322 A [j] . Good = FALSE;
323 }
324 }
325 else
326 {
327 if (olap >= A [i] . Len / 2)
328 {
329 A [j] . Tentative = TRUE;
330 if (A [i] . Tentative)
331 {
332 A [i] . Good = FALSE;
333 break;
334 }
335 }
336 }
337 }
338 }
339 }
340
341 for (i = j = 0; i < N; i ++)
342 if (A [i] . Good)
343 {
344 if (i != j)
345 A [j] = A [i];
346 j ++;
347 }
348 N = j;
349
350 for (i = 0; i < N; i ++)
351 A [i] . Good = FALSE;
352
353 return;
354 }
355
356
357
358 static int Find
359 (int a)
360
361 // Return the id of the set containing a in UF .
362
363 {
364 int i, j, k;
365
366 if (UF [a] < 0)
367 return a;
368
369 for (i = a; UF [i] > 0; i = UF [i])
370 ;
371 for (j = a; UF [j] != i; j = k)
372 {
373 k = UF [j];
374 UF [j] = i;
375 }
376
377 return i;
378 }
379
380
381
382 static void Parse_Command_Line
383 (int argc, char * argv [])
384
385 // Get options and parameters from command line with argc
386 // arguments in argv [0 .. (argc - 1)] .
387
388 {
389 int ch, errflg = FALSE;
390 char * p;
391
392 optarg = NULL;
393
394 while (! errflg
395 && ((ch = getopt (argc, argv, "Cd:ef:l:s:")) != EOF))
396 switch (ch)
397 {
398 case 'C' :
399 Check_Labels = TRUE;
400 break;
401
402 case 'd' :
403 Fixed_Separation = strtol (optarg, & p, 10);
404 break;
405
406 case 'e' :
407 Use_Extents = TRUE;
408 break;
409
410 case 'f' :
411 Separation_Factor = strtod (optarg, & p);
412 break;
413
414 case 'l' :
415 Min_Output_Score = strtol (optarg, & p, 10);
416 break;
417
418 case 's' :
419 Max_Separation = strtol (optarg, & p, 10);
420 break;
421
422 case '?' :
423 fprintf (stderr, "Unrecognized option -%c\n", optopt);
424
425 default :
426 errflg = TRUE;
427 }
428
429 if (errflg || optind != argc)
430 {
431 Usage (argv [0]);
432 exit (EXIT_FAILURE);
433 }
434
435 return;
436 }
437
438
439
440 static int Process_Cluster
441 (Match_t * A, int N, char * label)
442
443 // Process the cluster of matches in A [0 .. (N - 1)] and output them
444 // after a line containing label . Return the number of clusters
445 // printed.
446
447 {
448 long int adj, total, hi, lo, extent, score;
449 int best, prev;
450 int print_ct = 0;
451 int i, j, k;
452
453 do
454 {
455 for (i = 0; i < N; i ++)
456 {
457 A [i] . Simple_Score = A [i] . Len;
458 A [i] . Simple_Adj = 0;
459 A [i] . Simple_From = -1;
460 for (j = 0; j < i; j ++)
461 {
462 long int Pen;
463 long int Olap, Olap1, Olap2;
464
465 Olap1 = A [j] . Start1 + A [j] . Len - A [i] . Start1;
466 Olap = Max (0, Olap1);
467 Olap2 = A [j] . Start2 + A [j] . Len - A [i] . Start2;
468 Olap = Max (Olap, Olap2);
469
470 // penalize off diagonal matches
471 Pen = Olap + abs ( (A [i] . Start2 - A [i] . Start1) -
472 (A [j] . Start2 - A [j] . Start1) );
473
474 if (A [j] . Simple_Score + A [i] . Len - Pen > A [i] . Simple_Score)
475 {
476 A [i] . Simple_From = j;
477 A [i] . Simple_Score = A [j] . Simple_Score + A [i] . Len - Pen;
478 A [i] . Simple_Adj = Olap;
479 }
480 }
481 }
482
483 best = 0;
484 for (i = 1; i < N; i ++)
485 if (A [i] . Simple_Score > A [best] . Simple_Score)
486 best = i;
487 total = 0;
488 hi = LONG_MIN;
489 lo = LONG_MAX;
490 for (i = best; i >= 0; i = A [i] . Simple_From)
491 {
492 A [i] . Good = TRUE;
493 total += A [i] . Len;
494 if (A [i] . Start1 + A [i] . Len > hi)
495 hi = A [i] . Start1 + A [i] . Len;
496 if (A [i] . Start1 < lo)
497 lo = A [i] . Start1;
498 }
499 extent = hi - lo;
500
501 if (Use_Extents)
502 score = extent;
503 else
504 score = total;
505 if (score >= Min_Output_Score)
506 {
507 print_ct ++;
508 fputs (label, stdout);
509 prev = -1;
510 for (i = 0; i < N; i ++)
511 if (A [i] . Good)
512 {
513 if (prev == -1)
514 printf ("%8ld %8ld %6ld %7s %6s %6s\n",
515 A [i] . Start1, A [i] . Start2, A [i] . Len,
516 "none", "-", "-");
517 else
518 {
519 adj = A [i] . Simple_Adj;
520 printf ("%8ld %8ld %6ld",
521 A [i] . Start1 + adj, A [i] . Start2 + adj,
522 A [i] . Len - adj);
523 if (adj == 0)
524 printf (" %7s", "none");
525 else
526 printf (" %7ld", - adj);
527 printf (" %6ld %6ld\n",
528 A [i] . Start1 + adj - A [prev] . Start1 - A [prev] . Len,
529 A [i] . Start2 + adj - A [prev] . Start2 - A [prev] . Len);
530 }
531 prev = i;
532 }
533 label = "#\n";
534 }
535
536 for (i = k = 0; i < N; i ++)
537 if (! A [i] . Good)
538 {
539 if (i != k)
540 {
541 A [k] = A [i];
542 }
543 k ++;
544 }
545 N = k;
546 } while (N > 0);
547
548 return print_ct;
549 }
550
551
552
553
554 static void Process_Matches
555 (Match_t * A, int N, char * label)
556
557 // Process matches A [1 .. N] and output them after
558 // a line containing label .
559
560 {
561 long int cluster_size, sep;
562 int print_ct = 0;
563 int a, b, i, j;
564
565 if (N <= 0)
566 {
567 fputs (label, stdout);
568 return;
569 }
570
571 // Use Union-Find to create connected-components based on
572 // separation and similar diagonals between matches
573
574 for (i = 1; i <= N; i ++)
575 UF [i] = -1;
576
577 qsort (A + 1, N, sizeof (Match_t), By_Start2);
578
579 Filter_Matches (A + 1, N);
580
581 for (i = 1; i < N; i ++)
582 {
583 long int i_end = A [i] . Start2 + A [i] . Len;
584 long int i_diag = A [i] . Start2 - A [i] . Start1;
585
586 for (j = i + 1; j <= N; j ++)
587 {
588 long int diag_diff;
589
590 sep = A [j] . Start2 - i_end;
591 if (sep > Max_Separation)
592 break;
593
594 diag_diff = abs ((A [j] . Start2 - A [j] . Start1) - i_diag);
595 if (diag_diff <= Max (Fixed_Separation, long (Separation_Factor * sep)))
596 {
597 a = Find (i);
598 b = Find (j);
599 if (a != b)
600 Union (a, b);
601 }
602 }
603 }
604
605 // Set the cluster id of each match
606
607 for (i = 1; i <= N; i ++)
608 A [i] . cluster_id = Find (i);
609
610 qsort (A + 1, N, sizeof (Match_t), By_Cluster);
611
612 for (i = 1; i <= N; i += cluster_size)
613 {
614
615 for (j = i + 1; j <= N && A [i] . cluster_id == A [j] . cluster_id; j ++)
616 ;
617 cluster_size = j - i;
618 print_ct += Process_Cluster (A + i, cluster_size, label);
619 if (print_ct > 0)
620 label = "#\n";
621 }
622
623 #if 0
624 // Find the largest cluster
625
626 j = 1;
627 for (i = 2; i <= N; i ++)
628 if (UF [i] < UF [j])
629 j = i;
630
631 // j is now the largest cluster
632
633 cluster_size = - UF [j];
634
635 for (i = k = 1; i <= N; i ++)
636 if (Find (i) == j)
637 {
638 if (i != k)
639 {
640 Match_t tmp = A [i];
641
642 A [i] = A [k];
643 A [k] = tmp;
644 }
645 k ++;
646 }
647
648 assert (cluster_size == k - 1);
649 #endif
650
651 if (print_ct == 0)
652 fputs (label, stdout);
653
654 return;
655 }
656
657
658
659 static void Union
660 (int a, int b)
661
662 // Union the sets whose id's are a and b in UF .
663
664 {
665 assert (UF [a] < 0 && UF [b] < 0);
666
667 if (UF [a] < UF [b])
668 {
669 UF [a] += UF [b];
670 UF [b] = a;
671 }
672 else
673 {
674 UF [b] += UF [a];
675 UF [a] = b;
676 }
677
678 return;
679 }
680
681
682
683 static void Usage
684 (char * command)
685
686 // Print to stderr description of options and command line for
687 // this program. command is the command that was used to
688 // invoke it.
689
690 {
691 fprintf (stderr,
692 "USAGE: %s [-d <DiagDiff>] [-f <DiagFactor>] [-l <MatchLen>]\n"
693 " [-s <MaxSeparation>]\n"
694 "\n"
695 "Clusters MUMs based on diagonals and separation.\n"
696 "Input is from stdin in format produced by mummer.\n"
697 "Ouput goes to stdout.\n"
698 "\n"
699 "Options:\n"
700 "-C Check that fasta header labels alternately have \"Reverse\"\n"
701 "-d num Fixed diagonal difference to join matches\n"
702 "-e Use extent of match (end - start) rather than sum of piece\n"
703 " lengths to determine length of cluster\n"
704 "-f num Fraction of separation for diagonal difference\n"
705 "-l num Minimum length of cluster match\n"
706 "-s num Maximum separation between matches in cluster\n",
707 command);
708
709 return;
710 }
711
712
713