annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/mgaps.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 /* Programmer: A. Delcher
jpayne@69 2 * File: mgaps.cc
jpayne@69 3 *
jpayne@69 4 * This program reads lists of unique matches between a sequence of strings
jpayne@69 5 * and a reference string. For each string in the sequence, it clusters
jpayne@69 6 * the matches together into groups that may represent longer, inexact
jpayne@69 7 * matches.
jpayne@69 8 */
jpayne@69 9
jpayne@69 10
jpayne@69 11 #include "tigrinc.hh"
jpayne@69 12
jpayne@69 13
jpayne@69 14 const int DEFAULT_FIXED_SEPARATION = 5;
jpayne@69 15 const long int DEFAULT_MAX_SEPARATION = 1000;
jpayne@69 16 const long int DEFAULT_MIN_OUTPUT_SCORE = 200;
jpayne@69 17 const double DEFAULT_SEPARATION_FACTOR = 0.05;
jpayne@69 18
jpayne@69 19
jpayne@69 20 struct Match_t
jpayne@69 21 {
jpayne@69 22 long int Start1, Start2, Len;
jpayne@69 23 long int Simple_Score;
jpayne@69 24 long int Simple_From;
jpayne@69 25 long int Simple_Adj;
jpayne@69 26 int cluster_id : 30;
jpayne@69 27 unsigned int Good : 1;
jpayne@69 28 unsigned int Tentative : 1;
jpayne@69 29 };
jpayne@69 30
jpayne@69 31
jpayne@69 32 inline
jpayne@69 33 long int Max (long int A, long int B)
jpayne@69 34
jpayne@69 35 // Return the larger of A and B .
jpayne@69 36
jpayne@69 37 {
jpayne@69 38 if (A < B)
jpayne@69 39 return B;
jpayne@69 40 else
jpayne@69 41 return A;
jpayne@69 42 }
jpayne@69 43
jpayne@69 44
jpayne@69 45 static int Check_Labels = FALSE;
jpayne@69 46 static int Fixed_Separation = DEFAULT_FIXED_SEPARATION;
jpayne@69 47 static long int Max_Separation = DEFAULT_MAX_SEPARATION;
jpayne@69 48 static long int Min_Output_Score = DEFAULT_MIN_OUTPUT_SCORE;
jpayne@69 49 static double Separation_Factor = DEFAULT_SEPARATION_FACTOR;
jpayne@69 50 static int * UF = NULL;
jpayne@69 51 static int Use_Extents = FALSE;
jpayne@69 52 // If TRUE use end minus start as length of cluster instead of
jpayne@69 53 // sum of component lengths
jpayne@69 54
jpayne@69 55
jpayne@69 56 static int By_Start2
jpayne@69 57 (const void * A, const void * B);
jpayne@69 58 static int By_Cluster
jpayne@69 59 (const void * A, const void * B);
jpayne@69 60 static void Filter_Matches
jpayne@69 61 (Match_t * A, int & N);
jpayne@69 62 static int Find
jpayne@69 63 (int a);
jpayne@69 64 static void Parse_Command_Line
jpayne@69 65 (int argc, char * argv []);
jpayne@69 66 static void Process_Matches
jpayne@69 67 (Match_t * A, int N, char * label);
jpayne@69 68 static int Process_Cluster
jpayne@69 69 (Match_t * A, int N, char * label);
jpayne@69 70 static void Union
jpayne@69 71 (int a, int b);
jpayne@69 72 static void Usage
jpayne@69 73 (char * command);
jpayne@69 74
jpayne@69 75
jpayne@69 76
jpayne@69 77
jpayne@69 78 int main
jpayne@69 79 (int argc, char * argv [])
jpayne@69 80
jpayne@69 81 {
jpayne@69 82 Match_t * A = NULL;
jpayne@69 83 char line [MAX_LINE];
jpayne@69 84 char save [MAX_LINE];
jpayne@69 85 int first = TRUE;
jpayne@69 86 int header_line_ct = 0;
jpayne@69 87 long int S1, S2, Len;
jpayne@69 88 long int N = 0, Size = 0;
jpayne@69 89
jpayne@69 90 Parse_Command_Line (argc, argv);
jpayne@69 91
jpayne@69 92 Size = 500;
jpayne@69 93 A = (Match_t *) Safe_malloc (Size * sizeof (Match_t));
jpayne@69 94 UF = (int *) Safe_malloc (Size * sizeof (int));
jpayne@69 95
jpayne@69 96 while (fgets (line, MAX_LINE, stdin) != NULL)
jpayne@69 97 {
jpayne@69 98 if (line [0] == '>')
jpayne@69 99 {
jpayne@69 100 if (first)
jpayne@69 101 first = FALSE;
jpayne@69 102 else
jpayne@69 103 Process_Matches (A, N, save);
jpayne@69 104 N = 0;
jpayne@69 105 strcpy (save, line);
jpayne@69 106 if (Check_Labels && (++ header_line_ct % 2 == 0))
jpayne@69 107 assert (strstr (line, "Reverse") != NULL);
jpayne@69 108 }
jpayne@69 109 else if (sscanf (line, "%ld %ld %ld", & S1, & S2, & Len) == 3)
jpayne@69 110 {
jpayne@69 111 if (N >= Size - 1)
jpayne@69 112 {
jpayne@69 113 Size *= 2;
jpayne@69 114 A = (Match_t *) Safe_realloc (A, Size * sizeof (Match_t));
jpayne@69 115 UF = (int *) Safe_realloc (UF, Size * sizeof (int));
jpayne@69 116 }
jpayne@69 117 N ++;
jpayne@69 118 A [N] . Start1 = S1;
jpayne@69 119 A [N] . Start2 = S2;
jpayne@69 120 A [N] . Len = Len;
jpayne@69 121 A [N] . Good = FALSE;
jpayne@69 122 A [N] . Tentative = FALSE;
jpayne@69 123 }
jpayne@69 124 }
jpayne@69 125
jpayne@69 126 Process_Matches (A, N, save);
jpayne@69 127
jpayne@69 128
jpayne@69 129 #if 0
jpayne@69 130 printf ("> Other matches\n");
jpayne@69 131 Prev = -1;
jpayne@69 132 for (i = 0; i < N; i ++)
jpayne@69 133 if (! A [i] . Good)
jpayne@69 134 {
jpayne@69 135 if (Prev == -1)
jpayne@69 136 printf ("%8ld %8ld %6ld %7s %6s %6s\n",
jpayne@69 137 A [i] . Start1, A [i] . Start2, A [i] . Len,
jpayne@69 138 "none", "-", "-");
jpayne@69 139 else
jpayne@69 140 {
jpayne@69 141 if (A [i] . Simple_From == Prev)
jpayne@69 142 Adj = A [i] . Simple_Adj;
jpayne@69 143 else
jpayne@69 144 Adj = 0;
jpayne@69 145 printf ("%8ld %8ld %6ld",
jpayne@69 146 A [i] . Start1 + Adj, A [i] . Start2 + Adj,
jpayne@69 147 A [i] . Len - Adj);
jpayne@69 148 if (Adj == 0)
jpayne@69 149 printf (" %7s", "none");
jpayne@69 150 else
jpayne@69 151 printf (" %7ld", - Adj);
jpayne@69 152 if (A [i] . Simple_From == Prev)
jpayne@69 153 printf (" %6ld %6ld\n",
jpayne@69 154 A [i] . Start1 + Adj - A [Prev] . Start1 - A [Prev] . Len,
jpayne@69 155 A [i] . Start2 + Adj - A [Prev] . Start2 - A [Prev] . Len);
jpayne@69 156 else
jpayne@69 157 printf (" %6s %6s\n", "-", "-");
jpayne@69 158 }
jpayne@69 159 Prev = i;
jpayne@69 160 }
jpayne@69 161 #endif
jpayne@69 162
jpayne@69 163 return 0;
jpayne@69 164 }
jpayne@69 165
jpayne@69 166
jpayne@69 167
jpayne@69 168 static int By_Start2
jpayne@69 169 (const void * A, const void * B)
jpayne@69 170
jpayne@69 171 // Return how A and B compare if converted to Match_t
jpayne@69 172 // based on Start2 value. If Start2 values are equal use
jpayne@69 173 // Start1 values for comparison.
jpayne@69 174
jpayne@69 175 {
jpayne@69 176 Match_t * x, * y;
jpayne@69 177
jpayne@69 178 x = (Match_t *) A;
jpayne@69 179 y = (Match_t *) B;
jpayne@69 180
jpayne@69 181 if (x -> Start2 < y -> Start2)
jpayne@69 182 return -1;
jpayne@69 183 else if (x -> Start2 > y -> Start2)
jpayne@69 184 return 1;
jpayne@69 185 else if (x -> Start1 < y -> Start1)
jpayne@69 186 return -1;
jpayne@69 187 else if (x -> Start1 > y -> Start1)
jpayne@69 188 return 1;
jpayne@69 189 else
jpayne@69 190 return 0;
jpayne@69 191 }
jpayne@69 192
jpayne@69 193
jpayne@69 194
jpayne@69 195 static int By_Cluster
jpayne@69 196 (const void * A, const void * B)
jpayne@69 197
jpayne@69 198 // Return how A and B compare if converted to Match_t
jpayne@69 199 // first based on cluster_id value, then by Start2 value,
jpayne@69 200 // then by Start1 value.
jpayne@69 201
jpayne@69 202 {
jpayne@69 203 Match_t * x, * y;
jpayne@69 204
jpayne@69 205 x = (Match_t *) A;
jpayne@69 206 y = (Match_t *) B;
jpayne@69 207
jpayne@69 208 if (x -> cluster_id < y -> cluster_id)
jpayne@69 209 return -1;
jpayne@69 210 else if (x -> cluster_id > y -> cluster_id)
jpayne@69 211 return 1;
jpayne@69 212 else if (x -> Start2 < y -> Start2)
jpayne@69 213 return -1;
jpayne@69 214 else if (x -> Start2 > y -> Start2)
jpayne@69 215 return 1;
jpayne@69 216 else if (x -> Start1 < y -> Start1)
jpayne@69 217 return -1;
jpayne@69 218 else if (x -> Start1 > y -> Start1)
jpayne@69 219 return 1;
jpayne@69 220 else
jpayne@69 221 return 0;
jpayne@69 222 }
jpayne@69 223
jpayne@69 224
jpayne@69 225
jpayne@69 226 static void Filter_Matches
jpayne@69 227 (Match_t * A, int & N)
jpayne@69 228
jpayne@69 229 // Remove from A [0 .. (N - 1)] any matches that are internal to a repeat,
jpayne@69 230 // e.g., if seq1 has 27 As and seq2 has 20 then the first and
jpayne@69 231 // last matches will be kept, but the 6 matches in the middle will
jpayne@69 232 // be eliminated. Also combine overlapping matches on the same
jpayne@69 233 // diagonal. Pack all remaining matches into the front of A and
jpayne@69 234 // reduce the value of N if any matches are removed.
jpayne@69 235 // Matches in A *MUST* be sorted by Start2 value.
jpayne@69 236
jpayne@69 237 {
jpayne@69 238 int i, j;
jpayne@69 239
jpayne@69 240 for (i = 0; i < N; i ++)
jpayne@69 241 A [i] . Good = TRUE;
jpayne@69 242
jpayne@69 243 for (i = 0; i < N - 1; i ++)
jpayne@69 244 {
jpayne@69 245 int i_diag, i_end;
jpayne@69 246
jpayne@69 247 if (! A [i] . Good)
jpayne@69 248 continue;
jpayne@69 249
jpayne@69 250 i_diag = A [i] . Start2 - A [i] . Start1;
jpayne@69 251 i_end = A [i] . Start2 + A [i] . Len;
jpayne@69 252
jpayne@69 253 for (j = i + 1; j < N && A [j] . Start2 <= i_end; j ++)
jpayne@69 254 {
jpayne@69 255 int olap;
jpayne@69 256 int j_diag;
jpayne@69 257
jpayne@69 258 assert (A [i] . Start2 <= A [j] . Start2);
jpayne@69 259
jpayne@69 260 if (! A [j] . Good)
jpayne@69 261 continue;
jpayne@69 262
jpayne@69 263 j_diag = A [j] . Start2 - A [j] . Start1;
jpayne@69 264 if (i_diag == j_diag)
jpayne@69 265 {
jpayne@69 266 int j_extent;
jpayne@69 267
jpayne@69 268 j_extent = A [j] . Len + A [j] . Start2 - A [i] . Start2;
jpayne@69 269 if (j_extent > A [i] . Len)
jpayne@69 270 {
jpayne@69 271 A [i] . Len = j_extent;
jpayne@69 272 i_end = A [i] . Start2 + j_extent;
jpayne@69 273 }
jpayne@69 274 A [j] . Good = FALSE;
jpayne@69 275 }
jpayne@69 276 else if (A [i] . Start1 == A [j] . Start1)
jpayne@69 277 {
jpayne@69 278 olap = A [i] . Start2 + A [i] . Len - A [j] . Start2;
jpayne@69 279 if (A [i] . Len < A [j] . Len)
jpayne@69 280 {
jpayne@69 281 if (olap >= A [i] . Len / 2)
jpayne@69 282 {
jpayne@69 283 A [i] . Good = FALSE;
jpayne@69 284 break;
jpayne@69 285 }
jpayne@69 286 }
jpayne@69 287 else if (A [j] . Len < A [i] . Len)
jpayne@69 288 {
jpayne@69 289 if (olap >= A [j] . Len / 2)
jpayne@69 290 {
jpayne@69 291 A [j] . Good = FALSE;
jpayne@69 292 }
jpayne@69 293 }
jpayne@69 294 else
jpayne@69 295 {
jpayne@69 296 if (olap >= A [i] . Len / 2)
jpayne@69 297 {
jpayne@69 298 A [j] . Tentative = TRUE;
jpayne@69 299 if (A [i] . Tentative)
jpayne@69 300 {
jpayne@69 301 A [i] . Good = FALSE;
jpayne@69 302 break;
jpayne@69 303 }
jpayne@69 304 }
jpayne@69 305 }
jpayne@69 306 }
jpayne@69 307 else if (A [i] . Start2 == A [j] . Start2)
jpayne@69 308 {
jpayne@69 309 olap = A [i] . Start1 + A [i] . Len - A [j] . Start1;
jpayne@69 310 if (A [i] . Len < A [j] . Len)
jpayne@69 311 {
jpayne@69 312 if (olap >= A [i] . Len / 2)
jpayne@69 313 {
jpayne@69 314 A [i] . Good = FALSE;
jpayne@69 315 break;
jpayne@69 316 }
jpayne@69 317 }
jpayne@69 318 else if (A [j] . Len < A [i] . Len)
jpayne@69 319 {
jpayne@69 320 if (olap >= A [j] . Len / 2)
jpayne@69 321 {
jpayne@69 322 A [j] . Good = FALSE;
jpayne@69 323 }
jpayne@69 324 }
jpayne@69 325 else
jpayne@69 326 {
jpayne@69 327 if (olap >= A [i] . Len / 2)
jpayne@69 328 {
jpayne@69 329 A [j] . Tentative = TRUE;
jpayne@69 330 if (A [i] . Tentative)
jpayne@69 331 {
jpayne@69 332 A [i] . Good = FALSE;
jpayne@69 333 break;
jpayne@69 334 }
jpayne@69 335 }
jpayne@69 336 }
jpayne@69 337 }
jpayne@69 338 }
jpayne@69 339 }
jpayne@69 340
jpayne@69 341 for (i = j = 0; i < N; i ++)
jpayne@69 342 if (A [i] . Good)
jpayne@69 343 {
jpayne@69 344 if (i != j)
jpayne@69 345 A [j] = A [i];
jpayne@69 346 j ++;
jpayne@69 347 }
jpayne@69 348 N = j;
jpayne@69 349
jpayne@69 350 for (i = 0; i < N; i ++)
jpayne@69 351 A [i] . Good = FALSE;
jpayne@69 352
jpayne@69 353 return;
jpayne@69 354 }
jpayne@69 355
jpayne@69 356
jpayne@69 357
jpayne@69 358 static int Find
jpayne@69 359 (int a)
jpayne@69 360
jpayne@69 361 // Return the id of the set containing a in UF .
jpayne@69 362
jpayne@69 363 {
jpayne@69 364 int i, j, k;
jpayne@69 365
jpayne@69 366 if (UF [a] < 0)
jpayne@69 367 return a;
jpayne@69 368
jpayne@69 369 for (i = a; UF [i] > 0; i = UF [i])
jpayne@69 370 ;
jpayne@69 371 for (j = a; UF [j] != i; j = k)
jpayne@69 372 {
jpayne@69 373 k = UF [j];
jpayne@69 374 UF [j] = i;
jpayne@69 375 }
jpayne@69 376
jpayne@69 377 return i;
jpayne@69 378 }
jpayne@69 379
jpayne@69 380
jpayne@69 381
jpayne@69 382 static void Parse_Command_Line
jpayne@69 383 (int argc, char * argv [])
jpayne@69 384
jpayne@69 385 // Get options and parameters from command line with argc
jpayne@69 386 // arguments in argv [0 .. (argc - 1)] .
jpayne@69 387
jpayne@69 388 {
jpayne@69 389 int ch, errflg = FALSE;
jpayne@69 390 char * p;
jpayne@69 391
jpayne@69 392 optarg = NULL;
jpayne@69 393
jpayne@69 394 while (! errflg
jpayne@69 395 && ((ch = getopt (argc, argv, "Cd:ef:l:s:")) != EOF))
jpayne@69 396 switch (ch)
jpayne@69 397 {
jpayne@69 398 case 'C' :
jpayne@69 399 Check_Labels = TRUE;
jpayne@69 400 break;
jpayne@69 401
jpayne@69 402 case 'd' :
jpayne@69 403 Fixed_Separation = strtol (optarg, & p, 10);
jpayne@69 404 break;
jpayne@69 405
jpayne@69 406 case 'e' :
jpayne@69 407 Use_Extents = TRUE;
jpayne@69 408 break;
jpayne@69 409
jpayne@69 410 case 'f' :
jpayne@69 411 Separation_Factor = strtod (optarg, & p);
jpayne@69 412 break;
jpayne@69 413
jpayne@69 414 case 'l' :
jpayne@69 415 Min_Output_Score = strtol (optarg, & p, 10);
jpayne@69 416 break;
jpayne@69 417
jpayne@69 418 case 's' :
jpayne@69 419 Max_Separation = strtol (optarg, & p, 10);
jpayne@69 420 break;
jpayne@69 421
jpayne@69 422 case '?' :
jpayne@69 423 fprintf (stderr, "Unrecognized option -%c\n", optopt);
jpayne@69 424
jpayne@69 425 default :
jpayne@69 426 errflg = TRUE;
jpayne@69 427 }
jpayne@69 428
jpayne@69 429 if (errflg || optind != argc)
jpayne@69 430 {
jpayne@69 431 Usage (argv [0]);
jpayne@69 432 exit (EXIT_FAILURE);
jpayne@69 433 }
jpayne@69 434
jpayne@69 435 return;
jpayne@69 436 }
jpayne@69 437
jpayne@69 438
jpayne@69 439
jpayne@69 440 static int Process_Cluster
jpayne@69 441 (Match_t * A, int N, char * label)
jpayne@69 442
jpayne@69 443 // Process the cluster of matches in A [0 .. (N - 1)] and output them
jpayne@69 444 // after a line containing label . Return the number of clusters
jpayne@69 445 // printed.
jpayne@69 446
jpayne@69 447 {
jpayne@69 448 long int adj, total, hi, lo, extent, score;
jpayne@69 449 int best, prev;
jpayne@69 450 int print_ct = 0;
jpayne@69 451 int i, j, k;
jpayne@69 452
jpayne@69 453 do
jpayne@69 454 {
jpayne@69 455 for (i = 0; i < N; i ++)
jpayne@69 456 {
jpayne@69 457 A [i] . Simple_Score = A [i] . Len;
jpayne@69 458 A [i] . Simple_Adj = 0;
jpayne@69 459 A [i] . Simple_From = -1;
jpayne@69 460 for (j = 0; j < i; j ++)
jpayne@69 461 {
jpayne@69 462 long int Pen;
jpayne@69 463 long int Olap, Olap1, Olap2;
jpayne@69 464
jpayne@69 465 Olap1 = A [j] . Start1 + A [j] . Len - A [i] . Start1;
jpayne@69 466 Olap = Max (0, Olap1);
jpayne@69 467 Olap2 = A [j] . Start2 + A [j] . Len - A [i] . Start2;
jpayne@69 468 Olap = Max (Olap, Olap2);
jpayne@69 469
jpayne@69 470 // penalize off diagonal matches
jpayne@69 471 Pen = Olap + abs ( (A [i] . Start2 - A [i] . Start1) -
jpayne@69 472 (A [j] . Start2 - A [j] . Start1) );
jpayne@69 473
jpayne@69 474 if (A [j] . Simple_Score + A [i] . Len - Pen > A [i] . Simple_Score)
jpayne@69 475 {
jpayne@69 476 A [i] . Simple_From = j;
jpayne@69 477 A [i] . Simple_Score = A [j] . Simple_Score + A [i] . Len - Pen;
jpayne@69 478 A [i] . Simple_Adj = Olap;
jpayne@69 479 }
jpayne@69 480 }
jpayne@69 481 }
jpayne@69 482
jpayne@69 483 best = 0;
jpayne@69 484 for (i = 1; i < N; i ++)
jpayne@69 485 if (A [i] . Simple_Score > A [best] . Simple_Score)
jpayne@69 486 best = i;
jpayne@69 487 total = 0;
jpayne@69 488 hi = LONG_MIN;
jpayne@69 489 lo = LONG_MAX;
jpayne@69 490 for (i = best; i >= 0; i = A [i] . Simple_From)
jpayne@69 491 {
jpayne@69 492 A [i] . Good = TRUE;
jpayne@69 493 total += A [i] . Len;
jpayne@69 494 if (A [i] . Start1 + A [i] . Len > hi)
jpayne@69 495 hi = A [i] . Start1 + A [i] . Len;
jpayne@69 496 if (A [i] . Start1 < lo)
jpayne@69 497 lo = A [i] . Start1;
jpayne@69 498 }
jpayne@69 499 extent = hi - lo;
jpayne@69 500
jpayne@69 501 if (Use_Extents)
jpayne@69 502 score = extent;
jpayne@69 503 else
jpayne@69 504 score = total;
jpayne@69 505 if (score >= Min_Output_Score)
jpayne@69 506 {
jpayne@69 507 print_ct ++;
jpayne@69 508 fputs (label, stdout);
jpayne@69 509 prev = -1;
jpayne@69 510 for (i = 0; i < N; i ++)
jpayne@69 511 if (A [i] . Good)
jpayne@69 512 {
jpayne@69 513 if (prev == -1)
jpayne@69 514 printf ("%8ld %8ld %6ld %7s %6s %6s\n",
jpayne@69 515 A [i] . Start1, A [i] . Start2, A [i] . Len,
jpayne@69 516 "none", "-", "-");
jpayne@69 517 else
jpayne@69 518 {
jpayne@69 519 adj = A [i] . Simple_Adj;
jpayne@69 520 printf ("%8ld %8ld %6ld",
jpayne@69 521 A [i] . Start1 + adj, A [i] . Start2 + adj,
jpayne@69 522 A [i] . Len - adj);
jpayne@69 523 if (adj == 0)
jpayne@69 524 printf (" %7s", "none");
jpayne@69 525 else
jpayne@69 526 printf (" %7ld", - adj);
jpayne@69 527 printf (" %6ld %6ld\n",
jpayne@69 528 A [i] . Start1 + adj - A [prev] . Start1 - A [prev] . Len,
jpayne@69 529 A [i] . Start2 + adj - A [prev] . Start2 - A [prev] . Len);
jpayne@69 530 }
jpayne@69 531 prev = i;
jpayne@69 532 }
jpayne@69 533 label = "#\n";
jpayne@69 534 }
jpayne@69 535
jpayne@69 536 for (i = k = 0; i < N; i ++)
jpayne@69 537 if (! A [i] . Good)
jpayne@69 538 {
jpayne@69 539 if (i != k)
jpayne@69 540 {
jpayne@69 541 A [k] = A [i];
jpayne@69 542 }
jpayne@69 543 k ++;
jpayne@69 544 }
jpayne@69 545 N = k;
jpayne@69 546 } while (N > 0);
jpayne@69 547
jpayne@69 548 return print_ct;
jpayne@69 549 }
jpayne@69 550
jpayne@69 551
jpayne@69 552
jpayne@69 553
jpayne@69 554 static void Process_Matches
jpayne@69 555 (Match_t * A, int N, char * label)
jpayne@69 556
jpayne@69 557 // Process matches A [1 .. N] and output them after
jpayne@69 558 // a line containing label .
jpayne@69 559
jpayne@69 560 {
jpayne@69 561 long int cluster_size, sep;
jpayne@69 562 int print_ct = 0;
jpayne@69 563 int a, b, i, j;
jpayne@69 564
jpayne@69 565 if (N <= 0)
jpayne@69 566 {
jpayne@69 567 fputs (label, stdout);
jpayne@69 568 return;
jpayne@69 569 }
jpayne@69 570
jpayne@69 571 // Use Union-Find to create connected-components based on
jpayne@69 572 // separation and similar diagonals between matches
jpayne@69 573
jpayne@69 574 for (i = 1; i <= N; i ++)
jpayne@69 575 UF [i] = -1;
jpayne@69 576
jpayne@69 577 qsort (A + 1, N, sizeof (Match_t), By_Start2);
jpayne@69 578
jpayne@69 579 Filter_Matches (A + 1, N);
jpayne@69 580
jpayne@69 581 for (i = 1; i < N; i ++)
jpayne@69 582 {
jpayne@69 583 long int i_end = A [i] . Start2 + A [i] . Len;
jpayne@69 584 long int i_diag = A [i] . Start2 - A [i] . Start1;
jpayne@69 585
jpayne@69 586 for (j = i + 1; j <= N; j ++)
jpayne@69 587 {
jpayne@69 588 long int diag_diff;
jpayne@69 589
jpayne@69 590 sep = A [j] . Start2 - i_end;
jpayne@69 591 if (sep > Max_Separation)
jpayne@69 592 break;
jpayne@69 593
jpayne@69 594 diag_diff = abs ((A [j] . Start2 - A [j] . Start1) - i_diag);
jpayne@69 595 if (diag_diff <= Max (Fixed_Separation, long (Separation_Factor * sep)))
jpayne@69 596 {
jpayne@69 597 a = Find (i);
jpayne@69 598 b = Find (j);
jpayne@69 599 if (a != b)
jpayne@69 600 Union (a, b);
jpayne@69 601 }
jpayne@69 602 }
jpayne@69 603 }
jpayne@69 604
jpayne@69 605 // Set the cluster id of each match
jpayne@69 606
jpayne@69 607 for (i = 1; i <= N; i ++)
jpayne@69 608 A [i] . cluster_id = Find (i);
jpayne@69 609
jpayne@69 610 qsort (A + 1, N, sizeof (Match_t), By_Cluster);
jpayne@69 611
jpayne@69 612 for (i = 1; i <= N; i += cluster_size)
jpayne@69 613 {
jpayne@69 614
jpayne@69 615 for (j = i + 1; j <= N && A [i] . cluster_id == A [j] . cluster_id; j ++)
jpayne@69 616 ;
jpayne@69 617 cluster_size = j - i;
jpayne@69 618 print_ct += Process_Cluster (A + i, cluster_size, label);
jpayne@69 619 if (print_ct > 0)
jpayne@69 620 label = "#\n";
jpayne@69 621 }
jpayne@69 622
jpayne@69 623 #if 0
jpayne@69 624 // Find the largest cluster
jpayne@69 625
jpayne@69 626 j = 1;
jpayne@69 627 for (i = 2; i <= N; i ++)
jpayne@69 628 if (UF [i] < UF [j])
jpayne@69 629 j = i;
jpayne@69 630
jpayne@69 631 // j is now the largest cluster
jpayne@69 632
jpayne@69 633 cluster_size = - UF [j];
jpayne@69 634
jpayne@69 635 for (i = k = 1; i <= N; i ++)
jpayne@69 636 if (Find (i) == j)
jpayne@69 637 {
jpayne@69 638 if (i != k)
jpayne@69 639 {
jpayne@69 640 Match_t tmp = A [i];
jpayne@69 641
jpayne@69 642 A [i] = A [k];
jpayne@69 643 A [k] = tmp;
jpayne@69 644 }
jpayne@69 645 k ++;
jpayne@69 646 }
jpayne@69 647
jpayne@69 648 assert (cluster_size == k - 1);
jpayne@69 649 #endif
jpayne@69 650
jpayne@69 651 if (print_ct == 0)
jpayne@69 652 fputs (label, stdout);
jpayne@69 653
jpayne@69 654 return;
jpayne@69 655 }
jpayne@69 656
jpayne@69 657
jpayne@69 658
jpayne@69 659 static void Union
jpayne@69 660 (int a, int b)
jpayne@69 661
jpayne@69 662 // Union the sets whose id's are a and b in UF .
jpayne@69 663
jpayne@69 664 {
jpayne@69 665 assert (UF [a] < 0 && UF [b] < 0);
jpayne@69 666
jpayne@69 667 if (UF [a] < UF [b])
jpayne@69 668 {
jpayne@69 669 UF [a] += UF [b];
jpayne@69 670 UF [b] = a;
jpayne@69 671 }
jpayne@69 672 else
jpayne@69 673 {
jpayne@69 674 UF [b] += UF [a];
jpayne@69 675 UF [a] = b;
jpayne@69 676 }
jpayne@69 677
jpayne@69 678 return;
jpayne@69 679 }
jpayne@69 680
jpayne@69 681
jpayne@69 682
jpayne@69 683 static void Usage
jpayne@69 684 (char * command)
jpayne@69 685
jpayne@69 686 // Print to stderr description of options and command line for
jpayne@69 687 // this program. command is the command that was used to
jpayne@69 688 // invoke it.
jpayne@69 689
jpayne@69 690 {
jpayne@69 691 fprintf (stderr,
jpayne@69 692 "USAGE: %s [-d <DiagDiff>] [-f <DiagFactor>] [-l <MatchLen>]\n"
jpayne@69 693 " [-s <MaxSeparation>]\n"
jpayne@69 694 "\n"
jpayne@69 695 "Clusters MUMs based on diagonals and separation.\n"
jpayne@69 696 "Input is from stdin in format produced by mummer.\n"
jpayne@69 697 "Ouput goes to stdout.\n"
jpayne@69 698 "\n"
jpayne@69 699 "Options:\n"
jpayne@69 700 "-C Check that fasta header labels alternately have \"Reverse\"\n"
jpayne@69 701 "-d num Fixed diagonal difference to join matches\n"
jpayne@69 702 "-e Use extent of match (end - start) rather than sum of piece\n"
jpayne@69 703 " lengths to determine length of cluster\n"
jpayne@69 704 "-f num Fraction of separation for diagonal difference\n"
jpayne@69 705 "-l num Minimum length of cluster match\n"
jpayne@69 706 "-s num Maximum separation between matches in cluster\n",
jpayne@69 707 command);
jpayne@69 708
jpayne@69 709 return;
jpayne@69 710 }
jpayne@69 711
jpayne@69 712
jpayne@69 713