jpayne@69
|
1 /* Programmer: A. Delcher
|
jpayne@69
|
2 * File: mgaps.cc
|
jpayne@69
|
3 *
|
jpayne@69
|
4 * This program reads lists of unique matches between a sequence of strings
|
jpayne@69
|
5 * and a reference string. For each string in the sequence, it clusters
|
jpayne@69
|
6 * the matches together into groups that may represent longer, inexact
|
jpayne@69
|
7 * matches.
|
jpayne@69
|
8 */
|
jpayne@69
|
9
|
jpayne@69
|
10
|
jpayne@69
|
11 #include "tigrinc.hh"
|
jpayne@69
|
12
|
jpayne@69
|
13
|
jpayne@69
|
14 const int DEFAULT_FIXED_SEPARATION = 5;
|
jpayne@69
|
15 const long int DEFAULT_MAX_SEPARATION = 1000;
|
jpayne@69
|
16 const long int DEFAULT_MIN_OUTPUT_SCORE = 200;
|
jpayne@69
|
17 const double DEFAULT_SEPARATION_FACTOR = 0.05;
|
jpayne@69
|
18
|
jpayne@69
|
19
|
jpayne@69
|
20 struct Match_t
|
jpayne@69
|
21 {
|
jpayne@69
|
22 long int Start1, Start2, Len;
|
jpayne@69
|
23 long int Simple_Score;
|
jpayne@69
|
24 long int Simple_From;
|
jpayne@69
|
25 long int Simple_Adj;
|
jpayne@69
|
26 int cluster_id : 30;
|
jpayne@69
|
27 unsigned int Good : 1;
|
jpayne@69
|
28 unsigned int Tentative : 1;
|
jpayne@69
|
29 };
|
jpayne@69
|
30
|
jpayne@69
|
31
|
jpayne@69
|
32 inline
|
jpayne@69
|
33 long int Max (long int A, long int B)
|
jpayne@69
|
34
|
jpayne@69
|
35 // Return the larger of A and B .
|
jpayne@69
|
36
|
jpayne@69
|
37 {
|
jpayne@69
|
38 if (A < B)
|
jpayne@69
|
39 return B;
|
jpayne@69
|
40 else
|
jpayne@69
|
41 return A;
|
jpayne@69
|
42 }
|
jpayne@69
|
43
|
jpayne@69
|
44
|
jpayne@69
|
45 static int Check_Labels = FALSE;
|
jpayne@69
|
46 static int Fixed_Separation = DEFAULT_FIXED_SEPARATION;
|
jpayne@69
|
47 static long int Max_Separation = DEFAULT_MAX_SEPARATION;
|
jpayne@69
|
48 static long int Min_Output_Score = DEFAULT_MIN_OUTPUT_SCORE;
|
jpayne@69
|
49 static double Separation_Factor = DEFAULT_SEPARATION_FACTOR;
|
jpayne@69
|
50 static int * UF = NULL;
|
jpayne@69
|
51 static int Use_Extents = FALSE;
|
jpayne@69
|
52 // If TRUE use end minus start as length of cluster instead of
|
jpayne@69
|
53 // sum of component lengths
|
jpayne@69
|
54
|
jpayne@69
|
55
|
jpayne@69
|
56 static int By_Start2
|
jpayne@69
|
57 (const void * A, const void * B);
|
jpayne@69
|
58 static int By_Cluster
|
jpayne@69
|
59 (const void * A, const void * B);
|
jpayne@69
|
60 static void Filter_Matches
|
jpayne@69
|
61 (Match_t * A, int & N);
|
jpayne@69
|
62 static int Find
|
jpayne@69
|
63 (int a);
|
jpayne@69
|
64 static void Parse_Command_Line
|
jpayne@69
|
65 (int argc, char * argv []);
|
jpayne@69
|
66 static void Process_Matches
|
jpayne@69
|
67 (Match_t * A, int N, char * label);
|
jpayne@69
|
68 static int Process_Cluster
|
jpayne@69
|
69 (Match_t * A, int N, char * label);
|
jpayne@69
|
70 static void Union
|
jpayne@69
|
71 (int a, int b);
|
jpayne@69
|
72 static void Usage
|
jpayne@69
|
73 (char * command);
|
jpayne@69
|
74
|
jpayne@69
|
75
|
jpayne@69
|
76
|
jpayne@69
|
77
|
jpayne@69
|
78 int main
|
jpayne@69
|
79 (int argc, char * argv [])
|
jpayne@69
|
80
|
jpayne@69
|
81 {
|
jpayne@69
|
82 Match_t * A = NULL;
|
jpayne@69
|
83 char line [MAX_LINE];
|
jpayne@69
|
84 char save [MAX_LINE];
|
jpayne@69
|
85 int first = TRUE;
|
jpayne@69
|
86 int header_line_ct = 0;
|
jpayne@69
|
87 long int S1, S2, Len;
|
jpayne@69
|
88 long int N = 0, Size = 0;
|
jpayne@69
|
89
|
jpayne@69
|
90 Parse_Command_Line (argc, argv);
|
jpayne@69
|
91
|
jpayne@69
|
92 Size = 500;
|
jpayne@69
|
93 A = (Match_t *) Safe_malloc (Size * sizeof (Match_t));
|
jpayne@69
|
94 UF = (int *) Safe_malloc (Size * sizeof (int));
|
jpayne@69
|
95
|
jpayne@69
|
96 while (fgets (line, MAX_LINE, stdin) != NULL)
|
jpayne@69
|
97 {
|
jpayne@69
|
98 if (line [0] == '>')
|
jpayne@69
|
99 {
|
jpayne@69
|
100 if (first)
|
jpayne@69
|
101 first = FALSE;
|
jpayne@69
|
102 else
|
jpayne@69
|
103 Process_Matches (A, N, save);
|
jpayne@69
|
104 N = 0;
|
jpayne@69
|
105 strcpy (save, line);
|
jpayne@69
|
106 if (Check_Labels && (++ header_line_ct % 2 == 0))
|
jpayne@69
|
107 assert (strstr (line, "Reverse") != NULL);
|
jpayne@69
|
108 }
|
jpayne@69
|
109 else if (sscanf (line, "%ld %ld %ld", & S1, & S2, & Len) == 3)
|
jpayne@69
|
110 {
|
jpayne@69
|
111 if (N >= Size - 1)
|
jpayne@69
|
112 {
|
jpayne@69
|
113 Size *= 2;
|
jpayne@69
|
114 A = (Match_t *) Safe_realloc (A, Size * sizeof (Match_t));
|
jpayne@69
|
115 UF = (int *) Safe_realloc (UF, Size * sizeof (int));
|
jpayne@69
|
116 }
|
jpayne@69
|
117 N ++;
|
jpayne@69
|
118 A [N] . Start1 = S1;
|
jpayne@69
|
119 A [N] . Start2 = S2;
|
jpayne@69
|
120 A [N] . Len = Len;
|
jpayne@69
|
121 A [N] . Good = FALSE;
|
jpayne@69
|
122 A [N] . Tentative = FALSE;
|
jpayne@69
|
123 }
|
jpayne@69
|
124 }
|
jpayne@69
|
125
|
jpayne@69
|
126 Process_Matches (A, N, save);
|
jpayne@69
|
127
|
jpayne@69
|
128
|
jpayne@69
|
129 #if 0
|
jpayne@69
|
130 printf ("> Other matches\n");
|
jpayne@69
|
131 Prev = -1;
|
jpayne@69
|
132 for (i = 0; i < N; i ++)
|
jpayne@69
|
133 if (! A [i] . Good)
|
jpayne@69
|
134 {
|
jpayne@69
|
135 if (Prev == -1)
|
jpayne@69
|
136 printf ("%8ld %8ld %6ld %7s %6s %6s\n",
|
jpayne@69
|
137 A [i] . Start1, A [i] . Start2, A [i] . Len,
|
jpayne@69
|
138 "none", "-", "-");
|
jpayne@69
|
139 else
|
jpayne@69
|
140 {
|
jpayne@69
|
141 if (A [i] . Simple_From == Prev)
|
jpayne@69
|
142 Adj = A [i] . Simple_Adj;
|
jpayne@69
|
143 else
|
jpayne@69
|
144 Adj = 0;
|
jpayne@69
|
145 printf ("%8ld %8ld %6ld",
|
jpayne@69
|
146 A [i] . Start1 + Adj, A [i] . Start2 + Adj,
|
jpayne@69
|
147 A [i] . Len - Adj);
|
jpayne@69
|
148 if (Adj == 0)
|
jpayne@69
|
149 printf (" %7s", "none");
|
jpayne@69
|
150 else
|
jpayne@69
|
151 printf (" %7ld", - Adj);
|
jpayne@69
|
152 if (A [i] . Simple_From == Prev)
|
jpayne@69
|
153 printf (" %6ld %6ld\n",
|
jpayne@69
|
154 A [i] . Start1 + Adj - A [Prev] . Start1 - A [Prev] . Len,
|
jpayne@69
|
155 A [i] . Start2 + Adj - A [Prev] . Start2 - A [Prev] . Len);
|
jpayne@69
|
156 else
|
jpayne@69
|
157 printf (" %6s %6s\n", "-", "-");
|
jpayne@69
|
158 }
|
jpayne@69
|
159 Prev = i;
|
jpayne@69
|
160 }
|
jpayne@69
|
161 #endif
|
jpayne@69
|
162
|
jpayne@69
|
163 return 0;
|
jpayne@69
|
164 }
|
jpayne@69
|
165
|
jpayne@69
|
166
|
jpayne@69
|
167
|
jpayne@69
|
168 static int By_Start2
|
jpayne@69
|
169 (const void * A, const void * B)
|
jpayne@69
|
170
|
jpayne@69
|
171 // Return how A and B compare if converted to Match_t
|
jpayne@69
|
172 // based on Start2 value. If Start2 values are equal use
|
jpayne@69
|
173 // Start1 values for comparison.
|
jpayne@69
|
174
|
jpayne@69
|
175 {
|
jpayne@69
|
176 Match_t * x, * y;
|
jpayne@69
|
177
|
jpayne@69
|
178 x = (Match_t *) A;
|
jpayne@69
|
179 y = (Match_t *) B;
|
jpayne@69
|
180
|
jpayne@69
|
181 if (x -> Start2 < y -> Start2)
|
jpayne@69
|
182 return -1;
|
jpayne@69
|
183 else if (x -> Start2 > y -> Start2)
|
jpayne@69
|
184 return 1;
|
jpayne@69
|
185 else if (x -> Start1 < y -> Start1)
|
jpayne@69
|
186 return -1;
|
jpayne@69
|
187 else if (x -> Start1 > y -> Start1)
|
jpayne@69
|
188 return 1;
|
jpayne@69
|
189 else
|
jpayne@69
|
190 return 0;
|
jpayne@69
|
191 }
|
jpayne@69
|
192
|
jpayne@69
|
193
|
jpayne@69
|
194
|
jpayne@69
|
195 static int By_Cluster
|
jpayne@69
|
196 (const void * A, const void * B)
|
jpayne@69
|
197
|
jpayne@69
|
198 // Return how A and B compare if converted to Match_t
|
jpayne@69
|
199 // first based on cluster_id value, then by Start2 value,
|
jpayne@69
|
200 // then by Start1 value.
|
jpayne@69
|
201
|
jpayne@69
|
202 {
|
jpayne@69
|
203 Match_t * x, * y;
|
jpayne@69
|
204
|
jpayne@69
|
205 x = (Match_t *) A;
|
jpayne@69
|
206 y = (Match_t *) B;
|
jpayne@69
|
207
|
jpayne@69
|
208 if (x -> cluster_id < y -> cluster_id)
|
jpayne@69
|
209 return -1;
|
jpayne@69
|
210 else if (x -> cluster_id > y -> cluster_id)
|
jpayne@69
|
211 return 1;
|
jpayne@69
|
212 else if (x -> Start2 < y -> Start2)
|
jpayne@69
|
213 return -1;
|
jpayne@69
|
214 else if (x -> Start2 > y -> Start2)
|
jpayne@69
|
215 return 1;
|
jpayne@69
|
216 else if (x -> Start1 < y -> Start1)
|
jpayne@69
|
217 return -1;
|
jpayne@69
|
218 else if (x -> Start1 > y -> Start1)
|
jpayne@69
|
219 return 1;
|
jpayne@69
|
220 else
|
jpayne@69
|
221 return 0;
|
jpayne@69
|
222 }
|
jpayne@69
|
223
|
jpayne@69
|
224
|
jpayne@69
|
225
|
jpayne@69
|
226 static void Filter_Matches
|
jpayne@69
|
227 (Match_t * A, int & N)
|
jpayne@69
|
228
|
jpayne@69
|
229 // Remove from A [0 .. (N - 1)] any matches that are internal to a repeat,
|
jpayne@69
|
230 // e.g., if seq1 has 27 As and seq2 has 20 then the first and
|
jpayne@69
|
231 // last matches will be kept, but the 6 matches in the middle will
|
jpayne@69
|
232 // be eliminated. Also combine overlapping matches on the same
|
jpayne@69
|
233 // diagonal. Pack all remaining matches into the front of A and
|
jpayne@69
|
234 // reduce the value of N if any matches are removed.
|
jpayne@69
|
235 // Matches in A *MUST* be sorted by Start2 value.
|
jpayne@69
|
236
|
jpayne@69
|
237 {
|
jpayne@69
|
238 int i, j;
|
jpayne@69
|
239
|
jpayne@69
|
240 for (i = 0; i < N; i ++)
|
jpayne@69
|
241 A [i] . Good = TRUE;
|
jpayne@69
|
242
|
jpayne@69
|
243 for (i = 0; i < N - 1; i ++)
|
jpayne@69
|
244 {
|
jpayne@69
|
245 int i_diag, i_end;
|
jpayne@69
|
246
|
jpayne@69
|
247 if (! A [i] . Good)
|
jpayne@69
|
248 continue;
|
jpayne@69
|
249
|
jpayne@69
|
250 i_diag = A [i] . Start2 - A [i] . Start1;
|
jpayne@69
|
251 i_end = A [i] . Start2 + A [i] . Len;
|
jpayne@69
|
252
|
jpayne@69
|
253 for (j = i + 1; j < N && A [j] . Start2 <= i_end; j ++)
|
jpayne@69
|
254 {
|
jpayne@69
|
255 int olap;
|
jpayne@69
|
256 int j_diag;
|
jpayne@69
|
257
|
jpayne@69
|
258 assert (A [i] . Start2 <= A [j] . Start2);
|
jpayne@69
|
259
|
jpayne@69
|
260 if (! A [j] . Good)
|
jpayne@69
|
261 continue;
|
jpayne@69
|
262
|
jpayne@69
|
263 j_diag = A [j] . Start2 - A [j] . Start1;
|
jpayne@69
|
264 if (i_diag == j_diag)
|
jpayne@69
|
265 {
|
jpayne@69
|
266 int j_extent;
|
jpayne@69
|
267
|
jpayne@69
|
268 j_extent = A [j] . Len + A [j] . Start2 - A [i] . Start2;
|
jpayne@69
|
269 if (j_extent > A [i] . Len)
|
jpayne@69
|
270 {
|
jpayne@69
|
271 A [i] . Len = j_extent;
|
jpayne@69
|
272 i_end = A [i] . Start2 + j_extent;
|
jpayne@69
|
273 }
|
jpayne@69
|
274 A [j] . Good = FALSE;
|
jpayne@69
|
275 }
|
jpayne@69
|
276 else if (A [i] . Start1 == A [j] . Start1)
|
jpayne@69
|
277 {
|
jpayne@69
|
278 olap = A [i] . Start2 + A [i] . Len - A [j] . Start2;
|
jpayne@69
|
279 if (A [i] . Len < A [j] . Len)
|
jpayne@69
|
280 {
|
jpayne@69
|
281 if (olap >= A [i] . Len / 2)
|
jpayne@69
|
282 {
|
jpayne@69
|
283 A [i] . Good = FALSE;
|
jpayne@69
|
284 break;
|
jpayne@69
|
285 }
|
jpayne@69
|
286 }
|
jpayne@69
|
287 else if (A [j] . Len < A [i] . Len)
|
jpayne@69
|
288 {
|
jpayne@69
|
289 if (olap >= A [j] . Len / 2)
|
jpayne@69
|
290 {
|
jpayne@69
|
291 A [j] . Good = FALSE;
|
jpayne@69
|
292 }
|
jpayne@69
|
293 }
|
jpayne@69
|
294 else
|
jpayne@69
|
295 {
|
jpayne@69
|
296 if (olap >= A [i] . Len / 2)
|
jpayne@69
|
297 {
|
jpayne@69
|
298 A [j] . Tentative = TRUE;
|
jpayne@69
|
299 if (A [i] . Tentative)
|
jpayne@69
|
300 {
|
jpayne@69
|
301 A [i] . Good = FALSE;
|
jpayne@69
|
302 break;
|
jpayne@69
|
303 }
|
jpayne@69
|
304 }
|
jpayne@69
|
305 }
|
jpayne@69
|
306 }
|
jpayne@69
|
307 else if (A [i] . Start2 == A [j] . Start2)
|
jpayne@69
|
308 {
|
jpayne@69
|
309 olap = A [i] . Start1 + A [i] . Len - A [j] . Start1;
|
jpayne@69
|
310 if (A [i] . Len < A [j] . Len)
|
jpayne@69
|
311 {
|
jpayne@69
|
312 if (olap >= A [i] . Len / 2)
|
jpayne@69
|
313 {
|
jpayne@69
|
314 A [i] . Good = FALSE;
|
jpayne@69
|
315 break;
|
jpayne@69
|
316 }
|
jpayne@69
|
317 }
|
jpayne@69
|
318 else if (A [j] . Len < A [i] . Len)
|
jpayne@69
|
319 {
|
jpayne@69
|
320 if (olap >= A [j] . Len / 2)
|
jpayne@69
|
321 {
|
jpayne@69
|
322 A [j] . Good = FALSE;
|
jpayne@69
|
323 }
|
jpayne@69
|
324 }
|
jpayne@69
|
325 else
|
jpayne@69
|
326 {
|
jpayne@69
|
327 if (olap >= A [i] . Len / 2)
|
jpayne@69
|
328 {
|
jpayne@69
|
329 A [j] . Tentative = TRUE;
|
jpayne@69
|
330 if (A [i] . Tentative)
|
jpayne@69
|
331 {
|
jpayne@69
|
332 A [i] . Good = FALSE;
|
jpayne@69
|
333 break;
|
jpayne@69
|
334 }
|
jpayne@69
|
335 }
|
jpayne@69
|
336 }
|
jpayne@69
|
337 }
|
jpayne@69
|
338 }
|
jpayne@69
|
339 }
|
jpayne@69
|
340
|
jpayne@69
|
341 for (i = j = 0; i < N; i ++)
|
jpayne@69
|
342 if (A [i] . Good)
|
jpayne@69
|
343 {
|
jpayne@69
|
344 if (i != j)
|
jpayne@69
|
345 A [j] = A [i];
|
jpayne@69
|
346 j ++;
|
jpayne@69
|
347 }
|
jpayne@69
|
348 N = j;
|
jpayne@69
|
349
|
jpayne@69
|
350 for (i = 0; i < N; i ++)
|
jpayne@69
|
351 A [i] . Good = FALSE;
|
jpayne@69
|
352
|
jpayne@69
|
353 return;
|
jpayne@69
|
354 }
|
jpayne@69
|
355
|
jpayne@69
|
356
|
jpayne@69
|
357
|
jpayne@69
|
358 static int Find
|
jpayne@69
|
359 (int a)
|
jpayne@69
|
360
|
jpayne@69
|
361 // Return the id of the set containing a in UF .
|
jpayne@69
|
362
|
jpayne@69
|
363 {
|
jpayne@69
|
364 int i, j, k;
|
jpayne@69
|
365
|
jpayne@69
|
366 if (UF [a] < 0)
|
jpayne@69
|
367 return a;
|
jpayne@69
|
368
|
jpayne@69
|
369 for (i = a; UF [i] > 0; i = UF [i])
|
jpayne@69
|
370 ;
|
jpayne@69
|
371 for (j = a; UF [j] != i; j = k)
|
jpayne@69
|
372 {
|
jpayne@69
|
373 k = UF [j];
|
jpayne@69
|
374 UF [j] = i;
|
jpayne@69
|
375 }
|
jpayne@69
|
376
|
jpayne@69
|
377 return i;
|
jpayne@69
|
378 }
|
jpayne@69
|
379
|
jpayne@69
|
380
|
jpayne@69
|
381
|
jpayne@69
|
382 static void Parse_Command_Line
|
jpayne@69
|
383 (int argc, char * argv [])
|
jpayne@69
|
384
|
jpayne@69
|
385 // Get options and parameters from command line with argc
|
jpayne@69
|
386 // arguments in argv [0 .. (argc - 1)] .
|
jpayne@69
|
387
|
jpayne@69
|
388 {
|
jpayne@69
|
389 int ch, errflg = FALSE;
|
jpayne@69
|
390 char * p;
|
jpayne@69
|
391
|
jpayne@69
|
392 optarg = NULL;
|
jpayne@69
|
393
|
jpayne@69
|
394 while (! errflg
|
jpayne@69
|
395 && ((ch = getopt (argc, argv, "Cd:ef:l:s:")) != EOF))
|
jpayne@69
|
396 switch (ch)
|
jpayne@69
|
397 {
|
jpayne@69
|
398 case 'C' :
|
jpayne@69
|
399 Check_Labels = TRUE;
|
jpayne@69
|
400 break;
|
jpayne@69
|
401
|
jpayne@69
|
402 case 'd' :
|
jpayne@69
|
403 Fixed_Separation = strtol (optarg, & p, 10);
|
jpayne@69
|
404 break;
|
jpayne@69
|
405
|
jpayne@69
|
406 case 'e' :
|
jpayne@69
|
407 Use_Extents = TRUE;
|
jpayne@69
|
408 break;
|
jpayne@69
|
409
|
jpayne@69
|
410 case 'f' :
|
jpayne@69
|
411 Separation_Factor = strtod (optarg, & p);
|
jpayne@69
|
412 break;
|
jpayne@69
|
413
|
jpayne@69
|
414 case 'l' :
|
jpayne@69
|
415 Min_Output_Score = strtol (optarg, & p, 10);
|
jpayne@69
|
416 break;
|
jpayne@69
|
417
|
jpayne@69
|
418 case 's' :
|
jpayne@69
|
419 Max_Separation = strtol (optarg, & p, 10);
|
jpayne@69
|
420 break;
|
jpayne@69
|
421
|
jpayne@69
|
422 case '?' :
|
jpayne@69
|
423 fprintf (stderr, "Unrecognized option -%c\n", optopt);
|
jpayne@69
|
424
|
jpayne@69
|
425 default :
|
jpayne@69
|
426 errflg = TRUE;
|
jpayne@69
|
427 }
|
jpayne@69
|
428
|
jpayne@69
|
429 if (errflg || optind != argc)
|
jpayne@69
|
430 {
|
jpayne@69
|
431 Usage (argv [0]);
|
jpayne@69
|
432 exit (EXIT_FAILURE);
|
jpayne@69
|
433 }
|
jpayne@69
|
434
|
jpayne@69
|
435 return;
|
jpayne@69
|
436 }
|
jpayne@69
|
437
|
jpayne@69
|
438
|
jpayne@69
|
439
|
jpayne@69
|
440 static int Process_Cluster
|
jpayne@69
|
441 (Match_t * A, int N, char * label)
|
jpayne@69
|
442
|
jpayne@69
|
443 // Process the cluster of matches in A [0 .. (N - 1)] and output them
|
jpayne@69
|
444 // after a line containing label . Return the number of clusters
|
jpayne@69
|
445 // printed.
|
jpayne@69
|
446
|
jpayne@69
|
447 {
|
jpayne@69
|
448 long int adj, total, hi, lo, extent, score;
|
jpayne@69
|
449 int best, prev;
|
jpayne@69
|
450 int print_ct = 0;
|
jpayne@69
|
451 int i, j, k;
|
jpayne@69
|
452
|
jpayne@69
|
453 do
|
jpayne@69
|
454 {
|
jpayne@69
|
455 for (i = 0; i < N; i ++)
|
jpayne@69
|
456 {
|
jpayne@69
|
457 A [i] . Simple_Score = A [i] . Len;
|
jpayne@69
|
458 A [i] . Simple_Adj = 0;
|
jpayne@69
|
459 A [i] . Simple_From = -1;
|
jpayne@69
|
460 for (j = 0; j < i; j ++)
|
jpayne@69
|
461 {
|
jpayne@69
|
462 long int Pen;
|
jpayne@69
|
463 long int Olap, Olap1, Olap2;
|
jpayne@69
|
464
|
jpayne@69
|
465 Olap1 = A [j] . Start1 + A [j] . Len - A [i] . Start1;
|
jpayne@69
|
466 Olap = Max (0, Olap1);
|
jpayne@69
|
467 Olap2 = A [j] . Start2 + A [j] . Len - A [i] . Start2;
|
jpayne@69
|
468 Olap = Max (Olap, Olap2);
|
jpayne@69
|
469
|
jpayne@69
|
470 // penalize off diagonal matches
|
jpayne@69
|
471 Pen = Olap + abs ( (A [i] . Start2 - A [i] . Start1) -
|
jpayne@69
|
472 (A [j] . Start2 - A [j] . Start1) );
|
jpayne@69
|
473
|
jpayne@69
|
474 if (A [j] . Simple_Score + A [i] . Len - Pen > A [i] . Simple_Score)
|
jpayne@69
|
475 {
|
jpayne@69
|
476 A [i] . Simple_From = j;
|
jpayne@69
|
477 A [i] . Simple_Score = A [j] . Simple_Score + A [i] . Len - Pen;
|
jpayne@69
|
478 A [i] . Simple_Adj = Olap;
|
jpayne@69
|
479 }
|
jpayne@69
|
480 }
|
jpayne@69
|
481 }
|
jpayne@69
|
482
|
jpayne@69
|
483 best = 0;
|
jpayne@69
|
484 for (i = 1; i < N; i ++)
|
jpayne@69
|
485 if (A [i] . Simple_Score > A [best] . Simple_Score)
|
jpayne@69
|
486 best = i;
|
jpayne@69
|
487 total = 0;
|
jpayne@69
|
488 hi = LONG_MIN;
|
jpayne@69
|
489 lo = LONG_MAX;
|
jpayne@69
|
490 for (i = best; i >= 0; i = A [i] . Simple_From)
|
jpayne@69
|
491 {
|
jpayne@69
|
492 A [i] . Good = TRUE;
|
jpayne@69
|
493 total += A [i] . Len;
|
jpayne@69
|
494 if (A [i] . Start1 + A [i] . Len > hi)
|
jpayne@69
|
495 hi = A [i] . Start1 + A [i] . Len;
|
jpayne@69
|
496 if (A [i] . Start1 < lo)
|
jpayne@69
|
497 lo = A [i] . Start1;
|
jpayne@69
|
498 }
|
jpayne@69
|
499 extent = hi - lo;
|
jpayne@69
|
500
|
jpayne@69
|
501 if (Use_Extents)
|
jpayne@69
|
502 score = extent;
|
jpayne@69
|
503 else
|
jpayne@69
|
504 score = total;
|
jpayne@69
|
505 if (score >= Min_Output_Score)
|
jpayne@69
|
506 {
|
jpayne@69
|
507 print_ct ++;
|
jpayne@69
|
508 fputs (label, stdout);
|
jpayne@69
|
509 prev = -1;
|
jpayne@69
|
510 for (i = 0; i < N; i ++)
|
jpayne@69
|
511 if (A [i] . Good)
|
jpayne@69
|
512 {
|
jpayne@69
|
513 if (prev == -1)
|
jpayne@69
|
514 printf ("%8ld %8ld %6ld %7s %6s %6s\n",
|
jpayne@69
|
515 A [i] . Start1, A [i] . Start2, A [i] . Len,
|
jpayne@69
|
516 "none", "-", "-");
|
jpayne@69
|
517 else
|
jpayne@69
|
518 {
|
jpayne@69
|
519 adj = A [i] . Simple_Adj;
|
jpayne@69
|
520 printf ("%8ld %8ld %6ld",
|
jpayne@69
|
521 A [i] . Start1 + adj, A [i] . Start2 + adj,
|
jpayne@69
|
522 A [i] . Len - adj);
|
jpayne@69
|
523 if (adj == 0)
|
jpayne@69
|
524 printf (" %7s", "none");
|
jpayne@69
|
525 else
|
jpayne@69
|
526 printf (" %7ld", - adj);
|
jpayne@69
|
527 printf (" %6ld %6ld\n",
|
jpayne@69
|
528 A [i] . Start1 + adj - A [prev] . Start1 - A [prev] . Len,
|
jpayne@69
|
529 A [i] . Start2 + adj - A [prev] . Start2 - A [prev] . Len);
|
jpayne@69
|
530 }
|
jpayne@69
|
531 prev = i;
|
jpayne@69
|
532 }
|
jpayne@69
|
533 label = "#\n";
|
jpayne@69
|
534 }
|
jpayne@69
|
535
|
jpayne@69
|
536 for (i = k = 0; i < N; i ++)
|
jpayne@69
|
537 if (! A [i] . Good)
|
jpayne@69
|
538 {
|
jpayne@69
|
539 if (i != k)
|
jpayne@69
|
540 {
|
jpayne@69
|
541 A [k] = A [i];
|
jpayne@69
|
542 }
|
jpayne@69
|
543 k ++;
|
jpayne@69
|
544 }
|
jpayne@69
|
545 N = k;
|
jpayne@69
|
546 } while (N > 0);
|
jpayne@69
|
547
|
jpayne@69
|
548 return print_ct;
|
jpayne@69
|
549 }
|
jpayne@69
|
550
|
jpayne@69
|
551
|
jpayne@69
|
552
|
jpayne@69
|
553
|
jpayne@69
|
554 static void Process_Matches
|
jpayne@69
|
555 (Match_t * A, int N, char * label)
|
jpayne@69
|
556
|
jpayne@69
|
557 // Process matches A [1 .. N] and output them after
|
jpayne@69
|
558 // a line containing label .
|
jpayne@69
|
559
|
jpayne@69
|
560 {
|
jpayne@69
|
561 long int cluster_size, sep;
|
jpayne@69
|
562 int print_ct = 0;
|
jpayne@69
|
563 int a, b, i, j;
|
jpayne@69
|
564
|
jpayne@69
|
565 if (N <= 0)
|
jpayne@69
|
566 {
|
jpayne@69
|
567 fputs (label, stdout);
|
jpayne@69
|
568 return;
|
jpayne@69
|
569 }
|
jpayne@69
|
570
|
jpayne@69
|
571 // Use Union-Find to create connected-components based on
|
jpayne@69
|
572 // separation and similar diagonals between matches
|
jpayne@69
|
573
|
jpayne@69
|
574 for (i = 1; i <= N; i ++)
|
jpayne@69
|
575 UF [i] = -1;
|
jpayne@69
|
576
|
jpayne@69
|
577 qsort (A + 1, N, sizeof (Match_t), By_Start2);
|
jpayne@69
|
578
|
jpayne@69
|
579 Filter_Matches (A + 1, N);
|
jpayne@69
|
580
|
jpayne@69
|
581 for (i = 1; i < N; i ++)
|
jpayne@69
|
582 {
|
jpayne@69
|
583 long int i_end = A [i] . Start2 + A [i] . Len;
|
jpayne@69
|
584 long int i_diag = A [i] . Start2 - A [i] . Start1;
|
jpayne@69
|
585
|
jpayne@69
|
586 for (j = i + 1; j <= N; j ++)
|
jpayne@69
|
587 {
|
jpayne@69
|
588 long int diag_diff;
|
jpayne@69
|
589
|
jpayne@69
|
590 sep = A [j] . Start2 - i_end;
|
jpayne@69
|
591 if (sep > Max_Separation)
|
jpayne@69
|
592 break;
|
jpayne@69
|
593
|
jpayne@69
|
594 diag_diff = abs ((A [j] . Start2 - A [j] . Start1) - i_diag);
|
jpayne@69
|
595 if (diag_diff <= Max (Fixed_Separation, long (Separation_Factor * sep)))
|
jpayne@69
|
596 {
|
jpayne@69
|
597 a = Find (i);
|
jpayne@69
|
598 b = Find (j);
|
jpayne@69
|
599 if (a != b)
|
jpayne@69
|
600 Union (a, b);
|
jpayne@69
|
601 }
|
jpayne@69
|
602 }
|
jpayne@69
|
603 }
|
jpayne@69
|
604
|
jpayne@69
|
605 // Set the cluster id of each match
|
jpayne@69
|
606
|
jpayne@69
|
607 for (i = 1; i <= N; i ++)
|
jpayne@69
|
608 A [i] . cluster_id = Find (i);
|
jpayne@69
|
609
|
jpayne@69
|
610 qsort (A + 1, N, sizeof (Match_t), By_Cluster);
|
jpayne@69
|
611
|
jpayne@69
|
612 for (i = 1; i <= N; i += cluster_size)
|
jpayne@69
|
613 {
|
jpayne@69
|
614
|
jpayne@69
|
615 for (j = i + 1; j <= N && A [i] . cluster_id == A [j] . cluster_id; j ++)
|
jpayne@69
|
616 ;
|
jpayne@69
|
617 cluster_size = j - i;
|
jpayne@69
|
618 print_ct += Process_Cluster (A + i, cluster_size, label);
|
jpayne@69
|
619 if (print_ct > 0)
|
jpayne@69
|
620 label = "#\n";
|
jpayne@69
|
621 }
|
jpayne@69
|
622
|
jpayne@69
|
623 #if 0
|
jpayne@69
|
624 // Find the largest cluster
|
jpayne@69
|
625
|
jpayne@69
|
626 j = 1;
|
jpayne@69
|
627 for (i = 2; i <= N; i ++)
|
jpayne@69
|
628 if (UF [i] < UF [j])
|
jpayne@69
|
629 j = i;
|
jpayne@69
|
630
|
jpayne@69
|
631 // j is now the largest cluster
|
jpayne@69
|
632
|
jpayne@69
|
633 cluster_size = - UF [j];
|
jpayne@69
|
634
|
jpayne@69
|
635 for (i = k = 1; i <= N; i ++)
|
jpayne@69
|
636 if (Find (i) == j)
|
jpayne@69
|
637 {
|
jpayne@69
|
638 if (i != k)
|
jpayne@69
|
639 {
|
jpayne@69
|
640 Match_t tmp = A [i];
|
jpayne@69
|
641
|
jpayne@69
|
642 A [i] = A [k];
|
jpayne@69
|
643 A [k] = tmp;
|
jpayne@69
|
644 }
|
jpayne@69
|
645 k ++;
|
jpayne@69
|
646 }
|
jpayne@69
|
647
|
jpayne@69
|
648 assert (cluster_size == k - 1);
|
jpayne@69
|
649 #endif
|
jpayne@69
|
650
|
jpayne@69
|
651 if (print_ct == 0)
|
jpayne@69
|
652 fputs (label, stdout);
|
jpayne@69
|
653
|
jpayne@69
|
654 return;
|
jpayne@69
|
655 }
|
jpayne@69
|
656
|
jpayne@69
|
657
|
jpayne@69
|
658
|
jpayne@69
|
659 static void Union
|
jpayne@69
|
660 (int a, int b)
|
jpayne@69
|
661
|
jpayne@69
|
662 // Union the sets whose id's are a and b in UF .
|
jpayne@69
|
663
|
jpayne@69
|
664 {
|
jpayne@69
|
665 assert (UF [a] < 0 && UF [b] < 0);
|
jpayne@69
|
666
|
jpayne@69
|
667 if (UF [a] < UF [b])
|
jpayne@69
|
668 {
|
jpayne@69
|
669 UF [a] += UF [b];
|
jpayne@69
|
670 UF [b] = a;
|
jpayne@69
|
671 }
|
jpayne@69
|
672 else
|
jpayne@69
|
673 {
|
jpayne@69
|
674 UF [b] += UF [a];
|
jpayne@69
|
675 UF [a] = b;
|
jpayne@69
|
676 }
|
jpayne@69
|
677
|
jpayne@69
|
678 return;
|
jpayne@69
|
679 }
|
jpayne@69
|
680
|
jpayne@69
|
681
|
jpayne@69
|
682
|
jpayne@69
|
683 static void Usage
|
jpayne@69
|
684 (char * command)
|
jpayne@69
|
685
|
jpayne@69
|
686 // Print to stderr description of options and command line for
|
jpayne@69
|
687 // this program. command is the command that was used to
|
jpayne@69
|
688 // invoke it.
|
jpayne@69
|
689
|
jpayne@69
|
690 {
|
jpayne@69
|
691 fprintf (stderr,
|
jpayne@69
|
692 "USAGE: %s [-d <DiagDiff>] [-f <DiagFactor>] [-l <MatchLen>]\n"
|
jpayne@69
|
693 " [-s <MaxSeparation>]\n"
|
jpayne@69
|
694 "\n"
|
jpayne@69
|
695 "Clusters MUMs based on diagonals and separation.\n"
|
jpayne@69
|
696 "Input is from stdin in format produced by mummer.\n"
|
jpayne@69
|
697 "Ouput goes to stdout.\n"
|
jpayne@69
|
698 "\n"
|
jpayne@69
|
699 "Options:\n"
|
jpayne@69
|
700 "-C Check that fasta header labels alternately have \"Reverse\"\n"
|
jpayne@69
|
701 "-d num Fixed diagonal difference to join matches\n"
|
jpayne@69
|
702 "-e Use extent of match (end - start) rather than sum of piece\n"
|
jpayne@69
|
703 " lengths to determine length of cluster\n"
|
jpayne@69
|
704 "-f num Fraction of separation for diagonal difference\n"
|
jpayne@69
|
705 "-l num Minimum length of cluster match\n"
|
jpayne@69
|
706 "-s num Maximum separation between matches in cluster\n",
|
jpayne@69
|
707 command);
|
jpayne@69
|
708
|
jpayne@69
|
709 return;
|
jpayne@69
|
710 }
|
jpayne@69
|
711
|
jpayne@69
|
712
|
jpayne@69
|
713
|