Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-diff.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 //----------------------------------------------------------------------------- | |
2 // Programmer: Adam M Phillippy, University of Maryland | |
3 // File: show-diff.cc | |
4 // Date: 12 / 01 / 2006 | |
5 // | |
6 // Usage: show-diff [options] <deltafile> | |
7 // Try 'show-diff -h' for more information | |
8 // | |
9 //----------------------------------------------------------------------------- | |
10 | |
11 #include "delta.hh" | |
12 #include "tigrinc.hh" | |
13 #include <string> | |
14 #include <cstdlib> | |
15 #include <cassert> | |
16 #include <climits> | |
17 #include <algorithm> | |
18 #include <vector> | |
19 | |
20 using namespace std; | |
21 | |
22 | |
23 //================================================================ Options ==== | |
24 string OPT_AlignName; // delta file name | |
25 bool OPT_AMOS = false; // AMOS output | |
26 bool OPT_RefDiff = false; // output reference diff | |
27 bool OPT_QryDiff = false; // output query diff | |
28 bool OPT_PrintHeader = true; // -H option | |
29 | |
30 | |
31 //=========================================================== Declarations ==== | |
32 struct EdgeletLoQCmp_t | |
33 //!< Sorts query by lo coord, lo to hi | |
34 { | |
35 bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const | |
36 { return ( i->loQ < j->loQ ); } | |
37 }; | |
38 | |
39 struct EdgeletIdQLoQCmp_t | |
40 //!< Sorts query by ID and lo coord, lo to hi | |
41 { | |
42 bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const | |
43 { | |
44 if ( i->edge && j->edge ) | |
45 { | |
46 if ( i->edge->qrynode < j->edge->qrynode ) | |
47 return true; | |
48 else if ( i->edge->qrynode > j->edge->qrynode ) | |
49 return false; | |
50 } | |
51 else if ( !i->edge && j->edge ) | |
52 return true; | |
53 else if ( i->edge && !j->edge ) | |
54 return false; | |
55 return ( i->loQ < j->loQ ); | |
56 } | |
57 }; | |
58 | |
59 struct EdgeletLoRCmp_t | |
60 //!< Sorts reference by lo coord, lo to hi | |
61 { | |
62 bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const | |
63 { return ( i->loR < j->loR ); } | |
64 }; | |
65 | |
66 struct EdgeletIdRLoRCmp_t | |
67 //!< Sorts reference by ID and lo coord, lo to hi | |
68 { | |
69 bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const | |
70 { | |
71 if ( i->edge && j->edge ) | |
72 { | |
73 if ( i->edge->refnode < j->edge->refnode ) | |
74 return true; | |
75 else if ( i->edge->refnode > j->edge->refnode ) | |
76 return false; | |
77 } | |
78 else if ( !i->edge && j->edge ) | |
79 return true; | |
80 else if ( i->edge && !j->edge ) | |
81 return false; | |
82 return ( i->loR < j->loR ); | |
83 } | |
84 }; | |
85 | |
86 | |
87 void PrintDiff(DeltaGraph_t & graph); | |
88 void PrintBrk(const char* seq, long s, long e); | |
89 void PrintSeqJmp(const char* seq, | |
90 const char* seqp, const char* seqn, | |
91 long s, long e); | |
92 void PrintLisJmp(const char* seq, long s, long e); | |
93 void PrintInv(const char* seq, long s, long e); | |
94 void PrintGap(const char* seq, long s, long e, long gap1, long gap2); | |
95 void PrintDup(const char* seq, long s, long e); | |
96 void ParseArgs(int argc, char ** argv); | |
97 void PrintHelp(const char * s); | |
98 void PrintUsage(const char * s); | |
99 | |
100 | |
101 //============================================================ Definitions ==== | |
102 //------------------------------------------------------------------- main ---- | |
103 int main(int argc, char **argv) | |
104 { | |
105 DeltaGraph_t graph; | |
106 | |
107 ParseArgs(argc, argv); | |
108 | |
109 //-- Get M-to-M alignment, i.e. union of QLIS and RLIS | |
110 graph.build(OPT_AlignName, false); | |
111 graph.flagMtoM(); | |
112 graph.clean(); | |
113 | |
114 //-- Output diff | |
115 PrintDiff(graph); | |
116 | |
117 return EXIT_SUCCESS; | |
118 } | |
119 | |
120 | |
121 //-------------------------------------------------------------- PrintDiff ---- | |
122 void PrintDiff(DeltaGraph_t & graph) | |
123 { | |
124 if ( OPT_PrintHeader ) | |
125 { | |
126 printf ("%s %s\n%s\n\n", | |
127 graph . refpath . c_str( ), graph . qrypath . c_str( ), | |
128 graph . datatype == NUCMER_DATA ? "NUCMER" : "PROMER"); | |
129 printf ("%s\t%s\t%s\t%s\t%s\n", | |
130 "[SEQ]", "[TYPE]", "[S1]", "[E1]", "[LEN 1]"); | |
131 } | |
132 | |
133 const char* refid; | |
134 const char* qryid; | |
135 long i,j; | |
136 long nAligns, gapR, gapQ; | |
137 DeltaEdgelet_t lpad, rpad; // padding for the alignment vector | |
138 lpad.isRLIS = rpad.isRLIS = true; | |
139 lpad.isQLIS = rpad.isQLIS = true; | |
140 lpad.loR = lpad.hiR = lpad.loQ = lpad.hiQ = 0; | |
141 | |
142 DeltaEdgelet_t *A, *PA, *PGA; // alignment, prev, prev global | |
143 vector<DeltaEdgelet_t *> aligns; | |
144 | |
145 map<string, DeltaNode_t>::const_iterator mi; | |
146 vector<DeltaEdge_t *>::const_iterator ei; | |
147 vector<DeltaEdgelet_t *>::iterator eli; | |
148 | |
149 //-- For each reference sequence | |
150 if ( OPT_RefDiff ) | |
151 for ( mi = graph.refnodes.begin(); mi != graph.refnodes.end(); ++ mi ) | |
152 { | |
153 refid = mi->first.c_str(); | |
154 | |
155 //-- Collect all alignments for this reference sequence | |
156 aligns.clear(); | |
157 for ( ei = (mi->second).edges.begin(); | |
158 ei != (mi->second).edges.end(); ++ei ) | |
159 for ( eli = (*ei)->edgelets.begin(); | |
160 eli != (*ei)->edgelets.end(); ++eli ) | |
161 aligns.push_back(*eli); | |
162 | |
163 //-- Pad the front and back of the alignment vector | |
164 rpad.loR = rpad.hiR = mi->second.len + 1; | |
165 rpad.loQ = rpad.hiQ = LONG_MAX; | |
166 aligns.push_back(&lpad); | |
167 aligns.push_back(&rpad); | |
168 | |
169 nAligns = aligns.size(); | |
170 | |
171 //-- OVERRIDE *stpc* value with loQ QLIS ordering | |
172 sort(aligns.begin(), aligns.end(), EdgeletIdQLoQCmp_t()); | |
173 for ( i = 0, j = 0; i != nAligns; ++i ) | |
174 aligns[i]->stpc = aligns[i]->isQLIS ? j++ : -1; | |
175 | |
176 //-- Sort by reference order | |
177 sort(aligns.begin(), aligns.end(), EdgeletLoRCmp_t()); | |
178 assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad ); | |
179 | |
180 //-- Walk reference cover alignments, low to high | |
181 PA = PGA = aligns[0]; | |
182 for ( i = 1; i != nAligns; ++i ) | |
183 { | |
184 //-- Only interested in reference covering alignments | |
185 if ( !aligns[i]->isRLIS ) continue; | |
186 | |
187 A = aligns[i]; | |
188 gapR = A->loR - PA->hiR - 1; | |
189 | |
190 //-- Reached end of alignments | |
191 if ( A->edge == NULL ) | |
192 { | |
193 PrintBrk(refid, PA->hiR+1, A->loR-1); | |
194 } | |
195 //-- 1-to-1 alignment | |
196 else if ( A->isQLIS && A->edge == PGA->edge ) | |
197 { | |
198 //-- Jump within Q | |
199 if ( A->slope() != PGA->slope() || | |
200 A->stpc != PGA->stpc + PGA->slope() ) | |
201 { | |
202 if ( A->slope() == PGA->slope() ) | |
203 PrintLisJmp(refid, PA->hiR+1, A->loR-1); | |
204 else | |
205 PrintInv(refid, PA->hiR+1, A->loR-1); | |
206 } | |
207 //-- Lined up, nothing in between | |
208 else if ( PA == PGA ) | |
209 { | |
210 gapQ = A->isPositive() ? | |
211 A->loQ - PGA->hiQ - 1 : | |
212 PGA->loQ - A->hiQ - 1; | |
213 PrintGap(refid, PA->hiR+1, A->loR-1, gapR, gapQ); | |
214 } | |
215 //-- Lined up, duplication in between | |
216 else | |
217 { | |
218 PrintBrk(refid, PA->hiR+1, A->loR-1); | |
219 } | |
220 } | |
221 //-- Not in QLIS? Must be a duplication in R | |
222 else if ( !A->isQLIS ) | |
223 { | |
224 PrintBrk(refid, PA->hiR+1, A->loR-1); | |
225 PrintDup(refid, A->loR, A->hiR); | |
226 } | |
227 //-- A->edge != PGA->edge? Jump to different query sequence | |
228 else if ( PGA->edge != NULL ) | |
229 { | |
230 PrintSeqJmp(refid, | |
231 PGA->edge->qrynode->id->c_str(), | |
232 A->edge->qrynode->id->c_str(), | |
233 PA->hiR+1, A->loR-1); | |
234 } | |
235 //-- Gap before first alignment | |
236 else | |
237 { | |
238 PrintBrk(refid, PA->hiR+1, A->loR-1); | |
239 } | |
240 | |
241 if ( A->isQLIS ) | |
242 PGA = A; | |
243 PA = A; | |
244 } | |
245 } | |
246 | |
247 | |
248 //---------- WARNING! Same code as above but Q's for R's and R's for Q's | |
249 if ( OPT_QryDiff ) | |
250 for ( mi = graph.qrynodes.begin(); mi != graph.qrynodes.end(); ++ mi ) | |
251 { | |
252 qryid = mi->first.c_str(); | |
253 | |
254 aligns.clear(); | |
255 for ( ei = (mi->second).edges.begin(); | |
256 ei != (mi->second).edges.end(); ++ei ) | |
257 for ( eli = (*ei)->edgelets.begin(); | |
258 eli != (*ei)->edgelets.end(); ++eli ) | |
259 aligns.push_back(*eli); | |
260 | |
261 rpad.loQ = rpad.hiQ = mi->second.len + 1; | |
262 rpad.loR = rpad.hiR = LONG_MAX; | |
263 aligns.push_back(&lpad); | |
264 aligns.push_back(&rpad); | |
265 | |
266 nAligns = aligns.size(); | |
267 | |
268 sort(aligns.begin(), aligns.end(), EdgeletIdRLoRCmp_t()); | |
269 for ( i = 0, j = 0; i != nAligns; ++i ) | |
270 aligns[i]->stpc = aligns[i]->isRLIS ? j++ : -1; | |
271 | |
272 sort(aligns.begin(), aligns.end(), EdgeletLoQCmp_t()); | |
273 assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad ); | |
274 | |
275 PA = PGA = aligns[0]; | |
276 for ( i = 1; i != nAligns; ++i ) | |
277 { | |
278 if ( !aligns[i]->isQLIS ) continue; | |
279 | |
280 A = aligns[i]; | |
281 gapQ = A->loQ - PA->hiQ - 1; | |
282 | |
283 if ( A->edge == NULL ) | |
284 { | |
285 PrintBrk(qryid, PA->hiQ+1, A->loQ-1); | |
286 } | |
287 else if ( A->isRLIS && A->edge == PGA->edge ) | |
288 { | |
289 if ( A->slope() != PGA->slope() || | |
290 A->stpc != PGA->stpc + PGA->slope() ) | |
291 { | |
292 if ( A->slope() == PGA->slope() ) | |
293 PrintLisJmp(qryid, PA->hiQ+1, A->loQ-1); | |
294 else | |
295 PrintInv(qryid, PA->hiQ+1, A->loQ-1); | |
296 } | |
297 else if ( PA == PGA ) | |
298 { | |
299 gapR = A->isPositive() ? | |
300 A->loR - PGA->hiR - 1 : | |
301 PGA->loR - A->hiR - 1; | |
302 PrintGap(qryid, PA->hiQ+1, A->loQ-1, gapQ, gapR); | |
303 } | |
304 else | |
305 { | |
306 PrintBrk(qryid, PA->hiQ+1, A->loQ-1); | |
307 } | |
308 } | |
309 else if ( !A->isRLIS ) | |
310 { | |
311 PrintBrk(qryid, PA->hiQ+1, A->loQ-1); | |
312 PrintDup(qryid, A->loQ, A->hiQ); | |
313 } | |
314 else if ( PGA->edge != NULL ) | |
315 { | |
316 PrintSeqJmp(qryid, | |
317 PA->edge->refnode->id->c_str(), | |
318 A->edge->refnode->id->c_str(), | |
319 PA->hiQ+1, A->loQ-1); | |
320 } | |
321 else | |
322 { | |
323 PrintBrk(qryid, PA->hiQ+1, A->loQ-1); | |
324 } | |
325 | |
326 if ( A->isRLIS ) | |
327 PGA = A; | |
328 PA = A; | |
329 } | |
330 } | |
331 } | |
332 | |
333 | |
334 void PrintBrk(const char* seq, long s, long e) | |
335 { | |
336 if ( e-s+1 <= 0 ) return; | |
337 | |
338 if ( !OPT_AMOS ) | |
339 printf("%s\tBRK\t%ld\t%ld\t%ld\n", | |
340 seq, s, e, e-s+1); | |
341 else | |
342 printf("%s\tA\t%ld\t%ld\tBRK\t%ld\t%ld\t%ld\n", | |
343 seq, s, e, s, e, e-s+1); | |
344 } | |
345 | |
346 void PrintSeqJmp(const char* seq, | |
347 const char* seqp, | |
348 const char* seqn, | |
349 long s, long e) | |
350 { | |
351 if ( !OPT_AMOS ) | |
352 printf("%s\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n", | |
353 seq, s, e, e-s+1, seqp, seqn); | |
354 else | |
355 printf("%s\tA\t%ld\t%ld\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n", | |
356 seq, s, e, s, e, e-s+1, seqp, seqn); | |
357 } | |
358 | |
359 void PrintLisJmp(const char* seq, long s, long e) | |
360 { | |
361 if ( !OPT_AMOS ) | |
362 printf("%s\tJMP\t%ld\t%ld\t%ld\n", | |
363 seq, s, e, e-s+1); | |
364 else | |
365 printf("%s\tA\t%ld\t%ld\tJMP\t%ld\t%ld\t%ld\n", | |
366 seq, s, e, s, e, e-s+1); | |
367 } | |
368 | |
369 void PrintInv(const char* seq, long s, long e) | |
370 { | |
371 if ( !OPT_AMOS ) | |
372 printf("%s\tINV\t%ld\t%ld\t%ld\n", | |
373 seq, s, e, e-s+1); | |
374 else | |
375 printf("%s\tA\t%ld\t%ld\tINV\t%ld\t%ld\t%ld\n", | |
376 seq, s, e, s, e, e-s+1); | |
377 } | |
378 | |
379 void PrintGap(const char* seq, long s, long e, long gap1, long gap2) | |
380 { | |
381 if ( !OPT_AMOS ) | |
382 printf("%s\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n", | |
383 seq, s, e, gap1, gap2, gap1-gap2); | |
384 else | |
385 printf("%s\tA\t%ld\t%ld\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n", | |
386 seq, s, e, s, e, gap1, gap2, gap1-gap2); | |
387 } | |
388 | |
389 void PrintDup(const char* seq, long s, long e) | |
390 { | |
391 if ( !OPT_AMOS ) | |
392 printf("%s\tDUP\t%ld\t%ld\t%ld\n", | |
393 seq, s, e, e-s+1); | |
394 else | |
395 printf("%s\tA\t%ld\t%ld\tDUP\t%ld\t%ld\t%ld\n", | |
396 seq, s, e, s, e, e-s+1); | |
397 } | |
398 | |
399 | |
400 //-------------------------------------------------------------- ParseArgs ---- | |
401 void ParseArgs (int argc, char ** argv) | |
402 { | |
403 int ch, errflg = 0; | |
404 optarg = NULL; | |
405 | |
406 while ( !errflg && | |
407 ((ch = getopt (argc, argv, "fhHqr")) != EOF) ) | |
408 switch (ch) | |
409 { | |
410 case 'f': | |
411 OPT_AMOS = true; | |
412 break; | |
413 | |
414 case 'h': | |
415 PrintHelp (argv[0]); | |
416 exit (EXIT_SUCCESS); | |
417 break; | |
418 | |
419 case 'H': | |
420 OPT_PrintHeader = false; | |
421 break; | |
422 | |
423 case 'q': | |
424 OPT_QryDiff = true; | |
425 break; | |
426 | |
427 case 'r': | |
428 OPT_RefDiff = true; | |
429 break; | |
430 | |
431 default: | |
432 errflg ++; | |
433 } | |
434 | |
435 if ( OPT_RefDiff && OPT_QryDiff ) | |
436 { | |
437 cerr << "ERROR: -r and -q options cannot be combined\n"; | |
438 errflg++; | |
439 } | |
440 if ( !OPT_RefDiff && !OPT_QryDiff ) | |
441 OPT_RefDiff = true; | |
442 | |
443 if ( errflg > 0 || optind != argc - 1 ) | |
444 { | |
445 PrintUsage(argv[0]); | |
446 cerr << "Try '" << argv[0] << " -h' for more information.\n"; | |
447 exit(EXIT_FAILURE); | |
448 } | |
449 | |
450 OPT_AlignName = argv [optind ++]; | |
451 } | |
452 | |
453 | |
454 //-------------------------------------------------------------- PrintHelp ---- | |
455 void PrintHelp (const char * s) | |
456 { | |
457 PrintUsage (s); | |
458 cerr | |
459 << "-f Output diff information as AMOS features\n" | |
460 << "-h Display help information\n" | |
461 << "-H Do not show header\n" | |
462 << "-q Show diff information for queries\n" | |
463 << "-r Show diff information for references (default)\n" | |
464 << endl; | |
465 | |
466 cerr | |
467 << " Outputs a list of structural differences for each sequence in\n" | |
468 << "the reference and query, sorted by position. For a reference\n" | |
469 << "sequence R, and its matching query sequence Q, differences are\n" | |
470 << "categorized as GAP (gap between two mutually consistent alignments),\n" | |
471 << "DUP (inserted duplication), BRK (other inserted sequence), JMP\n" | |
472 << "(rearrangement), INV (rearrangement with inversion), SEQ\n" | |
473 << "(rearrangement with another sequence). The first five columns of\n" | |
474 << "the output are seq ID, feature type, feature start, feature end,\n" | |
475 << "and feature length. Additional columns are added depending on the\n" | |
476 << "feature type. Negative feature lengths indicate overlapping adjacent\n" | |
477 << "alignment blocks.\n" | |
478 << " IDR GAP gap-start gap-end gap-length-R gap-length-Q gap-diff\n" | |
479 << " IDR DUP dup-start dup-end dup-length\n" | |
480 << " IDR BRK gap-start gap-end gap-length\n" | |
481 << " IDR JMP gap-start gap-end gap-length\n" | |
482 << " IDR INV gap-start gap-end gap-length\n" | |
483 << " IDR SEQ gap-start gap-end gap-length prev-sequence next-sequence\n" | |
484 << "Positions always reference the sequence with the given ID. The\n" | |
485 << "sum of the fifth column (ignoring negative values) is the total\n" | |
486 << "amount of inserted sequence. Summing the fifth column after removing\n" | |
487 << "DUP features is total unique inserted sequence. Note that unaligned\n" | |
488 << "sequence are not counted, and could represent additional \"unique\"\n" | |
489 << "sequences. See documentation for tips on how to interpret these\n" | |
490 << "alignment break features.\n" | |
491 << endl; | |
492 | |
493 return; | |
494 } | |
495 | |
496 | |
497 //------------------------------------------------------------- PrintUsage ---- | |
498 void PrintUsage (const char * s) | |
499 { | |
500 cerr | |
501 << "\nUSAGE: " << s << " [options] <deltafile>\n\n"; | |
502 return; | |
503 } |