comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-diff.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 //-----------------------------------------------------------------------------
2 // Programmer: Adam M Phillippy, University of Maryland
3 // File: show-diff.cc
4 // Date: 12 / 01 / 2006
5 //
6 // Usage: show-diff [options] <deltafile>
7 // Try 'show-diff -h' for more information
8 //
9 //-----------------------------------------------------------------------------
10
11 #include "delta.hh"
12 #include "tigrinc.hh"
13 #include <string>
14 #include <cstdlib>
15 #include <cassert>
16 #include <climits>
17 #include <algorithm>
18 #include <vector>
19
20 using namespace std;
21
22
23 //================================================================ Options ====
24 string OPT_AlignName; // delta file name
25 bool OPT_AMOS = false; // AMOS output
26 bool OPT_RefDiff = false; // output reference diff
27 bool OPT_QryDiff = false; // output query diff
28 bool OPT_PrintHeader = true; // -H option
29
30
31 //=========================================================== Declarations ====
32 struct EdgeletLoQCmp_t
33 //!< Sorts query by lo coord, lo to hi
34 {
35 bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
36 { return ( i->loQ < j->loQ ); }
37 };
38
39 struct EdgeletIdQLoQCmp_t
40 //!< Sorts query by ID and lo coord, lo to hi
41 {
42 bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
43 {
44 if ( i->edge && j->edge )
45 {
46 if ( i->edge->qrynode < j->edge->qrynode )
47 return true;
48 else if ( i->edge->qrynode > j->edge->qrynode )
49 return false;
50 }
51 else if ( !i->edge && j->edge )
52 return true;
53 else if ( i->edge && !j->edge )
54 return false;
55 return ( i->loQ < j->loQ );
56 }
57 };
58
59 struct EdgeletLoRCmp_t
60 //!< Sorts reference by lo coord, lo to hi
61 {
62 bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
63 { return ( i->loR < j->loR ); }
64 };
65
66 struct EdgeletIdRLoRCmp_t
67 //!< Sorts reference by ID and lo coord, lo to hi
68 {
69 bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
70 {
71 if ( i->edge && j->edge )
72 {
73 if ( i->edge->refnode < j->edge->refnode )
74 return true;
75 else if ( i->edge->refnode > j->edge->refnode )
76 return false;
77 }
78 else if ( !i->edge && j->edge )
79 return true;
80 else if ( i->edge && !j->edge )
81 return false;
82 return ( i->loR < j->loR );
83 }
84 };
85
86
87 void PrintDiff(DeltaGraph_t & graph);
88 void PrintBrk(const char* seq, long s, long e);
89 void PrintSeqJmp(const char* seq,
90 const char* seqp, const char* seqn,
91 long s, long e);
92 void PrintLisJmp(const char* seq, long s, long e);
93 void PrintInv(const char* seq, long s, long e);
94 void PrintGap(const char* seq, long s, long e, long gap1, long gap2);
95 void PrintDup(const char* seq, long s, long e);
96 void ParseArgs(int argc, char ** argv);
97 void PrintHelp(const char * s);
98 void PrintUsage(const char * s);
99
100
101 //============================================================ Definitions ====
102 //------------------------------------------------------------------- main ----
103 int main(int argc, char **argv)
104 {
105 DeltaGraph_t graph;
106
107 ParseArgs(argc, argv);
108
109 //-- Get M-to-M alignment, i.e. union of QLIS and RLIS
110 graph.build(OPT_AlignName, false);
111 graph.flagMtoM();
112 graph.clean();
113
114 //-- Output diff
115 PrintDiff(graph);
116
117 return EXIT_SUCCESS;
118 }
119
120
121 //-------------------------------------------------------------- PrintDiff ----
122 void PrintDiff(DeltaGraph_t & graph)
123 {
124 if ( OPT_PrintHeader )
125 {
126 printf ("%s %s\n%s\n\n",
127 graph . refpath . c_str( ), graph . qrypath . c_str( ),
128 graph . datatype == NUCMER_DATA ? "NUCMER" : "PROMER");
129 printf ("%s\t%s\t%s\t%s\t%s\n",
130 "[SEQ]", "[TYPE]", "[S1]", "[E1]", "[LEN 1]");
131 }
132
133 const char* refid;
134 const char* qryid;
135 long i,j;
136 long nAligns, gapR, gapQ;
137 DeltaEdgelet_t lpad, rpad; // padding for the alignment vector
138 lpad.isRLIS = rpad.isRLIS = true;
139 lpad.isQLIS = rpad.isQLIS = true;
140 lpad.loR = lpad.hiR = lpad.loQ = lpad.hiQ = 0;
141
142 DeltaEdgelet_t *A, *PA, *PGA; // alignment, prev, prev global
143 vector<DeltaEdgelet_t *> aligns;
144
145 map<string, DeltaNode_t>::const_iterator mi;
146 vector<DeltaEdge_t *>::const_iterator ei;
147 vector<DeltaEdgelet_t *>::iterator eli;
148
149 //-- For each reference sequence
150 if ( OPT_RefDiff )
151 for ( mi = graph.refnodes.begin(); mi != graph.refnodes.end(); ++ mi )
152 {
153 refid = mi->first.c_str();
154
155 //-- Collect all alignments for this reference sequence
156 aligns.clear();
157 for ( ei = (mi->second).edges.begin();
158 ei != (mi->second).edges.end(); ++ei )
159 for ( eli = (*ei)->edgelets.begin();
160 eli != (*ei)->edgelets.end(); ++eli )
161 aligns.push_back(*eli);
162
163 //-- Pad the front and back of the alignment vector
164 rpad.loR = rpad.hiR = mi->second.len + 1;
165 rpad.loQ = rpad.hiQ = LONG_MAX;
166 aligns.push_back(&lpad);
167 aligns.push_back(&rpad);
168
169 nAligns = aligns.size();
170
171 //-- OVERRIDE *stpc* value with loQ QLIS ordering
172 sort(aligns.begin(), aligns.end(), EdgeletIdQLoQCmp_t());
173 for ( i = 0, j = 0; i != nAligns; ++i )
174 aligns[i]->stpc = aligns[i]->isQLIS ? j++ : -1;
175
176 //-- Sort by reference order
177 sort(aligns.begin(), aligns.end(), EdgeletLoRCmp_t());
178 assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad );
179
180 //-- Walk reference cover alignments, low to high
181 PA = PGA = aligns[0];
182 for ( i = 1; i != nAligns; ++i )
183 {
184 //-- Only interested in reference covering alignments
185 if ( !aligns[i]->isRLIS ) continue;
186
187 A = aligns[i];
188 gapR = A->loR - PA->hiR - 1;
189
190 //-- Reached end of alignments
191 if ( A->edge == NULL )
192 {
193 PrintBrk(refid, PA->hiR+1, A->loR-1);
194 }
195 //-- 1-to-1 alignment
196 else if ( A->isQLIS && A->edge == PGA->edge )
197 {
198 //-- Jump within Q
199 if ( A->slope() != PGA->slope() ||
200 A->stpc != PGA->stpc + PGA->slope() )
201 {
202 if ( A->slope() == PGA->slope() )
203 PrintLisJmp(refid, PA->hiR+1, A->loR-1);
204 else
205 PrintInv(refid, PA->hiR+1, A->loR-1);
206 }
207 //-- Lined up, nothing in between
208 else if ( PA == PGA )
209 {
210 gapQ = A->isPositive() ?
211 A->loQ - PGA->hiQ - 1 :
212 PGA->loQ - A->hiQ - 1;
213 PrintGap(refid, PA->hiR+1, A->loR-1, gapR, gapQ);
214 }
215 //-- Lined up, duplication in between
216 else
217 {
218 PrintBrk(refid, PA->hiR+1, A->loR-1);
219 }
220 }
221 //-- Not in QLIS? Must be a duplication in R
222 else if ( !A->isQLIS )
223 {
224 PrintBrk(refid, PA->hiR+1, A->loR-1);
225 PrintDup(refid, A->loR, A->hiR);
226 }
227 //-- A->edge != PGA->edge? Jump to different query sequence
228 else if ( PGA->edge != NULL )
229 {
230 PrintSeqJmp(refid,
231 PGA->edge->qrynode->id->c_str(),
232 A->edge->qrynode->id->c_str(),
233 PA->hiR+1, A->loR-1);
234 }
235 //-- Gap before first alignment
236 else
237 {
238 PrintBrk(refid, PA->hiR+1, A->loR-1);
239 }
240
241 if ( A->isQLIS )
242 PGA = A;
243 PA = A;
244 }
245 }
246
247
248 //---------- WARNING! Same code as above but Q's for R's and R's for Q's
249 if ( OPT_QryDiff )
250 for ( mi = graph.qrynodes.begin(); mi != graph.qrynodes.end(); ++ mi )
251 {
252 qryid = mi->first.c_str();
253
254 aligns.clear();
255 for ( ei = (mi->second).edges.begin();
256 ei != (mi->second).edges.end(); ++ei )
257 for ( eli = (*ei)->edgelets.begin();
258 eli != (*ei)->edgelets.end(); ++eli )
259 aligns.push_back(*eli);
260
261 rpad.loQ = rpad.hiQ = mi->second.len + 1;
262 rpad.loR = rpad.hiR = LONG_MAX;
263 aligns.push_back(&lpad);
264 aligns.push_back(&rpad);
265
266 nAligns = aligns.size();
267
268 sort(aligns.begin(), aligns.end(), EdgeletIdRLoRCmp_t());
269 for ( i = 0, j = 0; i != nAligns; ++i )
270 aligns[i]->stpc = aligns[i]->isRLIS ? j++ : -1;
271
272 sort(aligns.begin(), aligns.end(), EdgeletLoQCmp_t());
273 assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad );
274
275 PA = PGA = aligns[0];
276 for ( i = 1; i != nAligns; ++i )
277 {
278 if ( !aligns[i]->isQLIS ) continue;
279
280 A = aligns[i];
281 gapQ = A->loQ - PA->hiQ - 1;
282
283 if ( A->edge == NULL )
284 {
285 PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
286 }
287 else if ( A->isRLIS && A->edge == PGA->edge )
288 {
289 if ( A->slope() != PGA->slope() ||
290 A->stpc != PGA->stpc + PGA->slope() )
291 {
292 if ( A->slope() == PGA->slope() )
293 PrintLisJmp(qryid, PA->hiQ+1, A->loQ-1);
294 else
295 PrintInv(qryid, PA->hiQ+1, A->loQ-1);
296 }
297 else if ( PA == PGA )
298 {
299 gapR = A->isPositive() ?
300 A->loR - PGA->hiR - 1 :
301 PGA->loR - A->hiR - 1;
302 PrintGap(qryid, PA->hiQ+1, A->loQ-1, gapQ, gapR);
303 }
304 else
305 {
306 PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
307 }
308 }
309 else if ( !A->isRLIS )
310 {
311 PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
312 PrintDup(qryid, A->loQ, A->hiQ);
313 }
314 else if ( PGA->edge != NULL )
315 {
316 PrintSeqJmp(qryid,
317 PA->edge->refnode->id->c_str(),
318 A->edge->refnode->id->c_str(),
319 PA->hiQ+1, A->loQ-1);
320 }
321 else
322 {
323 PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
324 }
325
326 if ( A->isRLIS )
327 PGA = A;
328 PA = A;
329 }
330 }
331 }
332
333
334 void PrintBrk(const char* seq, long s, long e)
335 {
336 if ( e-s+1 <= 0 ) return;
337
338 if ( !OPT_AMOS )
339 printf("%s\tBRK\t%ld\t%ld\t%ld\n",
340 seq, s, e, e-s+1);
341 else
342 printf("%s\tA\t%ld\t%ld\tBRK\t%ld\t%ld\t%ld\n",
343 seq, s, e, s, e, e-s+1);
344 }
345
346 void PrintSeqJmp(const char* seq,
347 const char* seqp,
348 const char* seqn,
349 long s, long e)
350 {
351 if ( !OPT_AMOS )
352 printf("%s\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n",
353 seq, s, e, e-s+1, seqp, seqn);
354 else
355 printf("%s\tA\t%ld\t%ld\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n",
356 seq, s, e, s, e, e-s+1, seqp, seqn);
357 }
358
359 void PrintLisJmp(const char* seq, long s, long e)
360 {
361 if ( !OPT_AMOS )
362 printf("%s\tJMP\t%ld\t%ld\t%ld\n",
363 seq, s, e, e-s+1);
364 else
365 printf("%s\tA\t%ld\t%ld\tJMP\t%ld\t%ld\t%ld\n",
366 seq, s, e, s, e, e-s+1);
367 }
368
369 void PrintInv(const char* seq, long s, long e)
370 {
371 if ( !OPT_AMOS )
372 printf("%s\tINV\t%ld\t%ld\t%ld\n",
373 seq, s, e, e-s+1);
374 else
375 printf("%s\tA\t%ld\t%ld\tINV\t%ld\t%ld\t%ld\n",
376 seq, s, e, s, e, e-s+1);
377 }
378
379 void PrintGap(const char* seq, long s, long e, long gap1, long gap2)
380 {
381 if ( !OPT_AMOS )
382 printf("%s\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n",
383 seq, s, e, gap1, gap2, gap1-gap2);
384 else
385 printf("%s\tA\t%ld\t%ld\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n",
386 seq, s, e, s, e, gap1, gap2, gap1-gap2);
387 }
388
389 void PrintDup(const char* seq, long s, long e)
390 {
391 if ( !OPT_AMOS )
392 printf("%s\tDUP\t%ld\t%ld\t%ld\n",
393 seq, s, e, e-s+1);
394 else
395 printf("%s\tA\t%ld\t%ld\tDUP\t%ld\t%ld\t%ld\n",
396 seq, s, e, s, e, e-s+1);
397 }
398
399
400 //-------------------------------------------------------------- ParseArgs ----
401 void ParseArgs (int argc, char ** argv)
402 {
403 int ch, errflg = 0;
404 optarg = NULL;
405
406 while ( !errflg &&
407 ((ch = getopt (argc, argv, "fhHqr")) != EOF) )
408 switch (ch)
409 {
410 case 'f':
411 OPT_AMOS = true;
412 break;
413
414 case 'h':
415 PrintHelp (argv[0]);
416 exit (EXIT_SUCCESS);
417 break;
418
419 case 'H':
420 OPT_PrintHeader = false;
421 break;
422
423 case 'q':
424 OPT_QryDiff = true;
425 break;
426
427 case 'r':
428 OPT_RefDiff = true;
429 break;
430
431 default:
432 errflg ++;
433 }
434
435 if ( OPT_RefDiff && OPT_QryDiff )
436 {
437 cerr << "ERROR: -r and -q options cannot be combined\n";
438 errflg++;
439 }
440 if ( !OPT_RefDiff && !OPT_QryDiff )
441 OPT_RefDiff = true;
442
443 if ( errflg > 0 || optind != argc - 1 )
444 {
445 PrintUsage(argv[0]);
446 cerr << "Try '" << argv[0] << " -h' for more information.\n";
447 exit(EXIT_FAILURE);
448 }
449
450 OPT_AlignName = argv [optind ++];
451 }
452
453
454 //-------------------------------------------------------------- PrintHelp ----
455 void PrintHelp (const char * s)
456 {
457 PrintUsage (s);
458 cerr
459 << "-f Output diff information as AMOS features\n"
460 << "-h Display help information\n"
461 << "-H Do not show header\n"
462 << "-q Show diff information for queries\n"
463 << "-r Show diff information for references (default)\n"
464 << endl;
465
466 cerr
467 << " Outputs a list of structural differences for each sequence in\n"
468 << "the reference and query, sorted by position. For a reference\n"
469 << "sequence R, and its matching query sequence Q, differences are\n"
470 << "categorized as GAP (gap between two mutually consistent alignments),\n"
471 << "DUP (inserted duplication), BRK (other inserted sequence), JMP\n"
472 << "(rearrangement), INV (rearrangement with inversion), SEQ\n"
473 << "(rearrangement with another sequence). The first five columns of\n"
474 << "the output are seq ID, feature type, feature start, feature end,\n"
475 << "and feature length. Additional columns are added depending on the\n"
476 << "feature type. Negative feature lengths indicate overlapping adjacent\n"
477 << "alignment blocks.\n"
478 << " IDR GAP gap-start gap-end gap-length-R gap-length-Q gap-diff\n"
479 << " IDR DUP dup-start dup-end dup-length\n"
480 << " IDR BRK gap-start gap-end gap-length\n"
481 << " IDR JMP gap-start gap-end gap-length\n"
482 << " IDR INV gap-start gap-end gap-length\n"
483 << " IDR SEQ gap-start gap-end gap-length prev-sequence next-sequence\n"
484 << "Positions always reference the sequence with the given ID. The\n"
485 << "sum of the fifth column (ignoring negative values) is the total\n"
486 << "amount of inserted sequence. Summing the fifth column after removing\n"
487 << "DUP features is total unique inserted sequence. Note that unaligned\n"
488 << "sequence are not counted, and could represent additional \"unique\"\n"
489 << "sequences. See documentation for tips on how to interpret these\n"
490 << "alignment break features.\n"
491 << endl;
492
493 return;
494 }
495
496
497 //------------------------------------------------------------- PrintUsage ----
498 void PrintUsage (const char * s)
499 {
500 cerr
501 << "\nUSAGE: " << s << " [options] <deltafile>\n\n";
502 return;
503 }