Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-snps.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 //------------------------------------------------------------------------------ | |
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research | |
3 // File: show-snps.cc | |
4 // Date: 12 / 08 / 2004 | |
5 // | |
6 // Usage: show-snps [options] <deltafile> | |
7 // Try 'show-snps -h' for more information | |
8 // | |
9 // Description: For use in conjunction with the MUMmer package. "show-snps" | |
10 // displays human readable (and machine parse-able) single | |
11 // base-pair polymorphisms, including indels from the .delta output | |
12 // of the "nucmer" program. Outputs SNP positions and relative | |
13 // information to stdout. | |
14 // | |
15 //------------------------------------------------------------------------------ | |
16 | |
17 #include "delta.hh" | |
18 #include "tigrinc.hh" | |
19 #include "translate.hh" | |
20 #include "sw_alignscore.hh" | |
21 #include <vector> | |
22 #include <algorithm> | |
23 #include <string> | |
24 #include <sstream> | |
25 #include <cstring> | |
26 #include <map> | |
27 #include <set> | |
28 #include <cstdio> | |
29 using namespace std; | |
30 | |
31 | |
32 | |
33 | |
34 //=============================================================== Options ====// | |
35 string OPT_AlignName; // delta file name | |
36 string OPT_ReferenceName; // reference sequence file name | |
37 string OPT_QueryName; // query sequence file name | |
38 | |
39 bool OPT_SortReference = false; // -r option | |
40 bool OPT_SortQuery = false; // -q option | |
41 bool OPT_ShowLength = false; // -l option | |
42 bool OPT_ShowConflict = true; // -C option | |
43 bool OPT_ShowIndels = true; // -I option | |
44 bool OPT_PrintTabular = false; // -T option | |
45 bool OPT_PrintHeader = true; // -H option | |
46 bool OPT_SelectAligns = false; // -S option | |
47 | |
48 int OPT_Context = 0; // -x option | |
49 | |
50 set<string> OPT_Aligns; // -S option | |
51 | |
52 | |
53 | |
54 | |
55 //============================================================= Constants ====// | |
56 const char INDEL_CHAR = '.'; | |
57 const char SEQEND_CHAR = '-'; | |
58 | |
59 | |
60 | |
61 struct SNP_R_Sort | |
62 { | |
63 bool operator() (const SNP_t * a, const SNP_t * b) | |
64 { | |
65 int i = a->ep->refnode->id->compare (*(b->ep->refnode->id)); | |
66 | |
67 if ( i < 0 ) | |
68 return true; | |
69 else if ( i > 0 ) | |
70 return false; | |
71 else | |
72 { | |
73 if ( a -> pR < b -> pR ) | |
74 return true; | |
75 else if ( a -> pR > b -> pR ) | |
76 return false; | |
77 else | |
78 { | |
79 int j = a->ep->qrynode->id->compare (*(b->ep->qrynode->id)); | |
80 | |
81 if ( j < 0 ) | |
82 return true; | |
83 else if ( j > 0 ) | |
84 return false; | |
85 else | |
86 { | |
87 if ( a -> pQ < b -> pQ ) | |
88 return true; | |
89 else | |
90 return false; | |
91 } | |
92 } | |
93 } | |
94 } | |
95 }; | |
96 | |
97 | |
98 struct SNP_Q_Sort | |
99 { | |
100 bool operator() (const SNP_t * a, const SNP_t * b) | |
101 { | |
102 int i = a->ep->qrynode->id->compare (*(b->ep->qrynode->id)); | |
103 | |
104 if ( i < 0 ) | |
105 return true; | |
106 else if ( i > 0 ) | |
107 return false; | |
108 else | |
109 { | |
110 if ( a -> pQ < b -> pQ ) | |
111 return true; | |
112 else if ( a -> pQ > b -> pQ ) | |
113 return false; | |
114 else | |
115 { | |
116 int j = a->ep->refnode->id->compare (*(b->ep->refnode->id)); | |
117 | |
118 if ( j < 0 ) | |
119 return true; | |
120 else if ( j > 0 ) | |
121 return false; | |
122 else | |
123 { | |
124 if ( a -> pR < b -> pR ) | |
125 return true; | |
126 else | |
127 return false; | |
128 } | |
129 } | |
130 } | |
131 } | |
132 }; | |
133 | |
134 | |
135 | |
136 | |
137 //========================================================== Fuction Decs ====// | |
138 //------------------------------------------------------------------ RevC ----// | |
139 inline long RevC (long coord, long len) | |
140 { | |
141 return len - coord + 1; | |
142 } | |
143 | |
144 | |
145 //------------------------------------------------------------------ Norm ----// | |
146 inline long Norm (long c, long l, int f, AlignmentType_t d) | |
147 { | |
148 long retval = (d == PROMER_DATA ? c * 3 - (3 - abs(f)) : c); | |
149 if ( f < 0 ) retval = RevC (retval, l); | |
150 return retval; | |
151 } | |
152 | |
153 | |
154 //------------------------------------------------------------------ Swap ----// | |
155 inline void Swap (long & a, long & b) | |
156 { | |
157 long t = a; a = b; b = t; | |
158 } | |
159 | |
160 | |
161 //------------------------------------------------------------- CheckSNPs ----// | |
162 void CheckSNPs (DeltaGraph_t & graph); | |
163 | |
164 | |
165 //-------------------------------------------------------------- FindSNPs ----// | |
166 void FindSNPs (DeltaGraph_t & graph); | |
167 | |
168 | |
169 //------------------------------------------------------------ PrintHuman ----// | |
170 void PrintHuman (const vector<const SNP_t *> & snps, | |
171 const DeltaGraph_t & graph); | |
172 | |
173 | |
174 //---------------------------------------------------------- PrintTabular ----// | |
175 void PrintTabular (const vector<const SNP_t *> & snps, | |
176 const DeltaGraph_t & graph); | |
177 | |
178 | |
179 //---------------------------------------------------------- SelectAligns ----// | |
180 void SelectAligns ( ); | |
181 | |
182 | |
183 //------------------------------------------------------------- ParseArgs ----// | |
184 void ParseArgs (int argc, char ** argv); | |
185 | |
186 | |
187 //------------------------------------------------------------- PrintHelp ----// | |
188 void PrintHelp (const char * s); | |
189 | |
190 | |
191 //------------------------------------------------------------ PrintUsage ----// | |
192 void PrintUsage (const char * s); | |
193 | |
194 | |
195 | |
196 | |
197 //========================================================= Function Defs ====// | |
198 int main (int argc, char ** argv) | |
199 { | |
200 vector<const SNP_t *> snps; | |
201 DeltaGraph_t graph; | |
202 | |
203 | |
204 //-- Command line parsing | |
205 ParseArgs (argc, argv); | |
206 | |
207 //-- Select alignments | |
208 if ( OPT_SelectAligns ) | |
209 SelectAligns ( ); | |
210 | |
211 //-- Build the alignment graph from the delta file | |
212 graph . build (OPT_AlignName, true); | |
213 | |
214 //-- Read sequences | |
215 graph . loadSequences ( ); | |
216 | |
217 //-- Locate the SNPs | |
218 FindSNPs (graph); | |
219 | |
220 //-- Check for ambiguous alignment regions | |
221 CheckSNPs (graph); | |
222 | |
223 | |
224 //-- Collect and sort the SNPs | |
225 map<string, DeltaNode_t>::iterator mi; | |
226 vector<DeltaEdge_t *>::iterator ei; | |
227 vector<DeltaEdgelet_t *>::iterator li; | |
228 vector<SNP_t *>::iterator si; | |
229 for ( mi = graph.refnodes.begin( ); mi != graph.refnodes.end( ); ++ mi ) | |
230 for ( ei = mi->second.edges.begin( ); ei != mi->second.edges.end( ); ++ ei ) | |
231 for ( li = (*ei)->edgelets.begin( ); li != (*ei)->edgelets.end( ); ++ li ) | |
232 for ( si = (*li)->snps.begin( ); si != (*li)->snps.end( ); ++ si ) | |
233 if ( (OPT_ShowConflict || | |
234 ((*si)->conR == 0 && (*si)->conQ == 0)) | |
235 && | |
236 (OPT_ShowIndels || | |
237 ((*si)->cR != INDEL_CHAR && (*si)->cQ != INDEL_CHAR)) ) | |
238 snps . push_back (*si); | |
239 | |
240 if ( OPT_SortReference ) | |
241 sort (snps . begin( ), snps . end( ), SNP_R_Sort( )); | |
242 else | |
243 sort (snps . begin( ), snps . end( ), SNP_Q_Sort( )); | |
244 | |
245 | |
246 //-- Output data to stdout | |
247 if ( OPT_PrintTabular ) | |
248 PrintTabular (snps, graph); | |
249 else | |
250 PrintHuman (snps, graph); | |
251 | |
252 | |
253 return EXIT_SUCCESS; | |
254 } | |
255 | |
256 | |
257 | |
258 //------------------------------------------------------------- CheckSNPs ----// | |
259 void CheckSNPs (DeltaGraph_t & graph) | |
260 { | |
261 map<string, DeltaNode_t>::const_iterator mi; | |
262 vector<DeltaEdge_t *>::const_iterator ei; | |
263 vector<DeltaEdgelet_t *>::iterator eli; | |
264 vector<SNP_t *>::iterator si; | |
265 long i; | |
266 | |
267 //-- For each reference sequence | |
268 long ref_size = 0; | |
269 long ref_len = 0; | |
270 unsigned char * ref_cov = NULL; | |
271 for ( mi = graph.refnodes.begin( ); mi != graph.refnodes.end( ); ++ mi ) | |
272 { | |
273 //-- Reset the reference coverage array | |
274 ref_len = (mi -> second) . len; | |
275 if ( ref_len > ref_size ) | |
276 { | |
277 ref_cov = (unsigned char *) Safe_realloc (ref_cov, ref_len + 1); | |
278 ref_size = ref_len; | |
279 } | |
280 for ( i = 1; i <= ref_len; ++ i ) | |
281 ref_cov[i] = 0; | |
282 | |
283 //-- Add to the reference coverage | |
284 for ( ei = (mi -> second) . edges . begin( ); | |
285 ei != (mi -> second) . edges . end( ); ++ ei ) | |
286 for ( eli = (*ei) -> edgelets . begin( ); | |
287 eli != (*ei) -> edgelets . end( ); ++ eli ) | |
288 for ( i = (*eli) -> loR; i <= (*eli) -> hiR; i ++ ) | |
289 if ( ref_cov [i] < UCHAR_MAX ) | |
290 ref_cov [i] ++; | |
291 | |
292 //-- Set the SNP conflict counter | |
293 for ( ei = (mi -> second) . edges . begin( ); | |
294 ei != (mi -> second) . edges . end( ); ++ ei ) | |
295 for ( eli = (*ei) -> edgelets . begin( ); | |
296 eli != (*ei) -> edgelets . end( ); ++ eli ) | |
297 for ( si = (*eli)->snps.begin( ); si != (*eli)->snps.end( ); ++ si ) | |
298 (*si) -> conR = ref_cov [(*si)->pR] - 1; | |
299 } | |
300 free (ref_cov); | |
301 | |
302 | |
303 //-- For each query sequence | |
304 long qry_size = 0; | |
305 long qry_len = 0; | |
306 unsigned char * qry_cov = NULL; | |
307 for ( mi = graph.qrynodes.begin( ); mi != graph.qrynodes.end( ); ++ mi ) | |
308 { | |
309 //-- Reset the query coverage array | |
310 qry_len = (mi -> second) . len; | |
311 if ( qry_len > qry_size ) | |
312 { | |
313 qry_cov = (unsigned char *) Safe_realloc (qry_cov, qry_len + 1); | |
314 qry_size = qry_len; | |
315 } | |
316 for ( i = 1; i <= qry_len; ++ i ) | |
317 qry_cov[i] = 0; | |
318 | |
319 //-- Add to the query coverage | |
320 for ( ei = (mi -> second) . edges . begin( ); | |
321 ei != (mi -> second) . edges . end( ); ++ ei ) | |
322 for ( eli = (*ei) -> edgelets . begin( ); | |
323 eli != (*ei) -> edgelets . end( ); ++ eli ) | |
324 for ( i = (*eli) -> loQ; i <= (*eli) -> hiQ; i ++ ) | |
325 if ( qry_cov [i] < UCHAR_MAX ) | |
326 qry_cov [i] ++; | |
327 | |
328 //-- Set the SNP conflict counter | |
329 for ( ei = (mi -> second) . edges . begin( ); | |
330 ei != (mi -> second) . edges . end( ); ++ ei ) | |
331 for ( eli = (*ei) -> edgelets . begin( ); | |
332 eli != (*ei) -> edgelets . end( ); ++ eli ) | |
333 for ( si = (*eli)->snps.begin( ); si != (*eli)->snps.end( ); ++ si ) | |
334 (*si) -> conQ = qry_cov [(*si)->pQ] - 1; | |
335 } | |
336 free (qry_cov); | |
337 } | |
338 | |
339 | |
340 | |
341 | |
342 //-------------------------------------------------------------- FindSNPs ----// | |
343 void FindSNPs (DeltaGraph_t & graph) | |
344 { | |
345 map<string, DeltaNode_t>::iterator mi; | |
346 vector<DeltaEdge_t *>::iterator ei; | |
347 vector<DeltaEdgelet_t *>::iterator li; | |
348 vector<SNP_t *>::iterator si, psi, nsi; | |
349 | |
350 //-- For each alignment, identify the SNPs | |
351 for ( mi = graph.refnodes.begin( ); mi != graph.refnodes.end( ); ++ mi ) | |
352 for ( ei = mi->second.edges.begin( ); ei != mi->second.edges.end( ); ++ ei ) | |
353 { | |
354 SNP_t * snp; | |
355 int ri, qi; | |
356 char * R[] = {(*ei)->refnode->seq, NULL, NULL, NULL, NULL, NULL, NULL}; | |
357 char * Q[] = {(*ei)->qrynode->seq, NULL, NULL, NULL, NULL, NULL, NULL}; | |
358 | |
359 long i; | |
360 long lenR = (*ei) -> refnode -> len; | |
361 long lenQ = (*ei) -> qrynode -> len; | |
362 | |
363 for (li = (*ei)->edgelets.begin( ); li != (*ei)->edgelets.end( ); ++ li) | |
364 { | |
365 long delta; | |
366 int frameR, frameQ, sign; | |
367 long sR, eR, sQ, eQ; | |
368 long rpos, qpos, remain; | |
369 long rctx, qctx; | |
370 long alenR = lenR; | |
371 long alenQ = lenQ; | |
372 | |
373 //-- Only do the ones requested by user | |
374 if ( OPT_SelectAligns ) | |
375 { | |
376 ostringstream ss; | |
377 set<string>::iterator si; | |
378 | |
379 if ( (*li) -> dirR == FORWARD_DIR ) | |
380 ss << (*li) -> loR << ' ' << (*li) -> hiR << ' '; | |
381 else | |
382 ss << (*li) -> hiR << ' ' << (*li) -> loR << ' '; | |
383 | |
384 if ( (*li) -> dirQ == FORWARD_DIR ) | |
385 ss << (*li) -> loQ << ' ' << (*li) -> hiQ << ' '; | |
386 else | |
387 ss << (*li) -> hiQ << ' ' << (*li) -> loQ << ' '; | |
388 | |
389 ss << *((*ei)->refnode->id) << ' ' << *((*ei)->qrynode->id); | |
390 | |
391 si = OPT_Aligns . find (ss .str( )); | |
392 if ( si == OPT_Aligns . end( ) ) | |
393 continue; | |
394 else | |
395 OPT_Aligns . erase (si); | |
396 } | |
397 | |
398 //-- Point the coords the right direction | |
399 frameR = 1; | |
400 if ( (*li) -> dirR == REVERSE_DIR ) | |
401 { | |
402 sR = RevC ((*li) -> hiR, lenR); | |
403 eR = RevC ((*li) -> loR, lenR); | |
404 frameR += 3; | |
405 } | |
406 else | |
407 { | |
408 sR = (*li) -> loR; | |
409 eR = (*li) -> hiR; | |
410 } | |
411 | |
412 frameQ = 1; | |
413 if ( (*li) -> dirQ == REVERSE_DIR ) | |
414 { | |
415 sQ = RevC ((*li) -> hiQ, lenQ); | |
416 eQ = RevC ((*li) -> loQ, lenQ); | |
417 frameQ += 3; | |
418 } | |
419 else | |
420 { | |
421 sQ = (*li) -> loQ; | |
422 eQ = (*li) -> hiQ; | |
423 } | |
424 | |
425 //-- Translate coords to AA if necessary | |
426 if ( graph . datatype == PROMER_DATA ) | |
427 { | |
428 alenR /= 3; | |
429 alenQ /= 3; | |
430 | |
431 frameR += (sR + 2) % 3; | |
432 frameQ += (sQ + 2) % 3; | |
433 | |
434 // remeber that eR and eQ point to the last base in the codon | |
435 sR = (sR + 2) / 3; | |
436 eR = eR / 3; | |
437 sQ = (sQ + 2) / 3; | |
438 eQ = eQ / 3; | |
439 } | |
440 | |
441 ri = frameR; | |
442 qi = frameQ; | |
443 | |
444 if ( frameR > 3 ) | |
445 frameR = -(frameR - 3); | |
446 if ( frameQ > 3 ) | |
447 frameQ = -(frameQ - 3); | |
448 | |
449 //-- Generate the sequences if needed | |
450 if ( R [ri] == NULL ) | |
451 { | |
452 if ( graph . datatype == PROMER_DATA ) | |
453 { | |
454 R [ri] = (char *) Safe_malloc (alenR + 2); | |
455 R [ri][0] = '\0'; | |
456 Translate_DNA (R [0], R [ri], ri); | |
457 } | |
458 else | |
459 { | |
460 R [ri] = (char *) Safe_malloc (alenR + 2); | |
461 R [ri][0] = '\0'; | |
462 strcpy (R [ri] + 1, R [0] + 1); | |
463 if ( (*li) -> dirR == REVERSE_DIR ) | |
464 Reverse_Complement (R [ri], 1, lenR); | |
465 } | |
466 } | |
467 if ( Q [qi] == NULL ) | |
468 { | |
469 if ( graph . datatype == PROMER_DATA ) | |
470 { | |
471 Q [qi] = (char *) Safe_malloc (alenQ + 2); | |
472 Q [qi][0] = '\0'; | |
473 Translate_DNA (Q [0], Q [qi], qi); | |
474 } | |
475 else | |
476 { | |
477 Q [qi] = (char *) Safe_malloc (alenQ + 2); | |
478 Q [qi][0] = '\0'; | |
479 strcpy (Q [qi] + 1, Q [0] + 1); | |
480 if ( (*li) -> dirQ == REVERSE_DIR ) | |
481 Reverse_Complement (Q [qi], 1, lenQ); | |
482 } | |
483 } | |
484 | |
485 //-- Locate the SNPs | |
486 rpos = sR; | |
487 qpos = sQ; | |
488 remain = eR - sR + 1; | |
489 | |
490 (*li) -> frmR = frameR; | |
491 (*li) -> frmQ = frameQ; | |
492 | |
493 istringstream ss; | |
494 ss . str ((*li)->delta); | |
495 | |
496 while ( ss >> delta && delta != 0 ) | |
497 { | |
498 sign = delta > 0 ? 1 : -1; | |
499 delta = labs (delta); | |
500 | |
501 //-- For all SNPs before the next indel | |
502 for ( i = 1; i < delta; i ++ ) | |
503 if ( R [ri] [rpos ++] != Q [qi] [qpos ++] ) | |
504 { | |
505 if ( graph.datatype == NUCMER_DATA && | |
506 CompareIUPAC (R [ri][rpos-1], Q [qi][qpos-1]) ) | |
507 continue; | |
508 | |
509 snp = new SNP_t; | |
510 snp -> ep = *ei; | |
511 snp -> lp = *li; | |
512 snp -> pR = Norm (rpos-1, lenR, frameR, graph.datatype); | |
513 snp -> pQ = Norm (qpos-1, lenQ, frameQ, graph.datatype); | |
514 snp -> cR = toupper (R [ri] [rpos-1]); | |
515 snp -> cQ = toupper (Q [qi] [qpos-1]); | |
516 | |
517 for ( rctx = rpos - OPT_Context - 1; | |
518 rctx < rpos + OPT_Context; rctx ++ ) | |
519 if ( rctx < 1 || rctx > alenR ) | |
520 snp -> ctxR . push_back (SEQEND_CHAR); | |
521 else if ( rctx == rpos - 1 ) | |
522 snp -> ctxR . push_back (snp -> cR); | |
523 else | |
524 snp -> ctxR . push_back (toupper (R [ri] [rctx])); | |
525 | |
526 for ( qctx = qpos - OPT_Context - 1; | |
527 qctx < qpos + OPT_Context; qctx ++ ) | |
528 if ( qctx < 1 || qctx > alenQ ) | |
529 snp -> ctxQ . push_back (SEQEND_CHAR); | |
530 else if ( qctx == qpos - 1 ) | |
531 snp -> ctxQ . push_back (snp -> cQ); | |
532 else | |
533 snp -> ctxQ . push_back (toupper (Q [qi] [qctx])); | |
534 | |
535 (*li) -> snps . push_back (snp); | |
536 } | |
537 | |
538 //-- For the indel | |
539 snp = new SNP_t; | |
540 snp -> ep = *ei; | |
541 snp -> lp = *li; | |
542 | |
543 for ( rctx = rpos - OPT_Context; rctx < rpos; rctx ++ ) | |
544 if ( rctx < 1 ) | |
545 snp -> ctxR . push_back (SEQEND_CHAR); | |
546 else | |
547 snp -> ctxR . push_back (toupper (R [ri] [rctx])); | |
548 | |
549 for ( qctx = qpos - OPT_Context; qctx < qpos; qctx ++ ) | |
550 if ( qctx < 1 ) | |
551 snp -> ctxQ . push_back (SEQEND_CHAR); | |
552 else | |
553 snp -> ctxQ . push_back (toupper (Q [qi] [qctx])); | |
554 | |
555 if ( sign > 0 ) | |
556 { | |
557 snp -> pR = Norm (rpos, lenR, frameR, graph.datatype); | |
558 if ( frameQ > 0 ) | |
559 snp -> pQ = Norm (qpos - 1, lenQ, frameQ, graph.datatype); | |
560 else | |
561 snp -> pQ = Norm (qpos, lenQ, frameQ, graph.datatype); | |
562 | |
563 snp -> cR = toupper (R [ri] [rpos ++]); | |
564 snp -> cQ = INDEL_CHAR; | |
565 | |
566 remain -= i; | |
567 rctx ++; | |
568 } | |
569 else | |
570 { | |
571 snp -> pQ = Norm (qpos, lenQ, frameQ, graph.datatype); | |
572 if ( frameR > 0 ) | |
573 snp -> pR = Norm (rpos - 1, lenR, frameR, graph.datatype); | |
574 else | |
575 snp -> pR = Norm (rpos, lenR, frameR, graph.datatype); | |
576 | |
577 snp -> cR = INDEL_CHAR; | |
578 snp -> cQ = toupper (Q [qi] [qpos ++]); | |
579 | |
580 remain -= i - 1; | |
581 qctx ++; | |
582 } | |
583 | |
584 snp -> ctxR . push_back (snp -> cR); | |
585 for ( ; rctx < rpos + OPT_Context; rctx ++ ) | |
586 if ( rctx > alenR ) | |
587 snp -> ctxR . push_back (SEQEND_CHAR); | |
588 else | |
589 snp -> ctxR . push_back (toupper (R [ri] [rctx])); | |
590 | |
591 snp -> ctxQ . push_back (snp -> cQ); | |
592 for ( ; qctx < qpos + OPT_Context; qctx ++ ) | |
593 if ( qctx > alenQ ) | |
594 snp -> ctxQ . push_back (SEQEND_CHAR); | |
595 else | |
596 snp -> ctxQ . push_back (toupper (Q [qi] [qctx])); | |
597 | |
598 (*li) -> snps . push_back (snp); | |
599 } | |
600 | |
601 //-- For all SNPs after the final indel | |
602 for ( i = 0; i < remain; i ++ ) | |
603 if ( R [ri] [rpos ++] != Q [qi] [qpos ++] ) | |
604 { | |
605 if ( graph.datatype == NUCMER_DATA && | |
606 CompareIUPAC (R [ri][rpos-1], Q [qi][qpos-1]) ) | |
607 continue; | |
608 | |
609 snp = new SNP_t; | |
610 snp -> ep = *ei; | |
611 snp -> lp = *li; | |
612 snp -> pR = Norm (rpos-1, lenR, frameR, graph.datatype); | |
613 snp -> pQ = Norm (qpos-1, lenQ, frameQ, graph.datatype); | |
614 snp -> cR = toupper (R [ri] [rpos-1]); | |
615 snp -> cQ = toupper (Q [qi] [qpos-1]); | |
616 | |
617 for ( rctx = rpos - OPT_Context - 1; | |
618 rctx < rpos + OPT_Context; rctx ++ ) | |
619 if ( rctx < 1 || rctx > alenR ) | |
620 snp -> ctxR . push_back (SEQEND_CHAR); | |
621 else if ( rctx == rpos - 1 ) | |
622 snp -> ctxR . push_back (snp -> cR); | |
623 else | |
624 snp -> ctxR . push_back (toupper (R [ri] [rctx])); | |
625 | |
626 for ( qctx = qpos - OPT_Context - 1; | |
627 qctx < qpos + OPT_Context; qctx ++ ) | |
628 if ( qctx < 1 || qctx > alenQ ) | |
629 snp -> ctxQ . push_back (SEQEND_CHAR); | |
630 else if ( qctx == qpos - 1 ) | |
631 snp -> ctxQ . push_back (snp -> cQ); | |
632 else | |
633 snp -> ctxQ . push_back (toupper (Q [qi] [qctx])); | |
634 | |
635 (*li) -> snps . push_back (snp); | |
636 } | |
637 | |
638 | |
639 //-- Sort SNPs and calculate distances | |
640 if ( OPT_SortReference ) | |
641 { | |
642 sort ((*li)->snps.begin( ), (*li)->snps.end( ), SNP_R_Sort( )); | |
643 | |
644 for ( si = (*li)->snps.begin(); si != (*li)->snps.end(); ++ si ) | |
645 { | |
646 psi = si - 1; | |
647 nsi = si + 1; | |
648 | |
649 (*si) -> buff = 1 + | |
650 ((*si)->pR - (*li)->loR < (*li)->hiR - (*si)->pR ? | |
651 (*si)->pR - (*li)->loR : (*li)->hiR - (*si)->pR); | |
652 | |
653 if ( psi >= (*li) -> snps . begin( ) && | |
654 (*si)->pR - (*psi)->pR < (*si)->buff ) | |
655 (*si) -> buff = (*si)->pR - (*psi)->pR; | |
656 | |
657 if ( nsi < (*li) -> snps . end( ) && | |
658 (*nsi)->pR - (*si)->pR < (*si)->buff ) | |
659 (*si) -> buff = (*nsi)->pR - (*si)->pR; | |
660 } | |
661 } | |
662 else | |
663 { | |
664 sort ((*li)->snps.begin( ), (*li)->snps.end( ), SNP_Q_Sort( )); | |
665 | |
666 for ( si = (*li)->snps.begin(); si != (*li)->snps.end(); ++ si ) | |
667 { | |
668 psi = si - 1; | |
669 nsi = si + 1; | |
670 | |
671 (*si) -> buff = 1 + | |
672 ((*si)->pQ - (*li)->loQ < (*li)->hiQ - (*si)->pQ ? | |
673 (*si)->pQ - (*li)->loQ : (*li)->hiQ - (*si)->pQ); | |
674 | |
675 if ( psi >= (*li) -> snps . begin( ) && | |
676 (*si)->pQ - (*psi)->pQ < (*si)->buff ) | |
677 (*si) -> buff = (*si)->pQ - (*psi)->pQ; | |
678 | |
679 if ( nsi < (*li) -> snps . end( ) && | |
680 (*nsi)->pQ - (*si)->pQ < (*si)->buff ) | |
681 (*si) -> buff = (*nsi)->pQ - (*si)->pQ; | |
682 } | |
683 } | |
684 } | |
685 | |
686 //-- Clear up the seq | |
687 for ( i = 1; i <= 6; i ++ ) | |
688 { | |
689 free (R[i]); | |
690 free (Q[i]); | |
691 } | |
692 } | |
693 | |
694 if ( OPT_SelectAligns && ! OPT_Aligns . empty( ) ) | |
695 { | |
696 cerr << "ERROR: One or more alignments from stdin could not be found\n"; | |
697 exit (EXIT_FAILURE); | |
698 } | |
699 } | |
700 | |
701 | |
702 | |
703 | |
704 //------------------------------------------------------------ PrintHuman ----// | |
705 void PrintHuman (const vector<const SNP_t *> & snps, | |
706 const DeltaGraph_t & graph) | |
707 { | |
708 vector<const SNP_t *>::const_iterator si; | |
709 long dist, distR, distQ; | |
710 int ctxw = 2 * OPT_Context + 1; | |
711 int ctxc = ctxw < 7 ? 7 : ctxw; | |
712 | |
713 if ( OPT_PrintHeader ) | |
714 { | |
715 printf ("%s %s\n%s\n\n", | |
716 graph . refpath . c_str( ), graph . qrypath . c_str( ), | |
717 graph . datatype == NUCMER_DATA ? "NUCMER" : "PROMER"); | |
718 printf ("%8s %5s %-8s | ", "[P1]", "[SUB]", "[P2]"); | |
719 printf ("%8s %8s | ", "[BUFF]", "[DIST]"); | |
720 if ( OPT_ShowConflict ) | |
721 printf ("%4s %4s | ", "[R]", "[Q]"); | |
722 if ( OPT_ShowLength ) | |
723 printf ("%8s %8s | ", "[LEN R]", "[LEN Q]"); | |
724 if ( OPT_Context ) | |
725 { | |
726 for ( int i = 0; i < ctxc - 7; i ++ ) | |
727 putchar (' '); | |
728 printf (" [CTX R] "); | |
729 for ( int i = 0; i < ctxc - 7; i ++ ) | |
730 putchar (' '); | |
731 printf ("[CTX Q] | "); | |
732 } | |
733 printf ("%5s ", "[FRM]"); | |
734 printf ("%s", "[TAGS]"); | |
735 printf ("\n"); | |
736 | |
737 if ( OPT_ShowConflict ) | |
738 printf ("============="); | |
739 if ( OPT_ShowLength ) | |
740 printf ("====================="); | |
741 if ( OPT_Context ) | |
742 for ( int i = 0; i < 2 * ctxc + 7; i ++ ) | |
743 putchar ('='); | |
744 printf("=================================" | |
745 "==================================\n"); | |
746 } | |
747 | |
748 for ( si = snps . begin( ); si != snps . end( ); ++ si ) | |
749 { | |
750 distR = (*si)->pR < (signed long)(*si)->ep->refnode->len - (*si)->pR + 1 ? | |
751 (*si)->pR : (*si)->ep->refnode->len - (*si)->pR + 1; | |
752 distQ = (*si)->pQ < (signed long)(*si)->ep->qrynode->len - (*si)->pQ + 1 ? | |
753 (*si)->pQ : (*si)->ep->qrynode->len - (*si)->pQ + 1; | |
754 dist = distR < distQ ? distR : distQ; | |
755 | |
756 printf ("%8ld %c %c %-8ld | ", | |
757 (*si)->pR, (*si)->cR, (*si)->cQ, (*si)->pQ); | |
758 printf ("%8ld %8ld | ", (*si)->buff, dist); | |
759 if ( OPT_ShowConflict ) | |
760 printf ("%4d %4d | ", (*si)->conR, (*si)->conQ); | |
761 if ( OPT_ShowLength ) | |
762 printf ("%8ld %8ld | ", | |
763 (*si)->ep->refnode->len, (*si)->ep->qrynode->len); | |
764 if ( OPT_Context ) | |
765 { | |
766 for ( int i = 0; i < ctxc - ctxw; i ++ ) | |
767 putchar (' '); | |
768 printf (" %s ", (*si)->ctxR.c_str( )); | |
769 for ( int i = 0; i < ctxc - ctxw; i ++ ) | |
770 putchar (' '); | |
771 printf ("%s | ", (*si)->ctxQ.c_str( )); | |
772 } | |
773 printf ("%2d %2d ", (*si)->lp->frmR, (*si)->lp->frmQ); | |
774 printf ("%s\t%s", | |
775 (*si)->ep->refnode->id->c_str( ), | |
776 (*si)->ep->qrynode->id->c_str( )); | |
777 printf ("\n"); | |
778 } | |
779 } | |
780 | |
781 | |
782 | |
783 | |
784 //---------------------------------------------------------- PrintTabular ----// | |
785 void PrintTabular (const vector<const SNP_t *> & snps, | |
786 const DeltaGraph_t & graph) | |
787 { | |
788 vector<const SNP_t *>::const_iterator si; | |
789 long dist, distR, distQ; | |
790 | |
791 if ( OPT_PrintHeader ) | |
792 { | |
793 printf ("%s %s\n%s\n\n", | |
794 graph . refpath . c_str( ), graph . qrypath . c_str( ), | |
795 graph . datatype == NUCMER_DATA ? "NUCMER" : "PROMER"); | |
796 printf ("%s\t%s\t%s\t%s\t", "[P1]", "[SUB]", "[SUB]", "[P2]"); | |
797 printf ("%s\t%s\t", "[BUFF]", "[DIST]"); | |
798 if ( OPT_ShowConflict ) | |
799 printf ("%s\t%s\t", "[R]", "[Q]"); | |
800 if ( OPT_ShowLength ) | |
801 printf ("%s\t%s\t", "[LEN R]", "[LEN Q]"); | |
802 if ( OPT_Context ) | |
803 printf ("%s\t%s\t", "[CTX R]", "[CTX Q]"); | |
804 printf ("%s\t", "[FRM]"); | |
805 printf ("%s\n", "[TAGS]"); | |
806 } | |
807 | |
808 for ( si = snps . begin( ); si != snps . end( ); ++ si ) | |
809 { | |
810 distR = (*si)->pR < (signed long)(*si)->ep->refnode->len - (*si)->pR + 1 ? | |
811 (*si)->pR : (*si)->ep->refnode->len - (*si)->pR + 1; | |
812 distQ = (*si)->pQ < (signed long)(*si)->ep->qrynode->len - (*si)->pQ + 1 ? | |
813 (*si)->pQ : (*si)->ep->qrynode->len - (*si)->pQ + 1; | |
814 dist = distR < distQ ? distR : distQ; | |
815 | |
816 printf ("%ld\t%c\t%c\t%ld\t", | |
817 (*si)->pR, (*si)->cR, (*si)->cQ, (*si)->pQ); | |
818 printf ("%ld\t%ld\t", (*si)->buff, dist); | |
819 if ( OPT_ShowConflict ) | |
820 printf ("%d\t%d\t", (*si)->conR, (*si)->conQ); | |
821 if ( OPT_ShowLength ) | |
822 printf ("%ld\t%ld\t", | |
823 (*si)->ep->refnode->len, (*si)->ep->qrynode->len); | |
824 if ( OPT_Context ) | |
825 printf ("%s\t%s\t", (*si)->ctxR.c_str( ), (*si)->ctxQ.c_str( )); | |
826 printf ("%d\t%d\t", (*si)->lp->frmR, (*si)->lp->frmQ); | |
827 printf ("%s\t%s", | |
828 (*si)->ep->refnode->id->c_str( ), | |
829 (*si)->ep->qrynode->id->c_str( )); | |
830 printf ("\n"); | |
831 } | |
832 } | |
833 | |
834 | |
835 | |
836 | |
837 //---------------------------------------------------------- SelectAligns ----// | |
838 void SelectAligns ( ) | |
839 { | |
840 string line, part; | |
841 string s1, e1, s2, e2, tR, tQ; | |
842 istringstream sin; | |
843 ostringstream sout; | |
844 | |
845 while ( cin ) | |
846 { | |
847 getline (cin, line); | |
848 if ( line . empty( ) ) | |
849 continue; | |
850 | |
851 sin . clear( ); | |
852 sin . str (line); | |
853 sin >> s1 >> e1 >> s2; | |
854 if ( s2 == "|" ) sin >> s2; | |
855 sin >> e2 >> tR >> tQ; | |
856 | |
857 if ( sin . fail( ) ) | |
858 { | |
859 cerr << "ERROR: Could not parse input from stdin\n"; | |
860 exit (EXIT_FAILURE); | |
861 } | |
862 | |
863 while ( sin >> part ) | |
864 { | |
865 tR = tQ; | |
866 tQ = part; | |
867 } | |
868 | |
869 sout . clear( ); | |
870 sout . str (""); | |
871 sout << s1 << ' ' << e1 << ' ' | |
872 << s2 << ' ' << e2 << ' ' | |
873 << tR << ' ' << tQ; | |
874 | |
875 OPT_Aligns . insert (sout . str( )); | |
876 } | |
877 } | |
878 | |
879 | |
880 | |
881 | |
882 //------------------------------------------------------------- ParseArgs ----// | |
883 void ParseArgs (int argc, char ** argv) | |
884 { | |
885 int ch, errflg = 0; | |
886 optarg = NULL; | |
887 | |
888 while ( !errflg && | |
889 ((ch = getopt (argc, argv, "ChHIlqrSTx:")) != EOF) ) | |
890 switch (ch) | |
891 { | |
892 case 'C': | |
893 OPT_ShowConflict = false; | |
894 break; | |
895 | |
896 case 'h': | |
897 PrintHelp (argv[0]); | |
898 exit (EXIT_SUCCESS); | |
899 break; | |
900 | |
901 case 'H': | |
902 OPT_PrintHeader = false; | |
903 break; | |
904 | |
905 case 'I': | |
906 OPT_ShowIndels = false; | |
907 break; | |
908 | |
909 case 'l': | |
910 OPT_ShowLength = true; | |
911 break; | |
912 | |
913 case 'q': | |
914 OPT_SortQuery = true; | |
915 break; | |
916 | |
917 case 'r': | |
918 OPT_SortReference = true; | |
919 break; | |
920 | |
921 case 'S': | |
922 OPT_SelectAligns = true; | |
923 break; | |
924 | |
925 case 'T': | |
926 OPT_PrintTabular = true; | |
927 break; | |
928 | |
929 case 'x': | |
930 OPT_Context = atoi (optarg); | |
931 break; | |
932 | |
933 default: | |
934 errflg ++; | |
935 } | |
936 | |
937 if ( OPT_Context < 0 ) | |
938 { | |
939 cerr << "ERROR: SNP context must be a positive int\n"; | |
940 errflg ++; | |
941 } | |
942 | |
943 if ( OPT_SortReference && OPT_SortQuery ) | |
944 cerr << "WARNING: both -r and -q were passed, -q ignored\n"; | |
945 | |
946 if ( !OPT_SortReference && !OPT_SortQuery ) | |
947 OPT_SortReference = true; | |
948 | |
949 if ( errflg > 0 || optind != argc - 1 ) | |
950 { | |
951 PrintUsage (argv[0]); | |
952 cerr << "Try '" << argv[0] << " -h' for more information.\n"; | |
953 exit (EXIT_FAILURE); | |
954 } | |
955 | |
956 OPT_AlignName = argv [optind ++]; | |
957 } | |
958 | |
959 | |
960 | |
961 | |
962 //------------------------------------------------------------- PrintHelp ----// | |
963 void PrintHelp (const char * s) | |
964 { | |
965 PrintUsage (s); | |
966 cerr | |
967 << "-C Do not report SNPs from alignments with an ambiguous\n" | |
968 << " mapping, i.e. only report SNPs where the [R] and [Q]\n" | |
969 << " columns equal 0 and do not output these columns\n" | |
970 << "-h Display help information\n" | |
971 << "-H Do not print the output header\n" | |
972 << "-I Do not report indels\n" | |
973 << "-l Include sequence length information in the output\n" | |
974 << "-q Sort output lines by query IDs and SNP positions\n" | |
975 << "-r Sort output lines by reference IDs and SNP positions\n" | |
976 << "-S Specify which alignments to report by passing\n" | |
977 << " 'show-coords' lines to stdin\n" | |
978 << "-T Switch to tab-delimited format\n" | |
979 << "-x int Include x characters of surrounding SNP context in the\n" | |
980 << " output, default " | |
981 << OPT_Context << endl | |
982 << endl; | |
983 | |
984 cerr | |
985 << " Input is the .delta output of either the nucmer or promer program\n" | |
986 << "passed on the command line.\n" | |
987 << " Output is to stdout, and consists of a list of SNPs (or amino acid\n" | |
988 << "substitutions for promer) with positions and other useful info.\n" | |
989 << "Output will be sorted with -r by default and the [BUFF] column will\n" | |
990 << "always refer to the sequence whose positions have been sorted. This\n" | |
991 << "value specifies the distance from this SNP to the nearest mismatch\n" | |
992 << "(end of alignment, indel, SNP, etc) in the same alignment, while the\n" | |
993 << "[DIST] column specifies the distance from this SNP to the nearest\n" | |
994 << "sequence end. SNPs for which the [R] and [Q] columns are greater than\n" | |
995 << "0 should be evaluated with caution, as these columns specify the\n" | |
996 << "number of other alignments which overlap this position. Use -C to\n" | |
997 << "assure SNPs are only reported from unique alignment regions.\n" | |
998 << endl; | |
999 | |
1000 return; | |
1001 } | |
1002 | |
1003 | |
1004 | |
1005 | |
1006 //------------------------------------------------------------ PrintUsage ----// | |
1007 void PrintUsage (const char * s) | |
1008 { | |
1009 cerr | |
1010 << "\nUSAGE: " << s << " [options] <deltafile>\n\n"; | |
1011 return; | |
1012 } |