comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/delta.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 ////////////////////////////////////////////////////////////////////////////////
2 //! \file
3 //! \author Adam M Phillippy
4 //! \date 03/26/2003
5 //!
6 //! \brief Source for non-inline member functions of delta.hh
7 //!
8 //! \see delta.hh
9 ////////////////////////////////////////////////////////////////////////////////
10
11 #include "delta.hh"
12 #include <map>
13 #include <vector>
14 #include <cmath>
15 #include <sstream>
16 #include <algorithm>
17 using namespace std;
18
19
20 inline long ScoreLocal
21 (long scorej,
22 long leni, long lenj,
23 long olap, float idyi, float maxolap);
24
25
26 struct LIS_t
27 //!< LIS score
28 {
29 DeltaEdgelet_t * a;
30 long score;
31 long diff;
32 long from;
33 bool used;
34 };
35
36
37 struct EdgeletQCmp_t
38 //!< Compares query lo coord
39 {
40 bool operator() (const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
41 {
42 //-- Sorting by score in the event of a tie ensures that when building
43 // LIS chains, the highest scoring ones get seen first, thus avoiding
44 // overlap problems
45
46 if ( i->loQ < j->loQ )
47 return true;
48 else if ( i->loQ > j->loQ )
49 return false;
50 else if ( ScoreLocal (0, i->hiQ - i->loQ + 1, 0, 0, i->idy, 0) >
51 ScoreLocal (0, j->hiQ - j->loQ + 1, 0, 0, j->idy, 0) )
52 return true;
53 else
54 return false;
55 }
56 };
57
58
59 struct EdgeletRCmp_t
60 //!< Compares reference lo coord
61 {
62 bool operator() (const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
63 {
64 //-- Sorting by score in the event of a tie ensures that when building
65 // LIS chains, the highest scoring ones get seen first, thus avoiding
66 // overlap problems
67
68 if ( i->loR < j->loR )
69 return true;
70 else if ( i->loR > j->loR )
71 return false;
72 else if ( ScoreLocal (0, i->hiR - i->loR + 1, 0, 0, i->idy, 0) >
73 ScoreLocal (0, j->hiR - j->loR + 1, 0, 0, j->idy, 0) )
74 return true;
75 else
76 return false;
77 }
78 };
79
80
81 struct NULLPred_t
82 //!< Return true if pointer is not NULL
83 {
84 bool operator() (const void * i) const
85 { return (i != NULL); }
86 };
87
88
89 //------------------------------------------------------------ DiffAligns ------
90 inline long DiffAligns
91 (const DeltaEdgelet_t * i, const DeltaEdgelet_t * j)
92 {
93 return
94 ( ( j->loR < i->loR ) ? labs (j->hiR - i->loR) : labs (i->hiR - j->loR) ) +
95 ( ( j->loQ < i->loQ ) ? labs (j->hiQ - i->loQ) : labs (i->hiQ - j->loQ) );
96 }
97
98
99 //------------------------------------------------------------ PickBest --------
100 long PickBest
101 (const LIS_t * lis, const vector<long> & allbest, float epsilon)
102 {
103 long size = allbest.size();
104 if ( epsilon < 0 && size != 0 )
105 {
106 long eqc = 0;
107 for ( ; eqc < size; ++ eqc )
108 if ( lis[allbest[eqc]].diff != lis[allbest.front()].diff )
109 break;
110
111 return (int)((double)eqc*rand() / (RAND_MAX + 1.0));
112 }
113 return size;
114 }
115
116
117 //------------------------------------------------------------------ RevC ------
118 inline long RevC (const long & coord,
119 const long & len)
120 {
121 return len - coord + 1;
122 }
123
124
125 //------------------------------------------------------------ ScoreLocal ------
126 inline long ScoreLocal
127 (long scorej, long leni, long lenj,
128 long olap, float idyi, float maxolap)
129 {
130 if ( olap > 0 &&
131 ((float)olap / (float)leni * 100.0 > maxolap ||
132 (float)olap / (float)lenj * 100.0 > maxolap) )
133 return -1;
134 else
135 return (scorej + (long)((leni - olap) * pow (idyi, 2)));
136 }
137
138
139 //----------------------------------------------------------- ScoreGlobal ------
140 inline long ScoreGlobal
141 (long scorej, long leni, long olap, float idyi)
142 {
143 return (scorej + (long)((leni - olap) * pow (idyi, 2)));
144 }
145
146
147 //------------------------------------------------------------------ Swap ------
148 inline void Swap (long & a, long & b)
149 {
150 long t = a; a = b; b = t;
151 }
152
153
154 //------------------------------------------------------------ UpdateBest ------
155 bool UpdateBest
156 (LIS_t * lis, long size, vector<long> & allbest, float epsilon)
157 {
158 if ( size == 0 )
159 return false;
160
161 long best, i;
162
163 //-- Find the best
164 for ( best = 0; best < size; ++ best )
165 if ( ! lis[best].used )
166 break;
167 for ( i = best + 1; i < size; ++ i )
168 if ( ! lis[i].used
169 &&
170 ( lis[i].score > lis[best].score
171 ||
172 (lis[i].score == lis[best].score &&
173 lis[i].diff < lis[best].diff) ) )
174 best = i;
175
176 //-- Nonequivalent
177 if ( ! allbest.empty()
178 &&
179 (best == size
180 ||
181 (epsilon < 0 &&
182 lis[best].score < lis[allbest.front()].score)
183 ||
184 (epsilon >= 0 &&
185 (float)(lis[allbest.front()].score - lis[best].score) /
186 (float)(lis[allbest.front()].score) * 100.0 > epsilon)) )
187 return false;
188
189 //-- Equivalent
190 allbest.push_back (best);
191
192 for ( i = best; i >= 0 && i < size; i = lis[i].from )
193 lis[i].used = true;
194
195 return true;
196 }
197
198
199 //===================================================== DeltaReader_t ==========
200 //----------------------------------------------------- open -------------------
201 void DeltaReader_t::open
202 (const string & delta_path)
203 {
204 delta_path_m = delta_path;
205
206 //-- Open the delta file
207 delta_stream_m.open (delta_path_m.c_str ());
208 checkStream ();
209
210 //-- Read the file header
211 delta_stream_m >> reference_path_m;
212 delta_stream_m >> query_path_m;
213 delta_stream_m >> data_type_m;
214 if ( (data_type_m != NUCMER_STRING && data_type_m != PROMER_STRING) )
215 delta_stream_m.setstate (ios::failbit);
216 checkStream ();
217 is_open_m = true;
218
219 //-- Advance to first record header
220 while ( delta_stream_m.peek () != '>' )
221 if ( delta_stream_m.get () == EOF )
222 break;
223 }
224
225
226 //----------------------------------------------------- readNextAlignment ------
227 void DeltaReader_t::readNextAlignment
228 (DeltaAlignment_t & align, const bool read_deltas)
229 {
230 long delta;
231 float total;
232
233 //-- Make way for the new alignment
234 align.clear ();
235
236 //-- Read the alignment header
237 delta_stream_m >> align.sR;
238 delta_stream_m >> align.eR;
239 delta_stream_m >> align.sQ;
240 delta_stream_m >> align.eQ;
241 delta_stream_m >> align.idyc;
242 delta_stream_m >> align.simc;
243 delta_stream_m >> align.stpc;
244 if ( align.sR <= 0 || align.eR <= 0 ||
245 align.sQ <= 0 || align.eQ <= 0 ||
246 align.idyc < 0 || align.simc < 0 || align.stpc < 0 )
247 delta_stream_m.setstate (ios::failbit);
248 checkStream ();
249
250 total = labs(align.eR - align.sR) + 1.0;
251 if ( data_type_m == PROMER_STRING )
252 total /= 3.0;
253
254 //-- Get all the deltas
255 do
256 {
257 delta_stream_m >> delta;
258 checkStream ();
259
260 if ( delta < 0 )
261 total ++;
262 if ( read_deltas )
263 align.deltas.push_back (delta);
264 } while ( delta != 0 );
265
266 //-- Flush the remaining whitespace
267 while ( delta_stream_m.get () != '\n' );
268
269 //-- Calculate the identity, similarity and stopity
270 align.idy = (total - (float)align.idyc) / total * 100.0;
271 align.sim = (total - (float)align.simc) / total * 100.0;
272 align.stp = (float)align.stpc / (total * 2.0) * 100.0;
273 }
274
275
276 //----------------------------------------------------- readNextRecord ---------
277 bool DeltaReader_t::readNextRecord (const bool read_deltas)
278 {
279 //-- If EOF or any other abnormality
280 if ( delta_stream_m.peek () != '>' )
281 return false;
282
283 //-- Make way for the new record
284 record_m.clear ();
285 is_record_m = true;
286
287 //-- Read the record header
288 delta_stream_m.get ();
289 delta_stream_m >> record_m.idR;
290 delta_stream_m >> record_m.idQ;
291 delta_stream_m >> record_m.lenR;
292 delta_stream_m >> record_m.lenQ;
293 if ( record_m.lenR <= 0 || record_m.lenQ <= 0 )
294 delta_stream_m.setstate (ios::failbit);
295 checkStream ();
296
297 //-- Flush the remaining whitespace
298 while ( delta_stream_m.get () != '\n' );
299
300 //-- For each alignment...
301 DeltaAlignment_t align;
302 while ( delta_stream_m.peek () != '>' &&
303 delta_stream_m.peek () != EOF )
304 {
305 readNextAlignment (align, read_deltas);
306 record_m.aligns.push_back (align);
307 }
308
309 return true;
310 }
311
312
313 //===================================================== DeltaEdge_t ============
314 //------------------------------------------------------build ------------------
315 void DeltaEdge_t::build (const DeltaRecord_t & rec)
316 {
317 stringstream ss;
318 vector<long>::const_iterator di;
319 DeltaEdgelet_t * p;
320
321 vector<DeltaAlignment_t>::const_iterator i;
322 for ( i = rec.aligns.begin(); i != rec.aligns.end(); ++ i )
323 {
324 //-- Set the edgelet
325 p = new DeltaEdgelet_t();
326
327 p->edge = this;
328
329 p->idy = i->idy / 100.0;
330 p->sim = i->sim / 100.0;
331 p->stp = i->stp / 100.0;
332
333 p->idyc = i->idyc;
334 p->simc = i->simc;
335 p->stpc = i->stpc;
336
337 p->loR = i->sR;
338 p->hiR = i->eR;
339 p->loQ = i->sQ;
340 p->hiQ = i->eQ;
341
342 p->dirR = p->hiR < p->loR ? REVERSE_DIR : FORWARD_DIR;
343 p->dirQ = p->hiQ < p->loQ ? REVERSE_DIR : FORWARD_DIR;
344
345 //-- Get the delta information
346 for ( di = i->deltas.begin(); di != i->deltas.end(); ++ di )
347 {
348 ss << *di << '\n';
349 p->delta.append (ss.str());
350 ss.str ("");
351 }
352
353 //-- Force loR < hiR && loQ < hiQ
354 if ( p->dirR == REVERSE_DIR )
355 Swap (p->loR, p->hiR);
356 if ( p->dirQ == REVERSE_DIR )
357 Swap (p->loQ, p->hiQ);
358
359 edgelets.push_back (p);
360 }
361 }
362
363
364 //===================================================== DeltaGraph_t ===========
365 //----------------------------------------------------- build ------------------
366 //! \brief Build a new graph from a deltafile
367 //!
368 //! Does not populate:
369 //! node->seq
370 //! edgelet->frmR/frmQ
371 //! edgelet->snps
372 //!
373 //! \param deltapath The path of the deltafile to read
374 //! \param getdeltas Read the delta-encoded gap positions? yes/no
375 //! \return void
376 //!
377 void DeltaGraph_t::build (const string & deltapath, bool getdeltas)
378 {
379 DeltaReader_t dr;
380 DeltaEdge_t * dep;
381 pair<map<string, DeltaNode_t>::iterator, bool> insret;
382
383
384 //-- Open the delta file and read in the alignment information
385 dr.open (deltapath);
386
387 refpath = dr.getReferencePath();
388 qrypath = dr.getQueryPath();
389
390 if ( dr.getDataType() == NUCMER_STRING )
391 datatype = NUCMER_DATA;
392 else if ( dr.getDataType() == PROMER_STRING )
393 datatype = PROMER_DATA;
394 else
395 datatype = NULL_DATA;
396
397 //-- Read in the next graph edge, i.e. a new delta record
398 while ( dr.readNext (getdeltas) )
399 {
400 dep = new DeltaEdge_t();
401
402
403 //-- Find the reference node in the graph, add a new one if necessary
404 insret = refnodes.insert
405 (map<string, DeltaNode_t>::value_type
406 (dr.getRecord().idR, DeltaNode_t()));
407 dep->refnode = &((insret.first)->second);
408
409 //-- If a new reference node
410 if ( insret.second )
411 {
412 dep->refnode->id = &((insret.first)->first);
413 dep->refnode->len = dr.getRecord().lenR;
414 }
415
416
417 //-- Find the query node in the graph, add a new one if necessary
418 insret = qrynodes.insert
419 (map<string, DeltaNode_t>::value_type
420 (dr.getRecord().idQ, DeltaNode_t()));
421 dep->qrynode = &((insret.first)->second);
422
423 //-- If a new query node
424 if ( insret.second )
425 {
426 dep->qrynode->id = &((insret.first)->first);
427 dep->qrynode->len = dr.getRecord().lenQ;
428 }
429
430
431 //-- Build the edge
432 dep->build (dr.getRecord());
433 dep->refnode->edges.push_back (dep);
434 dep->qrynode->edges.push_back (dep);
435 }
436 dr.close ();
437 }
438
439
440 //------------------------------------------------------------------- clean ----
441 //! \brief Clean the graph of all edgelets where isGOOD = false
442 //!
443 //! Removes all edgelets from the graph where isGOOD = false. Afterwhich, all
444 //! now empty edges or nodes are also removed.
445 //!
446 //! \return void
447 //!
448 void DeltaGraph_t::clean()
449 {
450 map<string, DeltaNode_t>::iterator i;
451 map<string, DeltaNode_t>::iterator ii;
452 vector<DeltaEdge_t *>::iterator j;
453 vector<DeltaEdgelet_t *>::iterator k;
454
455 //-- For all ref nodes
456 for ( i = refnodes.begin(); i != refnodes.end(); )
457 {
458 //-- For all edges
459 for ( j = i->second.edges.begin();
460 j != i->second.edges.end(); ++ j )
461 {
462 //-- For all edgelets
463 for ( k = (*j)->edgelets.begin();
464 k != (*j)->edgelets.end(); ++ k )
465 {
466 if ( ! (*k)->isGOOD )
467 {
468 //-- Delete the bad edgelet and mark for erasure
469 delete (*k);
470 *k = NULL;
471 }
472 }
473
474 //-- Erase the marked edgelets
475 k = partition ((*j)->edgelets.begin(),
476 (*j)->edgelets.end(),
477 NULLPred_t());
478 (*j)->edgelets.erase (k, (*j)->edgelets.end());
479
480 //-- Mark the edge if empty
481 if ( (*j)->edgelets.empty() )
482 *j = NULL;
483 }
484
485 //-- Erase the marked edges
486 j = partition (i->second.edges.begin(),
487 i->second.edges.end(),
488 NULLPred_t());
489 i->second.edges.erase (j, i->second.edges.end());
490
491 //-- Erase the node if empty
492 ii = i ++;
493 if ( ii->second.edges.empty() )
494 refnodes.erase (ii);
495 }
496
497 //-- For all qry nodes
498 for ( i = qrynodes.begin(); i != qrynodes.end(); )
499 {
500 for ( j = i->second.edges.begin();
501 j != i->second.edges.end(); ++ j )
502 {
503 //-- Delete the edge if empty and mark for erasure
504 if ( (*j)->edgelets.empty() )
505 {
506 delete (*j);
507 *j = NULL;
508 }
509 }
510
511 //-- Erase the marked edges
512 j = partition (i->second.edges.begin(),
513 i->second.edges.end(),
514 NULLPred_t());
515 i->second.edges.erase (j, i->second.edges.end());
516
517 //-- Erase the node if empty
518 ii = i ++;
519 if ( ii->second.edges.empty() )
520 qrynodes.erase (ii);
521 }
522
523 }
524
525
526 //------------------------------------------------------------------- clear ----
527 //! \brief Clear the graph of all nodes, edges and edgelets
528 //!
529 //! \return void
530 //!
531 void DeltaGraph_t::clear ( )
532 {
533 //-- Make sure the edges only get destructed once
534 std::map<std::string, DeltaNode_t>::iterator i;
535 std::vector<DeltaEdge_t *>::iterator j;
536 for ( i = refnodes . begin( ); i != refnodes . end( ); ++ i )
537 for ( j = i -> second . edges . begin( );
538 j != i -> second . edges . end( ); ++ j )
539 delete (*j);
540
541 refnodes.clear( );
542 qrynodes.clear( );
543 refpath.erase( );
544 qrypath.erase( );
545 datatype = NULL_DATA;
546 }
547
548
549 //------------------------------------------------------------ getNodeCount ----
550 //! \brief Counts and returns the number of graph nodes (sequences)
551 //!
552 //! \return The number of graph nodes
553 //!
554 long DeltaGraph_t::getNodeCount()
555 {
556 long sum = refnodes.size() + qrynodes.size();
557 return sum;
558 }
559
560
561 //------------------------------------------------------------ getEdgeCount ----
562 //! \brief Counts and returns the number of graph edges
563 //!
564 //! \return void
565 //!
566 long DeltaGraph_t::getEdgeCount()
567 {
568 long sum = 0;
569
570 map<string, DeltaNode_t>::iterator i;
571 for ( i = refnodes.begin(); i != refnodes.end(); ++ i )
572 sum += i->second.edges.size();
573
574 return sum;
575 }
576
577
578 //--------------------------------------------------------- getEdgeletCount ----
579 //! \brief Counts and returns the number of graph edgelets (alignments)
580 //!
581 //! \return void
582 //!
583 long DeltaGraph_t::getEdgeletCount()
584 {
585 long sum = 0;
586
587 map<string, DeltaNode_t>::iterator i;
588 vector<DeltaEdge_t *>::iterator j;
589 for ( i = refnodes.begin(); i != refnodes.end(); ++ i )
590 for ( j = i->second.edges.begin();
591 j != i->second.edges.end(); ++ j )
592 sum += (*j)->edgelets.size();
593
594 return sum;
595 }
596
597
598 //---------------------------------------------------------------- flagGOOD ----
599 //! \brief Sets isGOOD = 1 for all edgelets
600 void DeltaGraph_t::flagGOOD()
601 {
602 map<string, DeltaNode_t>::const_iterator mi;
603 vector<DeltaEdge_t *>::const_iterator ei;
604 vector<DeltaEdgelet_t *>::iterator eli;
605
606 //-- All references
607 for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi )
608 {
609 //-- All queries matching this reference
610 for ( ei = (mi->second).edges.begin();
611 ei != (mi->second).edges.end(); ++ ei )
612 {
613 //-- All alignments between reference and query
614 for ( eli = (*ei)->edgelets.begin();
615 eli != (*ei)->edgelets.end(); ++ eli )
616 {
617 (*eli)->isGOOD = true;
618 }
619 }
620 }
621 }
622
623
624 //---------------------------------------------------------------- flag1to1 ----
625 //! \brief Intersection of flagQLIS and flagRLIS
626 void DeltaGraph_t::flag1to1(float epsilon, float maxolap)
627 {
628 flagRLIS(epsilon, maxolap, false);
629 flagQLIS(epsilon, maxolap, false);
630
631 map<string, DeltaNode_t>::const_iterator mi;
632 vector<DeltaEdge_t *>::const_iterator ei;
633 vector<DeltaEdgelet_t *>::iterator eli;
634
635 //-- All references
636 for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi )
637 {
638 //-- All queries matching this reference
639 for ( ei = (mi->second).edges.begin();
640 ei != (mi->second).edges.end(); ++ ei )
641 {
642 //-- All alignments between reference and query
643 for ( eli = (*ei)->edgelets.begin();
644 eli != (*ei)->edgelets.end(); ++ eli )
645 {
646 if ( !(*eli)->isRLIS || !(*eli)->isQLIS )
647 (*eli)->isGOOD = false;
648 }
649 }
650 }
651 }
652
653
654 //---------------------------------------------------------------- flagMtoM ----
655 //! \brief Union of flagQLIS and flagRLIS
656 void DeltaGraph_t::flagMtoM(float epsilon, float maxolap)
657 {
658 flagRLIS(epsilon, maxolap, false);
659 flagQLIS(epsilon, maxolap, false);
660
661 map<string, DeltaNode_t>::const_iterator mi;
662 vector<DeltaEdge_t *>::const_iterator ei;
663 vector<DeltaEdgelet_t *>::iterator eli;
664
665 //-- All references
666 for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi )
667 {
668 //-- All queries matching this reference
669 for ( ei = (mi->second).edges.begin();
670 ei != (mi->second).edges.end(); ++ ei )
671 {
672 //-- All alignments between reference and query
673 for ( eli = (*ei)->edgelets.begin();
674 eli != (*ei)->edgelets.end(); ++ eli )
675 {
676 if ( !(*eli)->isRLIS && !(*eli)->isQLIS )
677 (*eli)->isGOOD = false;
678 }
679 }
680 }
681 }
682
683
684 //-------------------------------------------------------------- flagGLIS ------
685 //! \brief Flag edgelets in the global LIS and unflag those who are not
686 //!
687 //! Runs a length*identity weigthed LIS to determine the longest, mutually
688 //! consistent set of alignments between all pairs of sequences. This
689 //! essentially constructs the global alignment between all sequence pairs.
690 //!
691 //! Sets isGLIS flag for good and unsets isGOOD flag for bad.
692 //!
693 //! \param epsilon Keep repeat alignments within epsilon % of the best align
694 //! \return void
695 //!
696 void DeltaGraph_t::flagGLIS (float epsilon)
697 {
698 LIS_t * lis = NULL;
699 long lis_size = 0;
700 long i, j, n;
701 long olap, olapQ, olapR, len, lenQ, lenR, score, diff;
702
703 vector<DeltaEdgelet_t *> edgelets;
704
705 map<string, DeltaNode_t>::const_iterator mi;
706 vector<DeltaEdge_t *>::const_iterator ei;
707 vector<DeltaEdgelet_t *>::iterator eli;
708
709
710 //-- For each reference sequence
711 for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi )
712 {
713 //-- For each query aligning to this reference
714 for ( ei = (mi->second).edges.begin();
715 ei != (mi->second).edges.end(); ++ ei )
716 {
717 //-- Collect all the good edgelets
718 edgelets.clear();
719 for ( eli = (*ei)->edgelets.begin();
720 eli != (*ei)->edgelets.end(); ++ eli )
721 if ( (*eli)->isGOOD )
722 {
723 edgelets.push_back (*eli);
724
725 //-- Fix the coordinates to make global LIS work
726 if ( (*eli)->dirR == (*eli)->dirQ )
727 {
728 (*eli)->dirQ = FORWARD_DIR;
729 }
730 else
731 {
732 if ( (*eli)->dirQ == REVERSE_DIR )
733 Swap ((*eli)->loQ, (*eli)->hiQ);
734 (*eli)->loQ = RevC ((*eli)->loQ, (*ei)->qrynode->len);
735 (*eli)->hiQ = RevC ((*eli)->hiQ, (*ei)->qrynode->len);
736 (*eli)->dirQ = REVERSE_DIR;
737 }
738 }
739
740 //-- Resize and initialize
741 n = edgelets.size();
742 if ( n > lis_size )
743 {
744 lis = (LIS_t *) Safe_realloc (lis, sizeof (LIS_t) * n);
745 lis_size = n;
746 }
747 for ( i = 0; i < n; ++ i )
748 lis[i].used = false;
749
750 //-- Sort by lo query coord
751 sort (edgelets.begin(), edgelets.end(), EdgeletQCmp_t());
752
753 //-- Continue until all equivalent repeats are extracted
754 vector<long> allbest;
755 do
756 {
757 //-- Dynamic
758 for ( i = 0; i < n; ++ i )
759 {
760 if ( lis[i].used ) continue;
761
762 lis[i].a = edgelets[i];
763
764 lenR = lis[i].a->hiR - lis[i].a->loR + 1;
765 lenQ = lis[i].a->hiQ - lis[i].a->loQ + 1;
766 len = lenR > lenQ ? lenQ : lenR;
767 lis[i].score = ScoreGlobal (0, len, 0, lis[i].a->idy);
768
769 lis[i].from = -1;
770 lis[i].diff = 0;
771
772 for ( j = 0; j < i; ++ j )
773 {
774 if ( lis[j].used ) continue;
775
776 if ( lis[i].a->dirQ != lis[j].a->dirQ )
777 continue;
778
779 lenR = lis[i].a->hiR - lis[i].a->loR + 1;
780 lenQ = lis[i].a->hiQ - lis[i].a->loQ + 1;
781 len = lenR > lenQ ? lenQ : lenR;
782
783 olapR = lis[j].a->hiR - lis[i].a->loR + 1;
784 olapQ = lis[j].a->hiQ - lis[i].a->loQ + 1;
785 olap = olapR > olapQ ? olapR : olapQ;
786 if ( olap < 0 )
787 olap = 0;
788
789 diff = lis[j].diff + DiffAligns (lis[i].a, lis[j].a);
790
791 score = ScoreGlobal
792 (lis[j].score, len, olap, lis[i].a->idy);
793
794 if ( score > lis[i].score
795 ||
796 (score == lis[i].score && diff < lis[i].diff) )
797 {
798 lis[i].from = j;
799 lis[i].score = score;
800 lis[i].diff = diff;
801 }
802 }
803 }
804 } while ( UpdateBest (lis, n, allbest, epsilon) );
805
806 long beg = PickBest (lis, allbest, epsilon);
807 long end = allbest.size();
808 if ( beg == end ) beg = 0;
809 else end = beg + 1;
810
811 //-- Flag the edgelets
812 for ( ; beg < end; ++ beg )
813 for ( i = allbest[beg]; i >= 0 && i < n; i = lis[i].from )
814 lis[i].a->isGLIS = true;
815
816 //-- Repair the coordinates
817 for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli )
818 {
819 if ( ! (*eli)->isGLIS )
820 (*eli)->isGOOD = false;
821
822 if ( (*eli)->dirQ == FORWARD_DIR )
823 {
824 (*eli)->dirQ = (*eli)->dirR;
825 }
826 else
827 {
828 if ( (*eli)->dirR == FORWARD_DIR )
829 Swap ((*eli)->loQ, (*eli)->hiQ);
830 (*eli)->loQ = RevC ((*eli)->loQ, (*ei)->qrynode->len);
831 (*eli)->hiQ = RevC ((*eli)->hiQ, (*ei)->qrynode->len);
832 (*eli)->dirQ =
833 (*eli)->dirR == FORWARD_DIR ? REVERSE_DIR : FORWARD_DIR;
834 }
835 }
836 }
837 }
838
839 free (lis);
840 }
841
842
843 //-------------------------------------------------------------- flagQLIS ------
844 //! \brief Flag edgelets in the query LIS and unflag those who are not
845 //!
846 //! Runs a length*identity weighted LIS to determine the longest, QUERY
847 //! consistent set of alignments for all query sequences. This effectively
848 //! identifies the "best" alignments for all positions of each query.
849 //!
850 //! Sets isQLIS flag for good and unsets isGOOD flag for bad.
851 //!
852 //! \param epsilon Keep repeat alignments within epsilon % of the best align
853 //! \param maxolap Only allow alignments to overlap by maxolap percent [0-100]
854 //! \param flagBad Flag non QLIS edgelets as !isGOOD
855 //! \return void
856 //!
857 void DeltaGraph_t::flagQLIS (float epsilon, float maxolap, bool flagbad)
858 {
859 LIS_t * lis = NULL;
860 long lis_size = 0;
861 long i, j, n;
862 long olap, leni, lenj, score, diff;
863
864 vector<DeltaEdgelet_t *> edgelets;
865
866 map<string, DeltaNode_t>::const_iterator mi;
867 vector<DeltaEdge_t *>::const_iterator ei;
868 vector<DeltaEdgelet_t *>::iterator eli;
869
870
871 //-- For each query sequence
872 for ( mi = qrynodes.begin(); mi != qrynodes.end(); ++ mi )
873 {
874 //-- Collect all the good edgelets
875 edgelets.clear();
876 for ( ei = (mi->second).edges.begin();
877 ei != (mi->second).edges.end(); ++ ei )
878 for ( eli = (*ei)->edgelets.begin();
879 eli != (*ei)->edgelets.end(); ++ eli )
880 if ( (*eli)->isGOOD )
881 edgelets.push_back (*eli);
882
883 //-- Resize and initialize
884 n = edgelets.size();
885 if ( n > lis_size )
886 {
887 lis = (LIS_t *) Safe_realloc (lis, sizeof (LIS_t) * n);
888 lis_size = n;
889 }
890 for ( i = 0; i < n; ++ i )
891 lis[i].used = false;
892
893 //-- Sort by lo query coord
894 sort (edgelets.begin(), edgelets.end(), EdgeletQCmp_t());
895
896 //-- Continue until all equivalent repeats are extracted
897 vector<long> allbest;
898 do
899 {
900 //-- Dynamic
901 for ( i = 0; i < n; ++ i )
902 {
903 if ( lis[i].used ) continue;
904
905 lis[i].a = edgelets[i];
906
907 leni = lis[i].a->hiQ - lis[i].a->loQ + 1;
908 lis[i].score =
909 ScoreLocal (0, leni, 0, 0, lis[i].a->idy, 0);
910
911 lis[i].from = -1;
912 lis[i].diff = 0;
913
914 for ( j = 0; j < i; ++ j )
915 {
916 if ( lis[j].used ) continue;
917 if ( lis[j].from >= 0 &&
918 lis[lis[j].from].a->hiQ >= lis[i].a->loQ )
919 continue;
920
921 leni = lis[i].a->hiQ - lis[i].a->loQ + 1;
922 lenj = lis[j].a->hiQ - lis[j].a->loQ + 1;
923 olap = lis[j].a->hiQ - lis[i].a->loQ + 1;
924 if ( olap < 0 )
925 olap = 0;
926
927 diff = lis[j].diff + DiffAligns (lis[i].a, lis[j].a);
928
929 score = ScoreLocal
930 (lis[j].score, leni, lenj, olap, lis[i].a->idy, maxolap);
931
932 if ( score > lis[i].score
933 ||
934 (score == lis[i].score && diff < lis[i].diff) )
935 {
936 lis[i].from = j;
937 lis[i].score = score;
938 lis[i].diff = diff;
939 }
940 }
941 }
942 } while ( UpdateBest (lis, n, allbest, epsilon) );
943
944 long beg = PickBest (lis, allbest, epsilon);
945 long end = allbest.size();
946 if ( beg == end ) beg = 0;
947 else end = beg + 1;
948
949 //-- Flag the edgelets
950 for ( ; beg < end; ++ beg )
951 for ( i = allbest[beg]; i >= 0 && i < n; i = lis[i].from )
952 lis[i].a->isQLIS = true;
953
954 if ( flagbad )
955 for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli )
956 if ( ! (*eli)->isQLIS )
957 (*eli)->isGOOD = false;
958 }
959
960 free (lis);
961 }
962
963
964 //-------------------------------------------------------------- flagRLIS ------
965 //! \brief Flag edgelets in the reference LIS and unflag those who are not
966 //!
967 //! Runs a length*identity weighted LIS to determine the longest, REFERENCE
968 //! consistent set of alignments for all reference sequences. This effectively
969 //! identifies the "best" alignments for all positions of each reference.
970 //!
971 //! Sets isRLIS flag for good and unsets isGOOD flag for bad.
972 //!
973 //! \param epsilon Keep repeat alignments within epsilon % of the best align
974 //! \param maxolap Only allow alignments to overlap by maxolap percent [0-100]
975 //! \param flagBad Flag non RLIS edgelets as !isGOOD
976 //! \return void
977 //!
978 void DeltaGraph_t::flagRLIS (float epsilon, float maxolap, bool flagbad)
979 {
980 LIS_t * lis = NULL;
981 long lis_size = 0;
982 long i, j, n;
983 long olap, leni, lenj, score, diff;
984
985 vector<DeltaEdgelet_t *> edgelets;
986
987 map<string, DeltaNode_t>::const_iterator mi;
988 vector<DeltaEdge_t *>::const_iterator ei;
989 vector<DeltaEdgelet_t *>::iterator eli;
990
991
992 //-- For each reference sequence
993 for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi )
994 {
995 //-- Collect all the good edgelets
996 edgelets.clear();
997 for ( ei = (mi->second).edges.begin();
998 ei != (mi->second).edges.end(); ++ ei )
999 for ( eli = (*ei)->edgelets.begin();
1000 eli != (*ei)->edgelets.end(); ++ eli )
1001 if ( (*eli)->isGOOD )
1002 edgelets.push_back (*eli);
1003
1004 //-- Resize
1005 n = edgelets.size();
1006 if ( n > lis_size )
1007 {
1008 lis = (LIS_t *) Safe_realloc (lis, sizeof (LIS_t) * n);
1009 lis_size = n;
1010 }
1011 for ( i = 0; i < n; ++ i )
1012 lis[i].used = false;
1013
1014 //-- Sort by lo reference coord
1015 sort (edgelets.begin(), edgelets.end(), EdgeletRCmp_t());
1016
1017 //-- Continue until all equivalent repeats are extracted
1018 vector<long> allbest;
1019 do
1020 {
1021 //-- Dynamic
1022 for ( i = 0; i < n; ++ i )
1023 {
1024 if ( lis[i].used ) continue;
1025
1026 lis[i].a = edgelets[i];
1027
1028 leni = lis[i].a->hiR - lis[i].a->loR + 1;
1029 lis[i].score =
1030 ScoreLocal (0, leni, 0, 0, lis[i].a->idy, 0);
1031
1032 lis[i].from = -1;
1033 lis[i].diff = 0;
1034
1035 for ( j = 0; j < i; ++ j )
1036 {
1037 if ( lis[j].used ) continue;
1038 if ( lis[j].from >= 0 &&
1039 lis[lis[j].from].a->hiR >= lis[i].a->loR )
1040 continue;
1041
1042 leni = lis[i].a->hiR - lis[i].a->loR + 1;
1043 lenj = lis[j].a->hiR - lis[j].a->loR + 1;
1044 olap = lis[j].a->hiR - lis[i].a->loR + 1;
1045 if ( olap < 0 )
1046 olap = 0;
1047
1048 diff = lis[j].diff + DiffAligns (lis[i].a, lis[j].a);
1049
1050 score = ScoreLocal
1051 (lis[j].score, leni, lenj, olap, lis[i].a->idy, maxolap);
1052
1053 if ( score > lis[i].score
1054 ||
1055 (score == lis[i].score && diff < lis[i].diff) )
1056 {
1057 lis[i].from = j;
1058 lis[i].score = score;
1059 lis[i].diff = diff;
1060 }
1061 }
1062 }
1063 } while ( UpdateBest (lis, n, allbest, epsilon) );
1064
1065 long beg = PickBest (lis, allbest, epsilon);
1066 long end = allbest.size();
1067 if ( beg == end ) beg = 0;
1068 else end = beg + 1;
1069
1070 //-- Flag the edgelets
1071 for ( ; beg < end; ++ beg )
1072 for ( i = allbest[beg]; i >= 0 && i < n; i = lis[i].from )
1073 lis[i].a->isRLIS = true;
1074
1075 if ( flagbad )
1076 for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli )
1077 if ( ! (*eli)->isRLIS )
1078 (*eli)->isGOOD = false;
1079 }
1080
1081 free (lis);
1082 }
1083
1084
1085 //------------------------------------------------------------- flagScore ------
1086 //! \brief Flag edgelets with scores below a score threshold
1087 //!
1088 //! Unsets isGOOD for bad.
1089 //!
1090 //! \param minlen Flag edgelets if less than minlen in length
1091 //! \param minidy Flag edgelets if less than minidy identity [0-100]
1092 //! \return void
1093 //!
1094 void DeltaGraph_t::flagScore (long minlen, float minidy)
1095 {
1096 map<string, DeltaNode_t>::const_iterator mi;
1097 vector<DeltaEdge_t *>::const_iterator ei;
1098 vector<DeltaEdgelet_t *>::iterator eli;
1099
1100 for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi )
1101 for ( ei = (mi->second).edges.begin();
1102 ei != (mi->second).edges.end(); ++ ei )
1103 for ( eli = (*ei)->edgelets.begin();
1104 eli != (*ei)->edgelets.end(); ++ eli )
1105 if ( (*eli)->isGOOD )
1106 {
1107 //-- Flag low identities
1108 if ( (*eli)->idy * 100.0 < minidy )
1109 (*eli)->isGOOD = false;
1110
1111 //-- Flag small lengths
1112 if ( (*eli)->hiR - (*eli)->loR + 1 < minlen ||
1113 (*eli)->hiQ - (*eli)->loQ + 1 < minlen )
1114 (*eli)->isGOOD = false;
1115 }
1116 }
1117
1118
1119 //-------------------------------------------------------------- flagUNIQ ------
1120 //! \brief Flag edgelets with uniqueness below a certain threshold
1121 //!
1122 //! Unsets isGOOD for bad.
1123 //!
1124 //! \param minuniq Flag edgelets if less that minuniq percent [0-100] unique
1125 //! \return void
1126 //!
1127 void DeltaGraph_t::flagUNIQ (float minuniq)
1128 {
1129 long i, uniq, len;
1130
1131 vector<DeltaEdgelet_t *> edgelets;
1132
1133 map<string, DeltaNode_t>::const_iterator mi;
1134 vector<DeltaEdge_t *>::const_iterator ei;
1135 vector<DeltaEdgelet_t *>::iterator eli;
1136
1137
1138 //-- For each reference sequence
1139 long ref_size = 0;
1140 long ref_len = 0;
1141 unsigned char * ref_cov = NULL;
1142 for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi )
1143 {
1144 //-- Reset the reference coverage array
1145 ref_len = (mi->second).len;
1146 if ( ref_len > ref_size )
1147 {
1148 ref_cov = (unsigned char *) Safe_realloc (ref_cov, ref_len + 1);
1149 ref_size = ref_len;
1150 }
1151 for ( i = 1; i <= ref_len; ++ i )
1152 ref_cov[i] = 0;
1153
1154 //-- Collect all the good edgelets
1155 edgelets.clear();
1156 for ( ei = (mi->second).edges.begin();
1157 ei != (mi->second).edges.end(); ++ ei )
1158 for ( eli = (*ei)->edgelets.begin();
1159 eli != (*ei)->edgelets.end(); ++ eli )
1160 if ( (*eli)->isGOOD )
1161 {
1162 edgelets.push_back (*eli);
1163
1164 //-- Add to the reference coverage
1165 for ( i = (*eli)->loR; i <= (*eli)->hiR; i ++ )
1166 if ( ref_cov[i] < UCHAR_MAX )
1167 ref_cov[i] ++;
1168 }
1169
1170 //-- Calculate the uniqueness of each edgelet
1171 for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli )
1172 {
1173 uniq = 0;
1174 len = (*eli)->hiR - (*eli)->loR + 1;
1175 for ( i = (*eli)->loR; i <= (*eli)->hiR; i ++ )
1176 if ( ref_cov[i] == 1 )
1177 uniq ++;
1178
1179 //-- Flag low reference uniqueness
1180 if ( (float)uniq / (float)len * 100.0 < minuniq )
1181 (*eli)->isGOOD = false;
1182 }
1183 }
1184 free (ref_cov);
1185
1186
1187 //-- For each query sequence
1188 long qry_size = 0;
1189 long qry_len = 0;
1190 unsigned char * qry_cov = NULL;
1191 for ( mi = qrynodes.begin(); mi != qrynodes.end(); ++ mi )
1192 {
1193 //-- Reset the query coverage array
1194 qry_len = (mi->second).len;
1195 if ( qry_len > qry_size )
1196 {
1197 qry_cov = (unsigned char *) Safe_realloc (qry_cov, qry_len + 1);
1198 qry_size = qry_len;
1199 }
1200 for ( i = 1; i <= qry_len; ++ i )
1201 qry_cov[i] = 0;
1202
1203 //-- Collect all the good edgelets
1204 edgelets.clear();
1205 for ( ei = (mi->second).edges.begin();
1206 ei != (mi->second).edges.end(); ++ ei )
1207 for ( eli = (*ei)->edgelets.begin();
1208 eli != (*ei)->edgelets.end(); ++ eli )
1209 if ( (*eli)->isGOOD )
1210 {
1211 edgelets.push_back (*eli);
1212
1213 //-- Add to the query coverage
1214 for ( i = (*eli)->loQ; i <= (*eli)->hiQ; i ++ )
1215 if ( qry_cov[i] < UCHAR_MAX )
1216 qry_cov[i] ++;
1217 }
1218
1219 //-- Calculate the uniqueness of each edgelet
1220 for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli )
1221 {
1222 uniq = 0;
1223 len = (*eli)->hiQ - (*eli)->loQ + 1;
1224 for ( i = (*eli)->loQ; i <= (*eli)->hiQ; i ++ )
1225 if ( qry_cov[i] == 1 )
1226 uniq ++;
1227
1228 //-- Flag low query uniqueness
1229 if ( (float)uniq / (float)len * 100.0 < minuniq )
1230 (*eli)->isGOOD = false;
1231 }
1232 }
1233 free (qry_cov);
1234 }
1235
1236
1237 //----------------------------------------------------- loadSequences ----------
1238 //! \brief Load the sequence information into the DeltaNodes
1239 //!
1240 //! \return void
1241 //!
1242 void DeltaGraph_t::loadSequences ()
1243 {
1244 map<string, DeltaNode_t>::iterator mi;
1245
1246 FILE * qryfile, * reffile;
1247 char * R = NULL;
1248 char * Q = NULL;
1249 char id [MAX_LINE];
1250 long initsize;
1251 long len;
1252
1253 //-- Read in the reference sequences
1254 reffile = File_Open (refpath.c_str(), "r");
1255 initsize = INIT_SIZE;
1256 R = (char *) Safe_malloc (initsize);
1257 while ( Read_String (reffile, R, initsize, id, FALSE) )
1258 if ( (mi = refnodes.find (id)) != refnodes.end() )
1259 {
1260 len = strlen (R + 1);
1261 mi->second.seq = (char *) Safe_malloc (len + 2);
1262 mi->second.seq[0] = '\0';
1263 strcpy (mi->second.seq + 1, R + 1);
1264 if ( len != mi->second.len )
1265 {
1266 cerr << "ERROR: Reference input does not match delta file\n";
1267 exit (EXIT_FAILURE);
1268 }
1269 }
1270 fclose (reffile);
1271 free (R);
1272
1273 //-- Read in the query sequences
1274 qryfile = File_Open (qrypath.c_str(), "r");
1275 initsize = INIT_SIZE;
1276 Q = (char *) Safe_malloc (initsize);
1277 while ( Read_String (qryfile, Q, initsize, id, FALSE) )
1278 if ( (mi = qrynodes.find (id)) != qrynodes.end() )
1279 {
1280 len = strlen (Q + 1);
1281 mi->second.seq = (char *) Safe_malloc (len + 2);
1282 mi->second.seq[0] = '\0';
1283 strcpy (mi->second.seq + 1, Q + 1);
1284 if ( len != mi->second.len )
1285 {
1286 cerr << "ERROR: Query input does not match delta file\n";
1287 exit (EXIT_FAILURE);
1288 }
1289 }
1290 fclose (qryfile);
1291 free (Q);
1292
1293
1294 //-- Check that we found all the sequences
1295 for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi )
1296 if ( mi->second.seq == NULL )
1297 {
1298 cerr << "ERROR: '" << mi->first << "' not found in reference file\n";
1299 exit (EXIT_FAILURE);
1300 }
1301
1302 for ( mi = qrynodes.begin(); mi != qrynodes.end(); ++ mi )
1303 if ( mi->second.seq == NULL )
1304 {
1305 cerr << "ERROR: '" << mi->first << "' not found in query file\n";
1306 exit (EXIT_FAILURE);
1307 }
1308 }
1309
1310
1311 //----------------------------------------------------- outputDelta ------------
1312 //! \brief Outputs the contents of the graph as a deltafile
1313 //!
1314 //! \param out The output stream to write to
1315 //! \return The output stream
1316 //!
1317 ostream & DeltaGraph_t::outputDelta (ostream & out)
1318 {
1319 bool header;
1320 long s1, e1, s2, e2;
1321
1322 map<string, DeltaNode_t>::const_iterator mi;
1323 vector<DeltaEdge_t *>::const_iterator ei;
1324 vector<DeltaEdgelet_t *>::const_iterator eli;
1325
1326 //-- Print the file header
1327 cout
1328 << refpath << ' ' << qrypath << '\n'
1329 << (datatype == PROMER_DATA ? PROMER_STRING : NUCMER_STRING) << '\n';
1330
1331 for ( mi = qrynodes.begin(); mi != qrynodes.end(); ++ mi )
1332 {
1333 for ( ei = (mi->second).edges.begin();
1334 ei != (mi->second).edges.end(); ++ ei )
1335 {
1336 header = false;
1337
1338 for ( eli = (*ei)->edgelets.begin();
1339 eli != (*ei)->edgelets.end(); ++ eli )
1340 {
1341 if ( ! (*eli)->isGOOD )
1342 continue;
1343
1344 //-- Print the sequence header
1345 if ( ! header )
1346 {
1347 cout
1348 << '>'
1349 << *((*ei)->refnode->id) << ' '
1350 << *((*ei)->qrynode->id) << ' '
1351 << (*ei)->refnode->len << ' '
1352 << (*ei)->qrynode->len << '\n';
1353 header = true;
1354 }
1355 //-- Print the alignment
1356 s1 = (*eli)->loR;
1357 e1 = (*eli)->hiR;
1358 s2 = (*eli)->loQ;
1359 e2 = (*eli)->hiQ;
1360 if ( (*eli)->dirR == REVERSE_DIR )
1361 Swap (s1, e1);
1362 if ( (*eli)->dirQ == REVERSE_DIR )
1363 Swap (s2, e2);
1364
1365 cout
1366 << s1 << ' ' << e1 << ' ' << s2 << ' ' << e2 << ' '
1367 << (*eli)->idyc << ' '
1368 << (*eli)->simc << ' '
1369 << (*eli)->stpc << '\n'
1370 << (*eli)->delta;
1371 }
1372 }
1373 }
1374 return out;
1375 }