Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/delta.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/delta.cc Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,1375 @@ +//////////////////////////////////////////////////////////////////////////////// +//! \file +//! \author Adam M Phillippy +//! \date 03/26/2003 +//! +//! \brief Source for non-inline member functions of delta.hh +//! +//! \see delta.hh +//////////////////////////////////////////////////////////////////////////////// + +#include "delta.hh" +#include <map> +#include <vector> +#include <cmath> +#include <sstream> +#include <algorithm> +using namespace std; + + +inline long ScoreLocal +(long scorej, + long leni, long lenj, + long olap, float idyi, float maxolap); + + +struct LIS_t +//!< LIS score +{ + DeltaEdgelet_t * a; + long score; + long diff; + long from; + bool used; +}; + + +struct EdgeletQCmp_t +//!< Compares query lo coord +{ + bool operator() (const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const + { + //-- Sorting by score in the event of a tie ensures that when building + // LIS chains, the highest scoring ones get seen first, thus avoiding + // overlap problems + + if ( i->loQ < j->loQ ) + return true; + else if ( i->loQ > j->loQ ) + return false; + else if ( ScoreLocal (0, i->hiQ - i->loQ + 1, 0, 0, i->idy, 0) > + ScoreLocal (0, j->hiQ - j->loQ + 1, 0, 0, j->idy, 0) ) + return true; + else + return false; + } +}; + + +struct EdgeletRCmp_t +//!< Compares reference lo coord +{ + bool operator() (const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const + { + //-- Sorting by score in the event of a tie ensures that when building + // LIS chains, the highest scoring ones get seen first, thus avoiding + // overlap problems + + if ( i->loR < j->loR ) + return true; + else if ( i->loR > j->loR ) + return false; + else if ( ScoreLocal (0, i->hiR - i->loR + 1, 0, 0, i->idy, 0) > + ScoreLocal (0, j->hiR - j->loR + 1, 0, 0, j->idy, 0) ) + return true; + else + return false; + } +}; + + +struct NULLPred_t +//!< Return true if pointer is not NULL +{ + bool operator() (const void * i) const + { return (i != NULL); } +}; + + +//------------------------------------------------------------ DiffAligns ------ +inline long DiffAligns +(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) +{ + return + ( ( j->loR < i->loR ) ? labs (j->hiR - i->loR) : labs (i->hiR - j->loR) ) + + ( ( j->loQ < i->loQ ) ? labs (j->hiQ - i->loQ) : labs (i->hiQ - j->loQ) ); +} + + +//------------------------------------------------------------ PickBest -------- +long PickBest +(const LIS_t * lis, const vector<long> & allbest, float epsilon) +{ + long size = allbest.size(); + if ( epsilon < 0 && size != 0 ) + { + long eqc = 0; + for ( ; eqc < size; ++ eqc ) + if ( lis[allbest[eqc]].diff != lis[allbest.front()].diff ) + break; + + return (int)((double)eqc*rand() / (RAND_MAX + 1.0)); + } + return size; +} + + +//------------------------------------------------------------------ RevC ------ +inline long RevC (const long & coord, + const long & len) +{ + return len - coord + 1; +} + + +//------------------------------------------------------------ ScoreLocal ------ +inline long ScoreLocal +(long scorej, long leni, long lenj, + long olap, float idyi, float maxolap) +{ + if ( olap > 0 && + ((float)olap / (float)leni * 100.0 > maxolap || + (float)olap / (float)lenj * 100.0 > maxolap) ) + return -1; + else + return (scorej + (long)((leni - olap) * pow (idyi, 2))); +} + + +//----------------------------------------------------------- ScoreGlobal ------ +inline long ScoreGlobal +(long scorej, long leni, long olap, float idyi) +{ + return (scorej + (long)((leni - olap) * pow (idyi, 2))); +} + + +//------------------------------------------------------------------ Swap ------ +inline void Swap (long & a, long & b) +{ + long t = a; a = b; b = t; +} + + +//------------------------------------------------------------ UpdateBest ------ +bool UpdateBest +(LIS_t * lis, long size, vector<long> & allbest, float epsilon) +{ + if ( size == 0 ) + return false; + + long best, i; + + //-- Find the best + for ( best = 0; best < size; ++ best ) + if ( ! lis[best].used ) + break; + for ( i = best + 1; i < size; ++ i ) + if ( ! lis[i].used + && + ( lis[i].score > lis[best].score + || + (lis[i].score == lis[best].score && + lis[i].diff < lis[best].diff) ) ) + best = i; + + //-- Nonequivalent + if ( ! allbest.empty() + && + (best == size + || + (epsilon < 0 && + lis[best].score < lis[allbest.front()].score) + || + (epsilon >= 0 && + (float)(lis[allbest.front()].score - lis[best].score) / + (float)(lis[allbest.front()].score) * 100.0 > epsilon)) ) + return false; + + //-- Equivalent + allbest.push_back (best); + + for ( i = best; i >= 0 && i < size; i = lis[i].from ) + lis[i].used = true; + + return true; +} + + +//===================================================== DeltaReader_t ========== +//----------------------------------------------------- open ------------------- +void DeltaReader_t::open +(const string & delta_path) +{ + delta_path_m = delta_path; + + //-- Open the delta file + delta_stream_m.open (delta_path_m.c_str ()); + checkStream (); + + //-- Read the file header + delta_stream_m >> reference_path_m; + delta_stream_m >> query_path_m; + delta_stream_m >> data_type_m; + if ( (data_type_m != NUCMER_STRING && data_type_m != PROMER_STRING) ) + delta_stream_m.setstate (ios::failbit); + checkStream (); + is_open_m = true; + + //-- Advance to first record header + while ( delta_stream_m.peek () != '>' ) + if ( delta_stream_m.get () == EOF ) + break; +} + + +//----------------------------------------------------- readNextAlignment ------ +void DeltaReader_t::readNextAlignment +(DeltaAlignment_t & align, const bool read_deltas) +{ + long delta; + float total; + + //-- Make way for the new alignment + align.clear (); + + //-- Read the alignment header + delta_stream_m >> align.sR; + delta_stream_m >> align.eR; + delta_stream_m >> align.sQ; + delta_stream_m >> align.eQ; + delta_stream_m >> align.idyc; + delta_stream_m >> align.simc; + delta_stream_m >> align.stpc; + if ( align.sR <= 0 || align.eR <= 0 || + align.sQ <= 0 || align.eQ <= 0 || + align.idyc < 0 || align.simc < 0 || align.stpc < 0 ) + delta_stream_m.setstate (ios::failbit); + checkStream (); + + total = labs(align.eR - align.sR) + 1.0; + if ( data_type_m == PROMER_STRING ) + total /= 3.0; + + //-- Get all the deltas + do + { + delta_stream_m >> delta; + checkStream (); + + if ( delta < 0 ) + total ++; + if ( read_deltas ) + align.deltas.push_back (delta); + } while ( delta != 0 ); + + //-- Flush the remaining whitespace + while ( delta_stream_m.get () != '\n' ); + + //-- Calculate the identity, similarity and stopity + align.idy = (total - (float)align.idyc) / total * 100.0; + align.sim = (total - (float)align.simc) / total * 100.0; + align.stp = (float)align.stpc / (total * 2.0) * 100.0; +} + + +//----------------------------------------------------- readNextRecord --------- +bool DeltaReader_t::readNextRecord (const bool read_deltas) +{ + //-- If EOF or any other abnormality + if ( delta_stream_m.peek () != '>' ) + return false; + + //-- Make way for the new record + record_m.clear (); + is_record_m = true; + + //-- Read the record header + delta_stream_m.get (); + delta_stream_m >> record_m.idR; + delta_stream_m >> record_m.idQ; + delta_stream_m >> record_m.lenR; + delta_stream_m >> record_m.lenQ; + if ( record_m.lenR <= 0 || record_m.lenQ <= 0 ) + delta_stream_m.setstate (ios::failbit); + checkStream (); + + //-- Flush the remaining whitespace + while ( delta_stream_m.get () != '\n' ); + + //-- For each alignment... + DeltaAlignment_t align; + while ( delta_stream_m.peek () != '>' && + delta_stream_m.peek () != EOF ) + { + readNextAlignment (align, read_deltas); + record_m.aligns.push_back (align); + } + + return true; +} + + +//===================================================== DeltaEdge_t ============ +//------------------------------------------------------build ------------------ +void DeltaEdge_t::build (const DeltaRecord_t & rec) +{ + stringstream ss; + vector<long>::const_iterator di; + DeltaEdgelet_t * p; + + vector<DeltaAlignment_t>::const_iterator i; + for ( i = rec.aligns.begin(); i != rec.aligns.end(); ++ i ) + { + //-- Set the edgelet + p = new DeltaEdgelet_t(); + + p->edge = this; + + p->idy = i->idy / 100.0; + p->sim = i->sim / 100.0; + p->stp = i->stp / 100.0; + + p->idyc = i->idyc; + p->simc = i->simc; + p->stpc = i->stpc; + + p->loR = i->sR; + p->hiR = i->eR; + p->loQ = i->sQ; + p->hiQ = i->eQ; + + p->dirR = p->hiR < p->loR ? REVERSE_DIR : FORWARD_DIR; + p->dirQ = p->hiQ < p->loQ ? REVERSE_DIR : FORWARD_DIR; + + //-- Get the delta information + for ( di = i->deltas.begin(); di != i->deltas.end(); ++ di ) + { + ss << *di << '\n'; + p->delta.append (ss.str()); + ss.str (""); + } + + //-- Force loR < hiR && loQ < hiQ + if ( p->dirR == REVERSE_DIR ) + Swap (p->loR, p->hiR); + if ( p->dirQ == REVERSE_DIR ) + Swap (p->loQ, p->hiQ); + + edgelets.push_back (p); + } +} + + +//===================================================== DeltaGraph_t =========== +//----------------------------------------------------- build ------------------ +//! \brief Build a new graph from a deltafile +//! +//! Does not populate: +//! node->seq +//! edgelet->frmR/frmQ +//! edgelet->snps +//! +//! \param deltapath The path of the deltafile to read +//! \param getdeltas Read the delta-encoded gap positions? yes/no +//! \return void +//! +void DeltaGraph_t::build (const string & deltapath, bool getdeltas) +{ + DeltaReader_t dr; + DeltaEdge_t * dep; + pair<map<string, DeltaNode_t>::iterator, bool> insret; + + + //-- Open the delta file and read in the alignment information + dr.open (deltapath); + + refpath = dr.getReferencePath(); + qrypath = dr.getQueryPath(); + + if ( dr.getDataType() == NUCMER_STRING ) + datatype = NUCMER_DATA; + else if ( dr.getDataType() == PROMER_STRING ) + datatype = PROMER_DATA; + else + datatype = NULL_DATA; + + //-- Read in the next graph edge, i.e. a new delta record + while ( dr.readNext (getdeltas) ) + { + dep = new DeltaEdge_t(); + + + //-- Find the reference node in the graph, add a new one if necessary + insret = refnodes.insert + (map<string, DeltaNode_t>::value_type + (dr.getRecord().idR, DeltaNode_t())); + dep->refnode = &((insret.first)->second); + + //-- If a new reference node + if ( insret.second ) + { + dep->refnode->id = &((insret.first)->first); + dep->refnode->len = dr.getRecord().lenR; + } + + + //-- Find the query node in the graph, add a new one if necessary + insret = qrynodes.insert + (map<string, DeltaNode_t>::value_type + (dr.getRecord().idQ, DeltaNode_t())); + dep->qrynode = &((insret.first)->second); + + //-- If a new query node + if ( insret.second ) + { + dep->qrynode->id = &((insret.first)->first); + dep->qrynode->len = dr.getRecord().lenQ; + } + + + //-- Build the edge + dep->build (dr.getRecord()); + dep->refnode->edges.push_back (dep); + dep->qrynode->edges.push_back (dep); + } + dr.close (); +} + + +//------------------------------------------------------------------- clean ---- +//! \brief Clean the graph of all edgelets where isGOOD = false +//! +//! Removes all edgelets from the graph where isGOOD = false. Afterwhich, all +//! now empty edges or nodes are also removed. +//! +//! \return void +//! +void DeltaGraph_t::clean() +{ + map<string, DeltaNode_t>::iterator i; + map<string, DeltaNode_t>::iterator ii; + vector<DeltaEdge_t *>::iterator j; + vector<DeltaEdgelet_t *>::iterator k; + + //-- For all ref nodes + for ( i = refnodes.begin(); i != refnodes.end(); ) + { + //-- For all edges + for ( j = i->second.edges.begin(); + j != i->second.edges.end(); ++ j ) + { + //-- For all edgelets + for ( k = (*j)->edgelets.begin(); + k != (*j)->edgelets.end(); ++ k ) + { + if ( ! (*k)->isGOOD ) + { + //-- Delete the bad edgelet and mark for erasure + delete (*k); + *k = NULL; + } + } + + //-- Erase the marked edgelets + k = partition ((*j)->edgelets.begin(), + (*j)->edgelets.end(), + NULLPred_t()); + (*j)->edgelets.erase (k, (*j)->edgelets.end()); + + //-- Mark the edge if empty + if ( (*j)->edgelets.empty() ) + *j = NULL; + } + + //-- Erase the marked edges + j = partition (i->second.edges.begin(), + i->second.edges.end(), + NULLPred_t()); + i->second.edges.erase (j, i->second.edges.end()); + + //-- Erase the node if empty + ii = i ++; + if ( ii->second.edges.empty() ) + refnodes.erase (ii); + } + + //-- For all qry nodes + for ( i = qrynodes.begin(); i != qrynodes.end(); ) + { + for ( j = i->second.edges.begin(); + j != i->second.edges.end(); ++ j ) + { + //-- Delete the edge if empty and mark for erasure + if ( (*j)->edgelets.empty() ) + { + delete (*j); + *j = NULL; + } + } + + //-- Erase the marked edges + j = partition (i->second.edges.begin(), + i->second.edges.end(), + NULLPred_t()); + i->second.edges.erase (j, i->second.edges.end()); + + //-- Erase the node if empty + ii = i ++; + if ( ii->second.edges.empty() ) + qrynodes.erase (ii); + } + +} + + +//------------------------------------------------------------------- clear ---- +//! \brief Clear the graph of all nodes, edges and edgelets +//! +//! \return void +//! +void DeltaGraph_t::clear ( ) +{ + //-- Make sure the edges only get destructed once + std::map<std::string, DeltaNode_t>::iterator i; + std::vector<DeltaEdge_t *>::iterator j; + for ( i = refnodes . begin( ); i != refnodes . end( ); ++ i ) + for ( j = i -> second . edges . begin( ); + j != i -> second . edges . end( ); ++ j ) + delete (*j); + + refnodes.clear( ); + qrynodes.clear( ); + refpath.erase( ); + qrypath.erase( ); + datatype = NULL_DATA; +} + + +//------------------------------------------------------------ getNodeCount ---- +//! \brief Counts and returns the number of graph nodes (sequences) +//! +//! \return The number of graph nodes +//! +long DeltaGraph_t::getNodeCount() +{ + long sum = refnodes.size() + qrynodes.size(); + return sum; +} + + +//------------------------------------------------------------ getEdgeCount ---- +//! \brief Counts and returns the number of graph edges +//! +//! \return void +//! +long DeltaGraph_t::getEdgeCount() +{ + long sum = 0; + + map<string, DeltaNode_t>::iterator i; + for ( i = refnodes.begin(); i != refnodes.end(); ++ i ) + sum += i->second.edges.size(); + + return sum; +} + + +//--------------------------------------------------------- getEdgeletCount ---- +//! \brief Counts and returns the number of graph edgelets (alignments) +//! +//! \return void +//! +long DeltaGraph_t::getEdgeletCount() +{ + long sum = 0; + + map<string, DeltaNode_t>::iterator i; + vector<DeltaEdge_t *>::iterator j; + for ( i = refnodes.begin(); i != refnodes.end(); ++ i ) + for ( j = i->second.edges.begin(); + j != i->second.edges.end(); ++ j ) + sum += (*j)->edgelets.size(); + + return sum; +} + + +//---------------------------------------------------------------- flagGOOD ---- +//! \brief Sets isGOOD = 1 for all edgelets +void DeltaGraph_t::flagGOOD() +{ + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::iterator eli; + + //-- All references + for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi ) + { + //-- All queries matching this reference + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + { + //-- All alignments between reference and query + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + { + (*eli)->isGOOD = true; + } + } + } +} + + +//---------------------------------------------------------------- flag1to1 ---- +//! \brief Intersection of flagQLIS and flagRLIS +void DeltaGraph_t::flag1to1(float epsilon, float maxolap) +{ + flagRLIS(epsilon, maxolap, false); + flagQLIS(epsilon, maxolap, false); + + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::iterator eli; + + //-- All references + for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi ) + { + //-- All queries matching this reference + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + { + //-- All alignments between reference and query + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + { + if ( !(*eli)->isRLIS || !(*eli)->isQLIS ) + (*eli)->isGOOD = false; + } + } + } +} + + +//---------------------------------------------------------------- flagMtoM ---- +//! \brief Union of flagQLIS and flagRLIS +void DeltaGraph_t::flagMtoM(float epsilon, float maxolap) +{ + flagRLIS(epsilon, maxolap, false); + flagQLIS(epsilon, maxolap, false); + + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::iterator eli; + + //-- All references + for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi ) + { + //-- All queries matching this reference + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + { + //-- All alignments between reference and query + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + { + if ( !(*eli)->isRLIS && !(*eli)->isQLIS ) + (*eli)->isGOOD = false; + } + } + } +} + + +//-------------------------------------------------------------- flagGLIS ------ +//! \brief Flag edgelets in the global LIS and unflag those who are not +//! +//! Runs a length*identity weigthed LIS to determine the longest, mutually +//! consistent set of alignments between all pairs of sequences. This +//! essentially constructs the global alignment between all sequence pairs. +//! +//! Sets isGLIS flag for good and unsets isGOOD flag for bad. +//! +//! \param epsilon Keep repeat alignments within epsilon % of the best align +//! \return void +//! +void DeltaGraph_t::flagGLIS (float epsilon) +{ + LIS_t * lis = NULL; + long lis_size = 0; + long i, j, n; + long olap, olapQ, olapR, len, lenQ, lenR, score, diff; + + vector<DeltaEdgelet_t *> edgelets; + + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::iterator eli; + + + //-- For each reference sequence + for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi ) + { + //-- For each query aligning to this reference + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + { + //-- Collect all the good edgelets + edgelets.clear(); + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + if ( (*eli)->isGOOD ) + { + edgelets.push_back (*eli); + + //-- Fix the coordinates to make global LIS work + if ( (*eli)->dirR == (*eli)->dirQ ) + { + (*eli)->dirQ = FORWARD_DIR; + } + else + { + if ( (*eli)->dirQ == REVERSE_DIR ) + Swap ((*eli)->loQ, (*eli)->hiQ); + (*eli)->loQ = RevC ((*eli)->loQ, (*ei)->qrynode->len); + (*eli)->hiQ = RevC ((*eli)->hiQ, (*ei)->qrynode->len); + (*eli)->dirQ = REVERSE_DIR; + } + } + + //-- Resize and initialize + n = edgelets.size(); + if ( n > lis_size ) + { + lis = (LIS_t *) Safe_realloc (lis, sizeof (LIS_t) * n); + lis_size = n; + } + for ( i = 0; i < n; ++ i ) + lis[i].used = false; + + //-- Sort by lo query coord + sort (edgelets.begin(), edgelets.end(), EdgeletQCmp_t()); + + //-- Continue until all equivalent repeats are extracted + vector<long> allbest; + do + { + //-- Dynamic + for ( i = 0; i < n; ++ i ) + { + if ( lis[i].used ) continue; + + lis[i].a = edgelets[i]; + + lenR = lis[i].a->hiR - lis[i].a->loR + 1; + lenQ = lis[i].a->hiQ - lis[i].a->loQ + 1; + len = lenR > lenQ ? lenQ : lenR; + lis[i].score = ScoreGlobal (0, len, 0, lis[i].a->idy); + + lis[i].from = -1; + lis[i].diff = 0; + + for ( j = 0; j < i; ++ j ) + { + if ( lis[j].used ) continue; + + if ( lis[i].a->dirQ != lis[j].a->dirQ ) + continue; + + lenR = lis[i].a->hiR - lis[i].a->loR + 1; + lenQ = lis[i].a->hiQ - lis[i].a->loQ + 1; + len = lenR > lenQ ? lenQ : lenR; + + olapR = lis[j].a->hiR - lis[i].a->loR + 1; + olapQ = lis[j].a->hiQ - lis[i].a->loQ + 1; + olap = olapR > olapQ ? olapR : olapQ; + if ( olap < 0 ) + olap = 0; + + diff = lis[j].diff + DiffAligns (lis[i].a, lis[j].a); + + score = ScoreGlobal + (lis[j].score, len, olap, lis[i].a->idy); + + if ( score > lis[i].score + || + (score == lis[i].score && diff < lis[i].diff) ) + { + lis[i].from = j; + lis[i].score = score; + lis[i].diff = diff; + } + } + } + } while ( UpdateBest (lis, n, allbest, epsilon) ); + + long beg = PickBest (lis, allbest, epsilon); + long end = allbest.size(); + if ( beg == end ) beg = 0; + else end = beg + 1; + + //-- Flag the edgelets + for ( ; beg < end; ++ beg ) + for ( i = allbest[beg]; i >= 0 && i < n; i = lis[i].from ) + lis[i].a->isGLIS = true; + + //-- Repair the coordinates + for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli ) + { + if ( ! (*eli)->isGLIS ) + (*eli)->isGOOD = false; + + if ( (*eli)->dirQ == FORWARD_DIR ) + { + (*eli)->dirQ = (*eli)->dirR; + } + else + { + if ( (*eli)->dirR == FORWARD_DIR ) + Swap ((*eli)->loQ, (*eli)->hiQ); + (*eli)->loQ = RevC ((*eli)->loQ, (*ei)->qrynode->len); + (*eli)->hiQ = RevC ((*eli)->hiQ, (*ei)->qrynode->len); + (*eli)->dirQ = + (*eli)->dirR == FORWARD_DIR ? REVERSE_DIR : FORWARD_DIR; + } + } + } + } + + free (lis); +} + + +//-------------------------------------------------------------- flagQLIS ------ +//! \brief Flag edgelets in the query LIS and unflag those who are not +//! +//! Runs a length*identity weighted LIS to determine the longest, QUERY +//! consistent set of alignments for all query sequences. This effectively +//! identifies the "best" alignments for all positions of each query. +//! +//! Sets isQLIS flag for good and unsets isGOOD flag for bad. +//! +//! \param epsilon Keep repeat alignments within epsilon % of the best align +//! \param maxolap Only allow alignments to overlap by maxolap percent [0-100] +//! \param flagBad Flag non QLIS edgelets as !isGOOD +//! \return void +//! +void DeltaGraph_t::flagQLIS (float epsilon, float maxolap, bool flagbad) +{ + LIS_t * lis = NULL; + long lis_size = 0; + long i, j, n; + long olap, leni, lenj, score, diff; + + vector<DeltaEdgelet_t *> edgelets; + + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::iterator eli; + + + //-- For each query sequence + for ( mi = qrynodes.begin(); mi != qrynodes.end(); ++ mi ) + { + //-- Collect all the good edgelets + edgelets.clear(); + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + if ( (*eli)->isGOOD ) + edgelets.push_back (*eli); + + //-- Resize and initialize + n = edgelets.size(); + if ( n > lis_size ) + { + lis = (LIS_t *) Safe_realloc (lis, sizeof (LIS_t) * n); + lis_size = n; + } + for ( i = 0; i < n; ++ i ) + lis[i].used = false; + + //-- Sort by lo query coord + sort (edgelets.begin(), edgelets.end(), EdgeletQCmp_t()); + + //-- Continue until all equivalent repeats are extracted + vector<long> allbest; + do + { + //-- Dynamic + for ( i = 0; i < n; ++ i ) + { + if ( lis[i].used ) continue; + + lis[i].a = edgelets[i]; + + leni = lis[i].a->hiQ - lis[i].a->loQ + 1; + lis[i].score = + ScoreLocal (0, leni, 0, 0, lis[i].a->idy, 0); + + lis[i].from = -1; + lis[i].diff = 0; + + for ( j = 0; j < i; ++ j ) + { + if ( lis[j].used ) continue; + if ( lis[j].from >= 0 && + lis[lis[j].from].a->hiQ >= lis[i].a->loQ ) + continue; + + leni = lis[i].a->hiQ - lis[i].a->loQ + 1; + lenj = lis[j].a->hiQ - lis[j].a->loQ + 1; + olap = lis[j].a->hiQ - lis[i].a->loQ + 1; + if ( olap < 0 ) + olap = 0; + + diff = lis[j].diff + DiffAligns (lis[i].a, lis[j].a); + + score = ScoreLocal + (lis[j].score, leni, lenj, olap, lis[i].a->idy, maxolap); + + if ( score > lis[i].score + || + (score == lis[i].score && diff < lis[i].diff) ) + { + lis[i].from = j; + lis[i].score = score; + lis[i].diff = diff; + } + } + } + } while ( UpdateBest (lis, n, allbest, epsilon) ); + + long beg = PickBest (lis, allbest, epsilon); + long end = allbest.size(); + if ( beg == end ) beg = 0; + else end = beg + 1; + + //-- Flag the edgelets + for ( ; beg < end; ++ beg ) + for ( i = allbest[beg]; i >= 0 && i < n; i = lis[i].from ) + lis[i].a->isQLIS = true; + + if ( flagbad ) + for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli ) + if ( ! (*eli)->isQLIS ) + (*eli)->isGOOD = false; + } + + free (lis); +} + + +//-------------------------------------------------------------- flagRLIS ------ +//! \brief Flag edgelets in the reference LIS and unflag those who are not +//! +//! Runs a length*identity weighted LIS to determine the longest, REFERENCE +//! consistent set of alignments for all reference sequences. This effectively +//! identifies the "best" alignments for all positions of each reference. +//! +//! Sets isRLIS flag for good and unsets isGOOD flag for bad. +//! +//! \param epsilon Keep repeat alignments within epsilon % of the best align +//! \param maxolap Only allow alignments to overlap by maxolap percent [0-100] +//! \param flagBad Flag non RLIS edgelets as !isGOOD +//! \return void +//! +void DeltaGraph_t::flagRLIS (float epsilon, float maxolap, bool flagbad) +{ + LIS_t * lis = NULL; + long lis_size = 0; + long i, j, n; + long olap, leni, lenj, score, diff; + + vector<DeltaEdgelet_t *> edgelets; + + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::iterator eli; + + + //-- For each reference sequence + for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi ) + { + //-- Collect all the good edgelets + edgelets.clear(); + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + if ( (*eli)->isGOOD ) + edgelets.push_back (*eli); + + //-- Resize + n = edgelets.size(); + if ( n > lis_size ) + { + lis = (LIS_t *) Safe_realloc (lis, sizeof (LIS_t) * n); + lis_size = n; + } + for ( i = 0; i < n; ++ i ) + lis[i].used = false; + + //-- Sort by lo reference coord + sort (edgelets.begin(), edgelets.end(), EdgeletRCmp_t()); + + //-- Continue until all equivalent repeats are extracted + vector<long> allbest; + do + { + //-- Dynamic + for ( i = 0; i < n; ++ i ) + { + if ( lis[i].used ) continue; + + lis[i].a = edgelets[i]; + + leni = lis[i].a->hiR - lis[i].a->loR + 1; + lis[i].score = + ScoreLocal (0, leni, 0, 0, lis[i].a->idy, 0); + + lis[i].from = -1; + lis[i].diff = 0; + + for ( j = 0; j < i; ++ j ) + { + if ( lis[j].used ) continue; + if ( lis[j].from >= 0 && + lis[lis[j].from].a->hiR >= lis[i].a->loR ) + continue; + + leni = lis[i].a->hiR - lis[i].a->loR + 1; + lenj = lis[j].a->hiR - lis[j].a->loR + 1; + olap = lis[j].a->hiR - lis[i].a->loR + 1; + if ( olap < 0 ) + olap = 0; + + diff = lis[j].diff + DiffAligns (lis[i].a, lis[j].a); + + score = ScoreLocal + (lis[j].score, leni, lenj, olap, lis[i].a->idy, maxolap); + + if ( score > lis[i].score + || + (score == lis[i].score && diff < lis[i].diff) ) + { + lis[i].from = j; + lis[i].score = score; + lis[i].diff = diff; + } + } + } + } while ( UpdateBest (lis, n, allbest, epsilon) ); + + long beg = PickBest (lis, allbest, epsilon); + long end = allbest.size(); + if ( beg == end ) beg = 0; + else end = beg + 1; + + //-- Flag the edgelets + for ( ; beg < end; ++ beg ) + for ( i = allbest[beg]; i >= 0 && i < n; i = lis[i].from ) + lis[i].a->isRLIS = true; + + if ( flagbad ) + for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli ) + if ( ! (*eli)->isRLIS ) + (*eli)->isGOOD = false; + } + + free (lis); +} + + +//------------------------------------------------------------- flagScore ------ +//! \brief Flag edgelets with scores below a score threshold +//! +//! Unsets isGOOD for bad. +//! +//! \param minlen Flag edgelets if less than minlen in length +//! \param minidy Flag edgelets if less than minidy identity [0-100] +//! \return void +//! +void DeltaGraph_t::flagScore (long minlen, float minidy) +{ + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::iterator eli; + + for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi ) + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + if ( (*eli)->isGOOD ) + { + //-- Flag low identities + if ( (*eli)->idy * 100.0 < minidy ) + (*eli)->isGOOD = false; + + //-- Flag small lengths + if ( (*eli)->hiR - (*eli)->loR + 1 < minlen || + (*eli)->hiQ - (*eli)->loQ + 1 < minlen ) + (*eli)->isGOOD = false; + } +} + + +//-------------------------------------------------------------- flagUNIQ ------ +//! \brief Flag edgelets with uniqueness below a certain threshold +//! +//! Unsets isGOOD for bad. +//! +//! \param minuniq Flag edgelets if less that minuniq percent [0-100] unique +//! \return void +//! +void DeltaGraph_t::flagUNIQ (float minuniq) +{ + long i, uniq, len; + + vector<DeltaEdgelet_t *> edgelets; + + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::iterator eli; + + + //-- For each reference sequence + long ref_size = 0; + long ref_len = 0; + unsigned char * ref_cov = NULL; + for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi ) + { + //-- Reset the reference coverage array + ref_len = (mi->second).len; + if ( ref_len > ref_size ) + { + ref_cov = (unsigned char *) Safe_realloc (ref_cov, ref_len + 1); + ref_size = ref_len; + } + for ( i = 1; i <= ref_len; ++ i ) + ref_cov[i] = 0; + + //-- Collect all the good edgelets + edgelets.clear(); + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + if ( (*eli)->isGOOD ) + { + edgelets.push_back (*eli); + + //-- Add to the reference coverage + for ( i = (*eli)->loR; i <= (*eli)->hiR; i ++ ) + if ( ref_cov[i] < UCHAR_MAX ) + ref_cov[i] ++; + } + + //-- Calculate the uniqueness of each edgelet + for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli ) + { + uniq = 0; + len = (*eli)->hiR - (*eli)->loR + 1; + for ( i = (*eli)->loR; i <= (*eli)->hiR; i ++ ) + if ( ref_cov[i] == 1 ) + uniq ++; + + //-- Flag low reference uniqueness + if ( (float)uniq / (float)len * 100.0 < minuniq ) + (*eli)->isGOOD = false; + } + } + free (ref_cov); + + + //-- For each query sequence + long qry_size = 0; + long qry_len = 0; + unsigned char * qry_cov = NULL; + for ( mi = qrynodes.begin(); mi != qrynodes.end(); ++ mi ) + { + //-- Reset the query coverage array + qry_len = (mi->second).len; + if ( qry_len > qry_size ) + { + qry_cov = (unsigned char *) Safe_realloc (qry_cov, qry_len + 1); + qry_size = qry_len; + } + for ( i = 1; i <= qry_len; ++ i ) + qry_cov[i] = 0; + + //-- Collect all the good edgelets + edgelets.clear(); + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + if ( (*eli)->isGOOD ) + { + edgelets.push_back (*eli); + + //-- Add to the query coverage + for ( i = (*eli)->loQ; i <= (*eli)->hiQ; i ++ ) + if ( qry_cov[i] < UCHAR_MAX ) + qry_cov[i] ++; + } + + //-- Calculate the uniqueness of each edgelet + for ( eli = edgelets.begin(); eli != edgelets.end(); ++ eli ) + { + uniq = 0; + len = (*eli)->hiQ - (*eli)->loQ + 1; + for ( i = (*eli)->loQ; i <= (*eli)->hiQ; i ++ ) + if ( qry_cov[i] == 1 ) + uniq ++; + + //-- Flag low query uniqueness + if ( (float)uniq / (float)len * 100.0 < minuniq ) + (*eli)->isGOOD = false; + } + } + free (qry_cov); +} + + +//----------------------------------------------------- loadSequences ---------- +//! \brief Load the sequence information into the DeltaNodes +//! +//! \return void +//! +void DeltaGraph_t::loadSequences () +{ + map<string, DeltaNode_t>::iterator mi; + + FILE * qryfile, * reffile; + char * R = NULL; + char * Q = NULL; + char id [MAX_LINE]; + long initsize; + long len; + + //-- Read in the reference sequences + reffile = File_Open (refpath.c_str(), "r"); + initsize = INIT_SIZE; + R = (char *) Safe_malloc (initsize); + while ( Read_String (reffile, R, initsize, id, FALSE) ) + if ( (mi = refnodes.find (id)) != refnodes.end() ) + { + len = strlen (R + 1); + mi->second.seq = (char *) Safe_malloc (len + 2); + mi->second.seq[0] = '\0'; + strcpy (mi->second.seq + 1, R + 1); + if ( len != mi->second.len ) + { + cerr << "ERROR: Reference input does not match delta file\n"; + exit (EXIT_FAILURE); + } + } + fclose (reffile); + free (R); + + //-- Read in the query sequences + qryfile = File_Open (qrypath.c_str(), "r"); + initsize = INIT_SIZE; + Q = (char *) Safe_malloc (initsize); + while ( Read_String (qryfile, Q, initsize, id, FALSE) ) + if ( (mi = qrynodes.find (id)) != qrynodes.end() ) + { + len = strlen (Q + 1); + mi->second.seq = (char *) Safe_malloc (len + 2); + mi->second.seq[0] = '\0'; + strcpy (mi->second.seq + 1, Q + 1); + if ( len != mi->second.len ) + { + cerr << "ERROR: Query input does not match delta file\n"; + exit (EXIT_FAILURE); + } + } + fclose (qryfile); + free (Q); + + + //-- Check that we found all the sequences + for ( mi = refnodes.begin(); mi != refnodes.end(); ++ mi ) + if ( mi->second.seq == NULL ) + { + cerr << "ERROR: '" << mi->first << "' not found in reference file\n"; + exit (EXIT_FAILURE); + } + + for ( mi = qrynodes.begin(); mi != qrynodes.end(); ++ mi ) + if ( mi->second.seq == NULL ) + { + cerr << "ERROR: '" << mi->first << "' not found in query file\n"; + exit (EXIT_FAILURE); + } +} + + +//----------------------------------------------------- outputDelta ------------ +//! \brief Outputs the contents of the graph as a deltafile +//! +//! \param out The output stream to write to +//! \return The output stream +//! +ostream & DeltaGraph_t::outputDelta (ostream & out) +{ + bool header; + long s1, e1, s2, e2; + + map<string, DeltaNode_t>::const_iterator mi; + vector<DeltaEdge_t *>::const_iterator ei; + vector<DeltaEdgelet_t *>::const_iterator eli; + + //-- Print the file header + cout + << refpath << ' ' << qrypath << '\n' + << (datatype == PROMER_DATA ? PROMER_STRING : NUCMER_STRING) << '\n'; + + for ( mi = qrynodes.begin(); mi != qrynodes.end(); ++ mi ) + { + for ( ei = (mi->second).edges.begin(); + ei != (mi->second).edges.end(); ++ ei ) + { + header = false; + + for ( eli = (*ei)->edgelets.begin(); + eli != (*ei)->edgelets.end(); ++ eli ) + { + if ( ! (*eli)->isGOOD ) + continue; + + //-- Print the sequence header + if ( ! header ) + { + cout + << '>' + << *((*ei)->refnode->id) << ' ' + << *((*ei)->qrynode->id) << ' ' + << (*ei)->refnode->len << ' ' + << (*ei)->qrynode->len << '\n'; + header = true; + } + //-- Print the alignment + s1 = (*eli)->loR; + e1 = (*eli)->hiR; + s2 = (*eli)->loQ; + e2 = (*eli)->hiQ; + if ( (*eli)->dirR == REVERSE_DIR ) + Swap (s1, e1); + if ( (*eli)->dirQ == REVERSE_DIR ) + Swap (s2, e2); + + cout + << s1 << ' ' << e1 << ' ' << s2 << ' ' << e2 << ' ' + << (*eli)->idyc << ' ' + << (*eli)->simc << ' ' + << (*eli)->stpc << '\n' + << (*eli)->delta; + } + } + } + return out; +}