jpayne@69: //----------------------------------------------------------------------------- jpayne@69: // Programmer: Adam M Phillippy, University of Maryland jpayne@69: // File: show-diff.cc jpayne@69: // Date: 12 / 01 / 2006 jpayne@69: // jpayne@69: // Usage: show-diff [options] jpayne@69: // Try 'show-diff -h' for more information jpayne@69: // jpayne@69: //----------------------------------------------------------------------------- jpayne@69: jpayne@69: #include "delta.hh" jpayne@69: #include "tigrinc.hh" jpayne@69: #include jpayne@69: #include jpayne@69: #include jpayne@69: #include jpayne@69: #include jpayne@69: #include jpayne@69: jpayne@69: using namespace std; jpayne@69: jpayne@69: jpayne@69: //================================================================ Options ==== jpayne@69: string OPT_AlignName; // delta file name jpayne@69: bool OPT_AMOS = false; // AMOS output jpayne@69: bool OPT_RefDiff = false; // output reference diff jpayne@69: bool OPT_QryDiff = false; // output query diff jpayne@69: bool OPT_PrintHeader = true; // -H option jpayne@69: jpayne@69: jpayne@69: //=========================================================== Declarations ==== jpayne@69: struct EdgeletLoQCmp_t jpayne@69: //!< Sorts query by lo coord, lo to hi jpayne@69: { jpayne@69: bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const jpayne@69: { return ( i->loQ < j->loQ ); } jpayne@69: }; jpayne@69: jpayne@69: struct EdgeletIdQLoQCmp_t jpayne@69: //!< Sorts query by ID and lo coord, lo to hi jpayne@69: { jpayne@69: bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const jpayne@69: { jpayne@69: if ( i->edge && j->edge ) jpayne@69: { jpayne@69: if ( i->edge->qrynode < j->edge->qrynode ) jpayne@69: return true; jpayne@69: else if ( i->edge->qrynode > j->edge->qrynode ) jpayne@69: return false; jpayne@69: } jpayne@69: else if ( !i->edge && j->edge ) jpayne@69: return true; jpayne@69: else if ( i->edge && !j->edge ) jpayne@69: return false; jpayne@69: return ( i->loQ < j->loQ ); jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: struct EdgeletLoRCmp_t jpayne@69: //!< Sorts reference by lo coord, lo to hi jpayne@69: { jpayne@69: bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const jpayne@69: { return ( i->loR < j->loR ); } jpayne@69: }; jpayne@69: jpayne@69: struct EdgeletIdRLoRCmp_t jpayne@69: //!< Sorts reference by ID and lo coord, lo to hi jpayne@69: { jpayne@69: bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const jpayne@69: { jpayne@69: if ( i->edge && j->edge ) jpayne@69: { jpayne@69: if ( i->edge->refnode < j->edge->refnode ) jpayne@69: return true; jpayne@69: else if ( i->edge->refnode > j->edge->refnode ) jpayne@69: return false; jpayne@69: } jpayne@69: else if ( !i->edge && j->edge ) jpayne@69: return true; jpayne@69: else if ( i->edge && !j->edge ) jpayne@69: return false; jpayne@69: return ( i->loR < j->loR ); jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: jpayne@69: void PrintDiff(DeltaGraph_t & graph); jpayne@69: void PrintBrk(const char* seq, long s, long e); jpayne@69: void PrintSeqJmp(const char* seq, jpayne@69: const char* seqp, const char* seqn, jpayne@69: long s, long e); jpayne@69: void PrintLisJmp(const char* seq, long s, long e); jpayne@69: void PrintInv(const char* seq, long s, long e); jpayne@69: void PrintGap(const char* seq, long s, long e, long gap1, long gap2); jpayne@69: void PrintDup(const char* seq, long s, long e); jpayne@69: void ParseArgs(int argc, char ** argv); jpayne@69: void PrintHelp(const char * s); jpayne@69: void PrintUsage(const char * s); jpayne@69: jpayne@69: jpayne@69: //============================================================ Definitions ==== jpayne@69: //------------------------------------------------------------------- main ---- jpayne@69: int main(int argc, char **argv) jpayne@69: { jpayne@69: DeltaGraph_t graph; jpayne@69: jpayne@69: ParseArgs(argc, argv); jpayne@69: jpayne@69: //-- Get M-to-M alignment, i.e. union of QLIS and RLIS jpayne@69: graph.build(OPT_AlignName, false); jpayne@69: graph.flagMtoM(); jpayne@69: graph.clean(); jpayne@69: jpayne@69: //-- Output diff jpayne@69: PrintDiff(graph); jpayne@69: jpayne@69: return EXIT_SUCCESS; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: //-------------------------------------------------------------- PrintDiff ---- jpayne@69: void PrintDiff(DeltaGraph_t & graph) jpayne@69: { jpayne@69: if ( OPT_PrintHeader ) jpayne@69: { jpayne@69: printf ("%s %s\n%s\n\n", jpayne@69: graph . refpath . c_str( ), graph . qrypath . c_str( ), jpayne@69: graph . datatype == NUCMER_DATA ? "NUCMER" : "PROMER"); jpayne@69: printf ("%s\t%s\t%s\t%s\t%s\n", jpayne@69: "[SEQ]", "[TYPE]", "[S1]", "[E1]", "[LEN 1]"); jpayne@69: } jpayne@69: jpayne@69: const char* refid; jpayne@69: const char* qryid; jpayne@69: long i,j; jpayne@69: long nAligns, gapR, gapQ; jpayne@69: DeltaEdgelet_t lpad, rpad; // padding for the alignment vector jpayne@69: lpad.isRLIS = rpad.isRLIS = true; jpayne@69: lpad.isQLIS = rpad.isQLIS = true; jpayne@69: lpad.loR = lpad.hiR = lpad.loQ = lpad.hiQ = 0; jpayne@69: jpayne@69: DeltaEdgelet_t *A, *PA, *PGA; // alignment, prev, prev global jpayne@69: vector aligns; jpayne@69: jpayne@69: map::const_iterator mi; jpayne@69: vector::const_iterator ei; jpayne@69: vector::iterator eli; jpayne@69: jpayne@69: //-- For each reference sequence jpayne@69: if ( OPT_RefDiff ) jpayne@69: for ( mi = graph.refnodes.begin(); mi != graph.refnodes.end(); ++ mi ) jpayne@69: { jpayne@69: refid = mi->first.c_str(); jpayne@69: jpayne@69: //-- Collect all alignments for this reference sequence jpayne@69: aligns.clear(); jpayne@69: for ( ei = (mi->second).edges.begin(); jpayne@69: ei != (mi->second).edges.end(); ++ei ) jpayne@69: for ( eli = (*ei)->edgelets.begin(); jpayne@69: eli != (*ei)->edgelets.end(); ++eli ) jpayne@69: aligns.push_back(*eli); jpayne@69: jpayne@69: //-- Pad the front and back of the alignment vector jpayne@69: rpad.loR = rpad.hiR = mi->second.len + 1; jpayne@69: rpad.loQ = rpad.hiQ = LONG_MAX; jpayne@69: aligns.push_back(&lpad); jpayne@69: aligns.push_back(&rpad); jpayne@69: jpayne@69: nAligns = aligns.size(); jpayne@69: jpayne@69: //-- OVERRIDE *stpc* value with loQ QLIS ordering jpayne@69: sort(aligns.begin(), aligns.end(), EdgeletIdQLoQCmp_t()); jpayne@69: for ( i = 0, j = 0; i != nAligns; ++i ) jpayne@69: aligns[i]->stpc = aligns[i]->isQLIS ? j++ : -1; jpayne@69: jpayne@69: //-- Sort by reference order jpayne@69: sort(aligns.begin(), aligns.end(), EdgeletLoRCmp_t()); jpayne@69: assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad ); jpayne@69: jpayne@69: //-- Walk reference cover alignments, low to high jpayne@69: PA = PGA = aligns[0]; jpayne@69: for ( i = 1; i != nAligns; ++i ) jpayne@69: { jpayne@69: //-- Only interested in reference covering alignments jpayne@69: if ( !aligns[i]->isRLIS ) continue; jpayne@69: jpayne@69: A = aligns[i]; jpayne@69: gapR = A->loR - PA->hiR - 1; jpayne@69: jpayne@69: //-- Reached end of alignments jpayne@69: if ( A->edge == NULL ) jpayne@69: { jpayne@69: PrintBrk(refid, PA->hiR+1, A->loR-1); jpayne@69: } jpayne@69: //-- 1-to-1 alignment jpayne@69: else if ( A->isQLIS && A->edge == PGA->edge ) jpayne@69: { jpayne@69: //-- Jump within Q jpayne@69: if ( A->slope() != PGA->slope() || jpayne@69: A->stpc != PGA->stpc + PGA->slope() ) jpayne@69: { jpayne@69: if ( A->slope() == PGA->slope() ) jpayne@69: PrintLisJmp(refid, PA->hiR+1, A->loR-1); jpayne@69: else jpayne@69: PrintInv(refid, PA->hiR+1, A->loR-1); jpayne@69: } jpayne@69: //-- Lined up, nothing in between jpayne@69: else if ( PA == PGA ) jpayne@69: { jpayne@69: gapQ = A->isPositive() ? jpayne@69: A->loQ - PGA->hiQ - 1 : jpayne@69: PGA->loQ - A->hiQ - 1; jpayne@69: PrintGap(refid, PA->hiR+1, A->loR-1, gapR, gapQ); jpayne@69: } jpayne@69: //-- Lined up, duplication in between jpayne@69: else jpayne@69: { jpayne@69: PrintBrk(refid, PA->hiR+1, A->loR-1); jpayne@69: } jpayne@69: } jpayne@69: //-- Not in QLIS? Must be a duplication in R jpayne@69: else if ( !A->isQLIS ) jpayne@69: { jpayne@69: PrintBrk(refid, PA->hiR+1, A->loR-1); jpayne@69: PrintDup(refid, A->loR, A->hiR); jpayne@69: } jpayne@69: //-- A->edge != PGA->edge? Jump to different query sequence jpayne@69: else if ( PGA->edge != NULL ) jpayne@69: { jpayne@69: PrintSeqJmp(refid, jpayne@69: PGA->edge->qrynode->id->c_str(), jpayne@69: A->edge->qrynode->id->c_str(), jpayne@69: PA->hiR+1, A->loR-1); jpayne@69: } jpayne@69: //-- Gap before first alignment jpayne@69: else jpayne@69: { jpayne@69: PrintBrk(refid, PA->hiR+1, A->loR-1); jpayne@69: } jpayne@69: jpayne@69: if ( A->isQLIS ) jpayne@69: PGA = A; jpayne@69: PA = A; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: jpayne@69: //---------- WARNING! Same code as above but Q's for R's and R's for Q's jpayne@69: if ( OPT_QryDiff ) jpayne@69: for ( mi = graph.qrynodes.begin(); mi != graph.qrynodes.end(); ++ mi ) jpayne@69: { jpayne@69: qryid = mi->first.c_str(); jpayne@69: jpayne@69: aligns.clear(); jpayne@69: for ( ei = (mi->second).edges.begin(); jpayne@69: ei != (mi->second).edges.end(); ++ei ) jpayne@69: for ( eli = (*ei)->edgelets.begin(); jpayne@69: eli != (*ei)->edgelets.end(); ++eli ) jpayne@69: aligns.push_back(*eli); jpayne@69: jpayne@69: rpad.loQ = rpad.hiQ = mi->second.len + 1; jpayne@69: rpad.loR = rpad.hiR = LONG_MAX; jpayne@69: aligns.push_back(&lpad); jpayne@69: aligns.push_back(&rpad); jpayne@69: jpayne@69: nAligns = aligns.size(); jpayne@69: jpayne@69: sort(aligns.begin(), aligns.end(), EdgeletIdRLoRCmp_t()); jpayne@69: for ( i = 0, j = 0; i != nAligns; ++i ) jpayne@69: aligns[i]->stpc = aligns[i]->isRLIS ? j++ : -1; jpayne@69: jpayne@69: sort(aligns.begin(), aligns.end(), EdgeletLoQCmp_t()); jpayne@69: assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad ); jpayne@69: jpayne@69: PA = PGA = aligns[0]; jpayne@69: for ( i = 1; i != nAligns; ++i ) jpayne@69: { jpayne@69: if ( !aligns[i]->isQLIS ) continue; jpayne@69: jpayne@69: A = aligns[i]; jpayne@69: gapQ = A->loQ - PA->hiQ - 1; jpayne@69: jpayne@69: if ( A->edge == NULL ) jpayne@69: { jpayne@69: PrintBrk(qryid, PA->hiQ+1, A->loQ-1); jpayne@69: } jpayne@69: else if ( A->isRLIS && A->edge == PGA->edge ) jpayne@69: { jpayne@69: if ( A->slope() != PGA->slope() || jpayne@69: A->stpc != PGA->stpc + PGA->slope() ) jpayne@69: { jpayne@69: if ( A->slope() == PGA->slope() ) jpayne@69: PrintLisJmp(qryid, PA->hiQ+1, A->loQ-1); jpayne@69: else jpayne@69: PrintInv(qryid, PA->hiQ+1, A->loQ-1); jpayne@69: } jpayne@69: else if ( PA == PGA ) jpayne@69: { jpayne@69: gapR = A->isPositive() ? jpayne@69: A->loR - PGA->hiR - 1 : jpayne@69: PGA->loR - A->hiR - 1; jpayne@69: PrintGap(qryid, PA->hiQ+1, A->loQ-1, gapQ, gapR); jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: PrintBrk(qryid, PA->hiQ+1, A->loQ-1); jpayne@69: } jpayne@69: } jpayne@69: else if ( !A->isRLIS ) jpayne@69: { jpayne@69: PrintBrk(qryid, PA->hiQ+1, A->loQ-1); jpayne@69: PrintDup(qryid, A->loQ, A->hiQ); jpayne@69: } jpayne@69: else if ( PGA->edge != NULL ) jpayne@69: { jpayne@69: PrintSeqJmp(qryid, jpayne@69: PA->edge->refnode->id->c_str(), jpayne@69: A->edge->refnode->id->c_str(), jpayne@69: PA->hiQ+1, A->loQ-1); jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: PrintBrk(qryid, PA->hiQ+1, A->loQ-1); jpayne@69: } jpayne@69: jpayne@69: if ( A->isRLIS ) jpayne@69: PGA = A; jpayne@69: PA = A; jpayne@69: } jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: jpayne@69: void PrintBrk(const char* seq, long s, long e) jpayne@69: { jpayne@69: if ( e-s+1 <= 0 ) return; jpayne@69: jpayne@69: if ( !OPT_AMOS ) jpayne@69: printf("%s\tBRK\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, e-s+1); jpayne@69: else jpayne@69: printf("%s\tA\t%ld\t%ld\tBRK\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, s, e, e-s+1); jpayne@69: } jpayne@69: jpayne@69: void PrintSeqJmp(const char* seq, jpayne@69: const char* seqp, jpayne@69: const char* seqn, jpayne@69: long s, long e) jpayne@69: { jpayne@69: if ( !OPT_AMOS ) jpayne@69: printf("%s\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n", jpayne@69: seq, s, e, e-s+1, seqp, seqn); jpayne@69: else jpayne@69: printf("%s\tA\t%ld\t%ld\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n", jpayne@69: seq, s, e, s, e, e-s+1, seqp, seqn); jpayne@69: } jpayne@69: jpayne@69: void PrintLisJmp(const char* seq, long s, long e) jpayne@69: { jpayne@69: if ( !OPT_AMOS ) jpayne@69: printf("%s\tJMP\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, e-s+1); jpayne@69: else jpayne@69: printf("%s\tA\t%ld\t%ld\tJMP\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, s, e, e-s+1); jpayne@69: } jpayne@69: jpayne@69: void PrintInv(const char* seq, long s, long e) jpayne@69: { jpayne@69: if ( !OPT_AMOS ) jpayne@69: printf("%s\tINV\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, e-s+1); jpayne@69: else jpayne@69: printf("%s\tA\t%ld\t%ld\tINV\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, s, e, e-s+1); jpayne@69: } jpayne@69: jpayne@69: void PrintGap(const char* seq, long s, long e, long gap1, long gap2) jpayne@69: { jpayne@69: if ( !OPT_AMOS ) jpayne@69: printf("%s\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, gap1, gap2, gap1-gap2); jpayne@69: else jpayne@69: printf("%s\tA\t%ld\t%ld\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, s, e, gap1, gap2, gap1-gap2); jpayne@69: } jpayne@69: jpayne@69: void PrintDup(const char* seq, long s, long e) jpayne@69: { jpayne@69: if ( !OPT_AMOS ) jpayne@69: printf("%s\tDUP\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, e-s+1); jpayne@69: else jpayne@69: printf("%s\tA\t%ld\t%ld\tDUP\t%ld\t%ld\t%ld\n", jpayne@69: seq, s, e, s, e, e-s+1); jpayne@69: } jpayne@69: jpayne@69: jpayne@69: //-------------------------------------------------------------- ParseArgs ---- jpayne@69: void ParseArgs (int argc, char ** argv) jpayne@69: { jpayne@69: int ch, errflg = 0; jpayne@69: optarg = NULL; jpayne@69: jpayne@69: while ( !errflg && jpayne@69: ((ch = getopt (argc, argv, "fhHqr")) != EOF) ) jpayne@69: switch (ch) jpayne@69: { jpayne@69: case 'f': jpayne@69: OPT_AMOS = true; jpayne@69: break; jpayne@69: jpayne@69: case 'h': jpayne@69: PrintHelp (argv[0]); jpayne@69: exit (EXIT_SUCCESS); jpayne@69: break; jpayne@69: jpayne@69: case 'H': jpayne@69: OPT_PrintHeader = false; jpayne@69: break; jpayne@69: jpayne@69: case 'q': jpayne@69: OPT_QryDiff = true; jpayne@69: break; jpayne@69: jpayne@69: case 'r': jpayne@69: OPT_RefDiff = true; jpayne@69: break; jpayne@69: jpayne@69: default: jpayne@69: errflg ++; jpayne@69: } jpayne@69: jpayne@69: if ( OPT_RefDiff && OPT_QryDiff ) jpayne@69: { jpayne@69: cerr << "ERROR: -r and -q options cannot be combined\n"; jpayne@69: errflg++; jpayne@69: } jpayne@69: if ( !OPT_RefDiff && !OPT_QryDiff ) jpayne@69: OPT_RefDiff = true; jpayne@69: jpayne@69: if ( errflg > 0 || optind != argc - 1 ) jpayne@69: { jpayne@69: PrintUsage(argv[0]); jpayne@69: cerr << "Try '" << argv[0] << " -h' for more information.\n"; jpayne@69: exit(EXIT_FAILURE); jpayne@69: } jpayne@69: jpayne@69: OPT_AlignName = argv [optind ++]; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: //-------------------------------------------------------------- PrintHelp ---- jpayne@69: void PrintHelp (const char * s) jpayne@69: { jpayne@69: PrintUsage (s); jpayne@69: cerr jpayne@69: << "-f Output diff information as AMOS features\n" jpayne@69: << "-h Display help information\n" jpayne@69: << "-H Do not show header\n" jpayne@69: << "-q Show diff information for queries\n" jpayne@69: << "-r Show diff information for references (default)\n" jpayne@69: << endl; jpayne@69: jpayne@69: cerr jpayne@69: << " Outputs a list of structural differences for each sequence in\n" jpayne@69: << "the reference and query, sorted by position. For a reference\n" jpayne@69: << "sequence R, and its matching query sequence Q, differences are\n" jpayne@69: << "categorized as GAP (gap between two mutually consistent alignments),\n" jpayne@69: << "DUP (inserted duplication), BRK (other inserted sequence), JMP\n" jpayne@69: << "(rearrangement), INV (rearrangement with inversion), SEQ\n" jpayne@69: << "(rearrangement with another sequence). The first five columns of\n" jpayne@69: << "the output are seq ID, feature type, feature start, feature end,\n" jpayne@69: << "and feature length. Additional columns are added depending on the\n" jpayne@69: << "feature type. Negative feature lengths indicate overlapping adjacent\n" jpayne@69: << "alignment blocks.\n" jpayne@69: << " IDR GAP gap-start gap-end gap-length-R gap-length-Q gap-diff\n" jpayne@69: << " IDR DUP dup-start dup-end dup-length\n" jpayne@69: << " IDR BRK gap-start gap-end gap-length\n" jpayne@69: << " IDR JMP gap-start gap-end gap-length\n" jpayne@69: << " IDR INV gap-start gap-end gap-length\n" jpayne@69: << " IDR SEQ gap-start gap-end gap-length prev-sequence next-sequence\n" jpayne@69: << "Positions always reference the sequence with the given ID. The\n" jpayne@69: << "sum of the fifth column (ignoring negative values) is the total\n" jpayne@69: << "amount of inserted sequence. Summing the fifth column after removing\n" jpayne@69: << "DUP features is total unique inserted sequence. Note that unaligned\n" jpayne@69: << "sequence are not counted, and could represent additional \"unique\"\n" jpayne@69: << "sequences. See documentation for tips on how to interpret these\n" jpayne@69: << "alignment break features.\n" jpayne@69: << endl; jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: //------------------------------------------------------------- PrintUsage ---- jpayne@69: void PrintUsage (const char * s) jpayne@69: { jpayne@69: cerr jpayne@69: << "\nUSAGE: " << s << " [options] \n\n"; jpayne@69: return; jpayne@69: }