diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-diff.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-diff.cc	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,503 @@
+//-----------------------------------------------------------------------------
+//   Programmer: Adam M Phillippy, University of Maryland
+//         File: show-diff.cc
+//         Date: 12 / 01 / 2006
+//
+//        Usage: show-diff [options] <deltafile>
+//               Try 'show-diff -h' for more information
+//
+//-----------------------------------------------------------------------------
+
+#include "delta.hh"
+#include "tigrinc.hh"
+#include <string>
+#include <cstdlib>
+#include <cassert>
+#include <climits>
+#include <algorithm>
+#include <vector>
+
+using namespace std;
+
+
+//================================================================ Options ====
+string  OPT_AlignName;            // delta file name
+bool    OPT_AMOS    = false;      // AMOS output
+bool    OPT_RefDiff = false;      // output reference diff
+bool    OPT_QryDiff = false;      // output query diff
+bool    OPT_PrintHeader = true;   // -H option
+
+
+//=========================================================== Declarations ====
+struct EdgeletLoQCmp_t
+//!< Sorts query by lo coord, lo to hi
+{
+  bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
+  { return ( i->loQ < j->loQ ); }
+};
+
+struct EdgeletIdQLoQCmp_t
+//!< Sorts query by ID and lo coord, lo to hi
+{
+  bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
+  {
+    if ( i->edge && j->edge )
+      {
+        if ( i->edge->qrynode < j->edge->qrynode )
+          return true;
+        else if ( i->edge->qrynode > j->edge->qrynode )
+          return false;
+      }
+    else if ( !i->edge && j->edge )
+      return true;
+    else if ( i->edge && !j->edge )
+      return false;
+    return ( i->loQ < j->loQ );
+  }
+};
+
+struct EdgeletLoRCmp_t
+//!< Sorts reference by lo coord, lo to hi
+{
+  bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
+  { return ( i->loR < j->loR ); }
+};
+
+struct EdgeletIdRLoRCmp_t
+//!< Sorts reference by ID and lo coord, lo to hi
+{
+  bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
+  {
+    if ( i->edge && j->edge )
+      {
+        if ( i->edge->refnode < j->edge->refnode )
+          return true;
+        else if ( i->edge->refnode > j->edge->refnode )
+          return false;
+      }
+    else if ( !i->edge && j->edge )
+      return true;
+    else if ( i->edge && !j->edge )
+      return false;
+    return ( i->loR < j->loR );
+  }
+};
+
+
+void PrintDiff(DeltaGraph_t & graph);
+void PrintBrk(const char* seq, long s, long e);
+void PrintSeqJmp(const char* seq,
+                 const char* seqp, const char* seqn,
+                 long s, long e);
+void PrintLisJmp(const char* seq, long s, long e);
+void PrintInv(const char* seq, long s, long e);
+void PrintGap(const char* seq, long s, long e, long gap1, long gap2);
+void PrintDup(const char* seq, long s, long e);
+void ParseArgs(int argc, char ** argv);
+void PrintHelp(const char * s);
+void PrintUsage(const char * s);
+
+
+//============================================================ Definitions ====
+//------------------------------------------------------------------- main ----
+int main(int argc, char **argv)
+{
+  DeltaGraph_t graph;
+
+  ParseArgs(argc, argv);
+
+  //-- Get M-to-M alignment, i.e. union of QLIS and RLIS
+  graph.build(OPT_AlignName, false);
+  graph.flagMtoM();
+  graph.clean();
+
+  //-- Output diff
+  PrintDiff(graph);
+
+  return EXIT_SUCCESS;
+}
+
+
+//-------------------------------------------------------------- PrintDiff ----
+void PrintDiff(DeltaGraph_t & graph)
+{
+  if ( OPT_PrintHeader )
+    {
+      printf ("%s %s\n%s\n\n",
+              graph . refpath . c_str( ), graph . qrypath . c_str( ),
+              graph . datatype == NUCMER_DATA ? "NUCMER" : "PROMER");
+      printf ("%s\t%s\t%s\t%s\t%s\n",
+              "[SEQ]", "[TYPE]", "[S1]", "[E1]", "[LEN 1]");
+    }
+
+  const char* refid;
+  const char* qryid;
+  long i,j;
+  long nAligns, gapR, gapQ;
+  DeltaEdgelet_t lpad, rpad;          // padding for the alignment vector
+  lpad.isRLIS = rpad.isRLIS = true;
+  lpad.isQLIS = rpad.isQLIS = true;
+  lpad.loR = lpad.hiR = lpad.loQ = lpad.hiQ = 0;
+
+  DeltaEdgelet_t *A, *PA, *PGA;       // alignment, prev, prev global
+  vector<DeltaEdgelet_t *> aligns;
+
+  map<string, DeltaNode_t>::const_iterator mi;
+  vector<DeltaEdge_t *>::const_iterator ei;
+  vector<DeltaEdgelet_t *>::iterator eli;
+
+  //-- For each reference sequence
+  if ( OPT_RefDiff )
+    for ( mi = graph.refnodes.begin(); mi != graph.refnodes.end(); ++ mi )
+      {
+        refid = mi->first.c_str();
+
+        //-- Collect all alignments for this reference sequence
+        aligns.clear();
+        for ( ei  = (mi->second).edges.begin();
+              ei != (mi->second).edges.end(); ++ei )
+          for ( eli  = (*ei)->edgelets.begin();
+                eli != (*ei)->edgelets.end(); ++eli )
+            aligns.push_back(*eli);
+
+        //-- Pad the front and back of the alignment vector
+        rpad.loR = rpad.hiR = mi->second.len + 1;
+        rpad.loQ = rpad.hiQ = LONG_MAX;
+        aligns.push_back(&lpad);
+        aligns.push_back(&rpad);
+
+        nAligns = aligns.size();
+
+        //-- OVERRIDE *stpc* value with loQ QLIS ordering
+        sort(aligns.begin(), aligns.end(), EdgeletIdQLoQCmp_t());
+        for ( i = 0, j = 0; i != nAligns; ++i )
+          aligns[i]->stpc = aligns[i]->isQLIS ? j++ : -1;
+
+        //-- Sort by reference order
+        sort(aligns.begin(), aligns.end(), EdgeletLoRCmp_t());
+        assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad );
+
+        //-- Walk reference cover alignments, low to high
+        PA = PGA = aligns[0];
+        for ( i = 1; i != nAligns; ++i )
+          {
+            //-- Only interested in reference covering alignments
+            if ( !aligns[i]->isRLIS ) continue;
+
+            A = aligns[i];
+            gapR = A->loR - PA->hiR - 1;
+
+            //-- Reached end of alignments
+            if ( A->edge == NULL )
+              {
+                PrintBrk(refid, PA->hiR+1, A->loR-1);
+              }
+            //-- 1-to-1 alignment
+            else if ( A->isQLIS && A->edge == PGA->edge )
+              {
+                //-- Jump within Q
+                if ( A->slope() != PGA->slope() ||
+                     A->stpc != PGA->stpc + PGA->slope() )
+                  {
+                    if ( A->slope() == PGA->slope() )
+                      PrintLisJmp(refid, PA->hiR+1, A->loR-1);
+                    else
+                      PrintInv(refid, PA->hiR+1, A->loR-1);
+                  }
+                //-- Lined up, nothing in between
+                else if ( PA == PGA )
+                  {
+                    gapQ = A->isPositive() ?
+                      A->loQ - PGA->hiQ - 1 :
+                      PGA->loQ - A->hiQ - 1;
+                    PrintGap(refid, PA->hiR+1, A->loR-1, gapR, gapQ);
+                  }
+                //-- Lined up, duplication in between
+                else
+                  {
+                    PrintBrk(refid, PA->hiR+1, A->loR-1);
+                  }
+              }
+            //-- Not in QLIS? Must be a duplication in R
+            else if ( !A->isQLIS )
+              {
+                PrintBrk(refid, PA->hiR+1, A->loR-1);
+                PrintDup(refid, A->loR, A->hiR);
+              }
+            //-- A->edge != PGA->edge? Jump to different query sequence
+            else if ( PGA->edge != NULL )
+              {
+                PrintSeqJmp(refid,
+                            PGA->edge->qrynode->id->c_str(),
+                            A->edge->qrynode->id->c_str(),
+                            PA->hiR+1, A->loR-1);
+              }
+            //-- Gap before first alignment
+            else
+              {
+                PrintBrk(refid, PA->hiR+1, A->loR-1);
+              }
+
+            if ( A->isQLIS )
+              PGA = A;
+            PA = A;
+          }
+      }
+
+
+  //---------- WARNING! Same code as above but Q's for R's and R's for Q's
+  if ( OPT_QryDiff )
+    for ( mi = graph.qrynodes.begin(); mi != graph.qrynodes.end(); ++ mi )
+      {
+        qryid = mi->first.c_str();
+
+        aligns.clear();
+        for ( ei  = (mi->second).edges.begin();
+              ei != (mi->second).edges.end(); ++ei )
+          for ( eli  = (*ei)->edgelets.begin();
+                eli != (*ei)->edgelets.end(); ++eli )
+            aligns.push_back(*eli);
+
+        rpad.loQ = rpad.hiQ = mi->second.len + 1;
+        rpad.loR = rpad.hiR = LONG_MAX;
+        aligns.push_back(&lpad);
+        aligns.push_back(&rpad);
+
+        nAligns = aligns.size();
+
+        sort(aligns.begin(), aligns.end(), EdgeletIdRLoRCmp_t());
+        for ( i = 0, j = 0; i != nAligns; ++i )
+          aligns[i]->stpc = aligns[i]->isRLIS ? j++ : -1;
+
+        sort(aligns.begin(), aligns.end(), EdgeletLoQCmp_t());
+        assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad );
+
+        PA = PGA = aligns[0];
+        for ( i = 1; i != nAligns; ++i )
+          {
+            if ( !aligns[i]->isQLIS ) continue;
+
+            A = aligns[i];
+            gapQ = A->loQ - PA->hiQ - 1;
+
+            if ( A->edge == NULL )
+              {
+                PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
+              }
+            else if ( A->isRLIS && A->edge == PGA->edge )
+              {
+                if ( A->slope() != PGA->slope() ||
+                     A->stpc != PGA->stpc + PGA->slope() )
+                  {
+                    if ( A->slope() == PGA->slope() )
+                      PrintLisJmp(qryid, PA->hiQ+1, A->loQ-1);
+                    else
+                      PrintInv(qryid, PA->hiQ+1, A->loQ-1);
+                  }
+                else if ( PA == PGA )
+                  {
+                    gapR = A->isPositive() ?
+                      A->loR - PGA->hiR - 1 :
+                      PGA->loR - A->hiR - 1;
+                    PrintGap(qryid, PA->hiQ+1, A->loQ-1, gapQ, gapR);
+                  }
+                else
+                  {
+                    PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
+                  }
+              }
+            else if ( !A->isRLIS )
+              {
+                PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
+                PrintDup(qryid, A->loQ, A->hiQ);
+              }
+            else if ( PGA->edge != NULL )
+              {
+                PrintSeqJmp(qryid,
+                            PA->edge->refnode->id->c_str(),
+                            A->edge->refnode->id->c_str(),
+                            PA->hiQ+1, A->loQ-1);
+              }
+            else
+              {
+                PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
+              }
+
+            if ( A->isRLIS )
+              PGA = A;
+            PA = A;
+          }
+      }
+}
+
+
+void PrintBrk(const char* seq, long s, long e)
+{
+  if ( e-s+1 <= 0 ) return;
+
+  if ( !OPT_AMOS )
+    printf("%s\tBRK\t%ld\t%ld\t%ld\n",
+           seq, s, e, e-s+1);
+  else
+    printf("%s\tA\t%ld\t%ld\tBRK\t%ld\t%ld\t%ld\n",
+           seq, s, e, s, e, e-s+1);
+}
+
+void PrintSeqJmp(const char* seq,
+                 const char* seqp,
+                 const char* seqn,
+                 long s, long e)
+{
+  if ( !OPT_AMOS )
+    printf("%s\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n",
+           seq, s, e, e-s+1, seqp, seqn);
+  else
+    printf("%s\tA\t%ld\t%ld\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n",
+           seq, s, e, s, e, e-s+1, seqp, seqn);
+}
+
+void PrintLisJmp(const char* seq, long s, long e)
+{
+  if ( !OPT_AMOS )
+    printf("%s\tJMP\t%ld\t%ld\t%ld\n",
+           seq, s, e, e-s+1);
+  else
+    printf("%s\tA\t%ld\t%ld\tJMP\t%ld\t%ld\t%ld\n",
+           seq, s, e, s, e, e-s+1);
+}
+
+void PrintInv(const char* seq, long s, long e)
+{
+  if ( !OPT_AMOS )
+    printf("%s\tINV\t%ld\t%ld\t%ld\n",
+           seq, s, e, e-s+1);
+  else
+    printf("%s\tA\t%ld\t%ld\tINV\t%ld\t%ld\t%ld\n",
+           seq, s, e, s, e, e-s+1);
+}
+
+void PrintGap(const char* seq, long s, long e, long gap1, long gap2)
+{
+  if ( !OPT_AMOS )
+    printf("%s\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n",
+           seq, s, e, gap1, gap2, gap1-gap2);
+  else
+    printf("%s\tA\t%ld\t%ld\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n",
+           seq, s, e, s, e, gap1, gap2, gap1-gap2);
+}
+
+void PrintDup(const char* seq, long s, long e)
+{
+  if ( !OPT_AMOS )
+    printf("%s\tDUP\t%ld\t%ld\t%ld\n",
+           seq, s, e, e-s+1);
+  else
+    printf("%s\tA\t%ld\t%ld\tDUP\t%ld\t%ld\t%ld\n",
+           seq, s, e, s, e, e-s+1);
+}
+
+
+//-------------------------------------------------------------- ParseArgs ----
+void ParseArgs (int argc, char ** argv)
+{
+  int ch, errflg = 0;
+  optarg = NULL;
+
+  while ( !errflg  &&
+          ((ch = getopt (argc, argv, "fhHqr")) != EOF) )
+    switch (ch)
+      {
+      case 'f':
+        OPT_AMOS = true;
+        break;
+
+      case 'h':
+        PrintHelp (argv[0]);
+        exit (EXIT_SUCCESS);
+        break;
+
+      case 'H':
+        OPT_PrintHeader = false;
+        break;
+
+      case 'q':
+        OPT_QryDiff = true;
+        break;
+        
+      case 'r':
+        OPT_RefDiff = true;
+        break;
+
+      default:
+        errflg ++;
+      }
+
+  if ( OPT_RefDiff && OPT_QryDiff )
+    {
+      cerr << "ERROR: -r and -q options cannot be combined\n";
+      errflg++;
+    }
+  if ( !OPT_RefDiff && !OPT_QryDiff )
+    OPT_RefDiff = true;
+
+  if ( errflg > 0  ||  optind != argc - 1 )
+    {
+      PrintUsage(argv[0]);
+      cerr << "Try '" << argv[0] << " -h' for more information.\n";
+      exit(EXIT_FAILURE);
+    }
+
+  OPT_AlignName = argv [optind ++];
+}
+
+
+//-------------------------------------------------------------- PrintHelp ----
+void PrintHelp (const char * s)
+{
+  PrintUsage (s);
+  cerr
+    << "-f            Output diff information as AMOS features\n"
+    << "-h            Display help information\n"
+    << "-H            Do not show header\n"
+    << "-q            Show diff information for queries\n"
+    << "-r            Show diff information for references (default)\n"
+    << endl;
+
+  cerr
+    << "  Outputs a list of structural differences for each sequence in\n"
+    << "the reference and query, sorted by position. For a reference\n"
+    << "sequence R, and its matching query sequence Q, differences are\n"
+    << "categorized as GAP (gap between two mutually consistent alignments),\n"
+    << "DUP (inserted duplication), BRK (other inserted sequence), JMP\n"
+    << "(rearrangement), INV (rearrangement with inversion), SEQ\n"
+    << "(rearrangement with another sequence). The first five columns of\n"
+    << "the output are seq ID, feature type, feature start, feature end,\n"
+    << "and feature length. Additional columns are added depending on the\n"
+    << "feature type. Negative feature lengths indicate overlapping adjacent\n"
+    << "alignment blocks.\n"
+    << "  IDR GAP gap-start gap-end gap-length-R gap-length-Q gap-diff\n"
+    << "  IDR DUP dup-start dup-end dup-length\n"
+    << "  IDR BRK gap-start gap-end gap-length\n"
+    << "  IDR JMP gap-start gap-end gap-length\n"
+    << "  IDR INV gap-start gap-end gap-length\n"
+    << "  IDR SEQ gap-start gap-end gap-length prev-sequence next-sequence\n"
+    << "Positions always reference the sequence with the given ID. The\n"
+    << "sum of the fifth column (ignoring negative values) is the total\n"
+    << "amount of inserted sequence. Summing the fifth column after removing\n"
+    << "DUP features is total unique inserted sequence. Note that unaligned\n"
+    << "sequence are not counted, and could represent additional \"unique\"\n"
+    << "sequences. See documentation for tips on how to interpret these\n"
+    << "alignment break features.\n"
+    << endl;
+
+  return;
+}
+
+
+//------------------------------------------------------------- PrintUsage ----
+void PrintUsage (const char * s)
+{
+  cerr
+    << "\nUSAGE: " << s << "  [options]  <deltafile>\n\n";
+  return;
+}