view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/show-diff.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
line wrap: on
line source
//-----------------------------------------------------------------------------
//   Programmer: Adam M Phillippy, University of Maryland
//         File: show-diff.cc
//         Date: 12 / 01 / 2006
//
//        Usage: show-diff [options] <deltafile>
//               Try 'show-diff -h' for more information
//
//-----------------------------------------------------------------------------

#include "delta.hh"
#include "tigrinc.hh"
#include <string>
#include <cstdlib>
#include <cassert>
#include <climits>
#include <algorithm>
#include <vector>

using namespace std;


//================================================================ Options ====
string  OPT_AlignName;            // delta file name
bool    OPT_AMOS    = false;      // AMOS output
bool    OPT_RefDiff = false;      // output reference diff
bool    OPT_QryDiff = false;      // output query diff
bool    OPT_PrintHeader = true;   // -H option


//=========================================================== Declarations ====
struct EdgeletLoQCmp_t
//!< Sorts query by lo coord, lo to hi
{
  bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
  { return ( i->loQ < j->loQ ); }
};

struct EdgeletIdQLoQCmp_t
//!< Sorts query by ID and lo coord, lo to hi
{
  bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
  {
    if ( i->edge && j->edge )
      {
        if ( i->edge->qrynode < j->edge->qrynode )
          return true;
        else if ( i->edge->qrynode > j->edge->qrynode )
          return false;
      }
    else if ( !i->edge && j->edge )
      return true;
    else if ( i->edge && !j->edge )
      return false;
    return ( i->loQ < j->loQ );
  }
};

struct EdgeletLoRCmp_t
//!< Sorts reference by lo coord, lo to hi
{
  bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
  { return ( i->loR < j->loR ); }
};

struct EdgeletIdRLoRCmp_t
//!< Sorts reference by ID and lo coord, lo to hi
{
  bool operator()(const DeltaEdgelet_t * i, const DeltaEdgelet_t * j) const
  {
    if ( i->edge && j->edge )
      {
        if ( i->edge->refnode < j->edge->refnode )
          return true;
        else if ( i->edge->refnode > j->edge->refnode )
          return false;
      }
    else if ( !i->edge && j->edge )
      return true;
    else if ( i->edge && !j->edge )
      return false;
    return ( i->loR < j->loR );
  }
};


void PrintDiff(DeltaGraph_t & graph);
void PrintBrk(const char* seq, long s, long e);
void PrintSeqJmp(const char* seq,
                 const char* seqp, const char* seqn,
                 long s, long e);
void PrintLisJmp(const char* seq, long s, long e);
void PrintInv(const char* seq, long s, long e);
void PrintGap(const char* seq, long s, long e, long gap1, long gap2);
void PrintDup(const char* seq, long s, long e);
void ParseArgs(int argc, char ** argv);
void PrintHelp(const char * s);
void PrintUsage(const char * s);


//============================================================ Definitions ====
//------------------------------------------------------------------- main ----
int main(int argc, char **argv)
{
  DeltaGraph_t graph;

  ParseArgs(argc, argv);

  //-- Get M-to-M alignment, i.e. union of QLIS and RLIS
  graph.build(OPT_AlignName, false);
  graph.flagMtoM();
  graph.clean();

  //-- Output diff
  PrintDiff(graph);

  return EXIT_SUCCESS;
}


//-------------------------------------------------------------- PrintDiff ----
void PrintDiff(DeltaGraph_t & graph)
{
  if ( OPT_PrintHeader )
    {
      printf ("%s %s\n%s\n\n",
              graph . refpath . c_str( ), graph . qrypath . c_str( ),
              graph . datatype == NUCMER_DATA ? "NUCMER" : "PROMER");
      printf ("%s\t%s\t%s\t%s\t%s\n",
              "[SEQ]", "[TYPE]", "[S1]", "[E1]", "[LEN 1]");
    }

  const char* refid;
  const char* qryid;
  long i,j;
  long nAligns, gapR, gapQ;
  DeltaEdgelet_t lpad, rpad;          // padding for the alignment vector
  lpad.isRLIS = rpad.isRLIS = true;
  lpad.isQLIS = rpad.isQLIS = true;
  lpad.loR = lpad.hiR = lpad.loQ = lpad.hiQ = 0;

  DeltaEdgelet_t *A, *PA, *PGA;       // alignment, prev, prev global
  vector<DeltaEdgelet_t *> aligns;

  map<string, DeltaNode_t>::const_iterator mi;
  vector<DeltaEdge_t *>::const_iterator ei;
  vector<DeltaEdgelet_t *>::iterator eli;

  //-- For each reference sequence
  if ( OPT_RefDiff )
    for ( mi = graph.refnodes.begin(); mi != graph.refnodes.end(); ++ mi )
      {
        refid = mi->first.c_str();

        //-- Collect all alignments for this reference sequence
        aligns.clear();
        for ( ei  = (mi->second).edges.begin();
              ei != (mi->second).edges.end(); ++ei )
          for ( eli  = (*ei)->edgelets.begin();
                eli != (*ei)->edgelets.end(); ++eli )
            aligns.push_back(*eli);

        //-- Pad the front and back of the alignment vector
        rpad.loR = rpad.hiR = mi->second.len + 1;
        rpad.loQ = rpad.hiQ = LONG_MAX;
        aligns.push_back(&lpad);
        aligns.push_back(&rpad);

        nAligns = aligns.size();

        //-- OVERRIDE *stpc* value with loQ QLIS ordering
        sort(aligns.begin(), aligns.end(), EdgeletIdQLoQCmp_t());
        for ( i = 0, j = 0; i != nAligns; ++i )
          aligns[i]->stpc = aligns[i]->isQLIS ? j++ : -1;

        //-- Sort by reference order
        sort(aligns.begin(), aligns.end(), EdgeletLoRCmp_t());
        assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad );

        //-- Walk reference cover alignments, low to high
        PA = PGA = aligns[0];
        for ( i = 1; i != nAligns; ++i )
          {
            //-- Only interested in reference covering alignments
            if ( !aligns[i]->isRLIS ) continue;

            A = aligns[i];
            gapR = A->loR - PA->hiR - 1;

            //-- Reached end of alignments
            if ( A->edge == NULL )
              {
                PrintBrk(refid, PA->hiR+1, A->loR-1);
              }
            //-- 1-to-1 alignment
            else if ( A->isQLIS && A->edge == PGA->edge )
              {
                //-- Jump within Q
                if ( A->slope() != PGA->slope() ||
                     A->stpc != PGA->stpc + PGA->slope() )
                  {
                    if ( A->slope() == PGA->slope() )
                      PrintLisJmp(refid, PA->hiR+1, A->loR-1);
                    else
                      PrintInv(refid, PA->hiR+1, A->loR-1);
                  }
                //-- Lined up, nothing in between
                else if ( PA == PGA )
                  {
                    gapQ = A->isPositive() ?
                      A->loQ - PGA->hiQ - 1 :
                      PGA->loQ - A->hiQ - 1;
                    PrintGap(refid, PA->hiR+1, A->loR-1, gapR, gapQ);
                  }
                //-- Lined up, duplication in between
                else
                  {
                    PrintBrk(refid, PA->hiR+1, A->loR-1);
                  }
              }
            //-- Not in QLIS? Must be a duplication in R
            else if ( !A->isQLIS )
              {
                PrintBrk(refid, PA->hiR+1, A->loR-1);
                PrintDup(refid, A->loR, A->hiR);
              }
            //-- A->edge != PGA->edge? Jump to different query sequence
            else if ( PGA->edge != NULL )
              {
                PrintSeqJmp(refid,
                            PGA->edge->qrynode->id->c_str(),
                            A->edge->qrynode->id->c_str(),
                            PA->hiR+1, A->loR-1);
              }
            //-- Gap before first alignment
            else
              {
                PrintBrk(refid, PA->hiR+1, A->loR-1);
              }

            if ( A->isQLIS )
              PGA = A;
            PA = A;
          }
      }


  //---------- WARNING! Same code as above but Q's for R's and R's for Q's
  if ( OPT_QryDiff )
    for ( mi = graph.qrynodes.begin(); mi != graph.qrynodes.end(); ++ mi )
      {
        qryid = mi->first.c_str();

        aligns.clear();
        for ( ei  = (mi->second).edges.begin();
              ei != (mi->second).edges.end(); ++ei )
          for ( eli  = (*ei)->edgelets.begin();
                eli != (*ei)->edgelets.end(); ++eli )
            aligns.push_back(*eli);

        rpad.loQ = rpad.hiQ = mi->second.len + 1;
        rpad.loR = rpad.hiR = LONG_MAX;
        aligns.push_back(&lpad);
        aligns.push_back(&rpad);

        nAligns = aligns.size();

        sort(aligns.begin(), aligns.end(), EdgeletIdRLoRCmp_t());
        for ( i = 0, j = 0; i != nAligns; ++i )
          aligns[i]->stpc = aligns[i]->isRLIS ? j++ : -1;

        sort(aligns.begin(), aligns.end(), EdgeletLoQCmp_t());
        assert ( aligns[0] == &lpad && aligns[nAligns-1] == &rpad );

        PA = PGA = aligns[0];
        for ( i = 1; i != nAligns; ++i )
          {
            if ( !aligns[i]->isQLIS ) continue;

            A = aligns[i];
            gapQ = A->loQ - PA->hiQ - 1;

            if ( A->edge == NULL )
              {
                PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
              }
            else if ( A->isRLIS && A->edge == PGA->edge )
              {
                if ( A->slope() != PGA->slope() ||
                     A->stpc != PGA->stpc + PGA->slope() )
                  {
                    if ( A->slope() == PGA->slope() )
                      PrintLisJmp(qryid, PA->hiQ+1, A->loQ-1);
                    else
                      PrintInv(qryid, PA->hiQ+1, A->loQ-1);
                  }
                else if ( PA == PGA )
                  {
                    gapR = A->isPositive() ?
                      A->loR - PGA->hiR - 1 :
                      PGA->loR - A->hiR - 1;
                    PrintGap(qryid, PA->hiQ+1, A->loQ-1, gapQ, gapR);
                  }
                else
                  {
                    PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
                  }
              }
            else if ( !A->isRLIS )
              {
                PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
                PrintDup(qryid, A->loQ, A->hiQ);
              }
            else if ( PGA->edge != NULL )
              {
                PrintSeqJmp(qryid,
                            PA->edge->refnode->id->c_str(),
                            A->edge->refnode->id->c_str(),
                            PA->hiQ+1, A->loQ-1);
              }
            else
              {
                PrintBrk(qryid, PA->hiQ+1, A->loQ-1);
              }

            if ( A->isRLIS )
              PGA = A;
            PA = A;
          }
      }
}


void PrintBrk(const char* seq, long s, long e)
{
  if ( e-s+1 <= 0 ) return;

  if ( !OPT_AMOS )
    printf("%s\tBRK\t%ld\t%ld\t%ld\n",
           seq, s, e, e-s+1);
  else
    printf("%s\tA\t%ld\t%ld\tBRK\t%ld\t%ld\t%ld\n",
           seq, s, e, s, e, e-s+1);
}

void PrintSeqJmp(const char* seq,
                 const char* seqp,
                 const char* seqn,
                 long s, long e)
{
  if ( !OPT_AMOS )
    printf("%s\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n",
           seq, s, e, e-s+1, seqp, seqn);
  else
    printf("%s\tA\t%ld\t%ld\tSEQ\t%ld\t%ld\t%ld\t%s\t%s\n",
           seq, s, e, s, e, e-s+1, seqp, seqn);
}

void PrintLisJmp(const char* seq, long s, long e)
{
  if ( !OPT_AMOS )
    printf("%s\tJMP\t%ld\t%ld\t%ld\n",
           seq, s, e, e-s+1);
  else
    printf("%s\tA\t%ld\t%ld\tJMP\t%ld\t%ld\t%ld\n",
           seq, s, e, s, e, e-s+1);
}

void PrintInv(const char* seq, long s, long e)
{
  if ( !OPT_AMOS )
    printf("%s\tINV\t%ld\t%ld\t%ld\n",
           seq, s, e, e-s+1);
  else
    printf("%s\tA\t%ld\t%ld\tINV\t%ld\t%ld\t%ld\n",
           seq, s, e, s, e, e-s+1);
}

void PrintGap(const char* seq, long s, long e, long gap1, long gap2)
{
  if ( !OPT_AMOS )
    printf("%s\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n",
           seq, s, e, gap1, gap2, gap1-gap2);
  else
    printf("%s\tA\t%ld\t%ld\tGAP\t%ld\t%ld\t%ld\t%ld\t%ld\n",
           seq, s, e, s, e, gap1, gap2, gap1-gap2);
}

void PrintDup(const char* seq, long s, long e)
{
  if ( !OPT_AMOS )
    printf("%s\tDUP\t%ld\t%ld\t%ld\n",
           seq, s, e, e-s+1);
  else
    printf("%s\tA\t%ld\t%ld\tDUP\t%ld\t%ld\t%ld\n",
           seq, s, e, s, e, e-s+1);
}


//-------------------------------------------------------------- ParseArgs ----
void ParseArgs (int argc, char ** argv)
{
  int ch, errflg = 0;
  optarg = NULL;

  while ( !errflg  &&
          ((ch = getopt (argc, argv, "fhHqr")) != EOF) )
    switch (ch)
      {
      case 'f':
        OPT_AMOS = true;
        break;

      case 'h':
        PrintHelp (argv[0]);
        exit (EXIT_SUCCESS);
        break;

      case 'H':
        OPT_PrintHeader = false;
        break;

      case 'q':
        OPT_QryDiff = true;
        break;
        
      case 'r':
        OPT_RefDiff = true;
        break;

      default:
        errflg ++;
      }

  if ( OPT_RefDiff && OPT_QryDiff )
    {
      cerr << "ERROR: -r and -q options cannot be combined\n";
      errflg++;
    }
  if ( !OPT_RefDiff && !OPT_QryDiff )
    OPT_RefDiff = true;

  if ( errflg > 0  ||  optind != argc - 1 )
    {
      PrintUsage(argv[0]);
      cerr << "Try '" << argv[0] << " -h' for more information.\n";
      exit(EXIT_FAILURE);
    }

  OPT_AlignName = argv [optind ++];
}


//-------------------------------------------------------------- PrintHelp ----
void PrintHelp (const char * s)
{
  PrintUsage (s);
  cerr
    << "-f            Output diff information as AMOS features\n"
    << "-h            Display help information\n"
    << "-H            Do not show header\n"
    << "-q            Show diff information for queries\n"
    << "-r            Show diff information for references (default)\n"
    << endl;

  cerr
    << "  Outputs a list of structural differences for each sequence in\n"
    << "the reference and query, sorted by position. For a reference\n"
    << "sequence R, and its matching query sequence Q, differences are\n"
    << "categorized as GAP (gap between two mutually consistent alignments),\n"
    << "DUP (inserted duplication), BRK (other inserted sequence), JMP\n"
    << "(rearrangement), INV (rearrangement with inversion), SEQ\n"
    << "(rearrangement with another sequence). The first five columns of\n"
    << "the output are seq ID, feature type, feature start, feature end,\n"
    << "and feature length. Additional columns are added depending on the\n"
    << "feature type. Negative feature lengths indicate overlapping adjacent\n"
    << "alignment blocks.\n"
    << "  IDR GAP gap-start gap-end gap-length-R gap-length-Q gap-diff\n"
    << "  IDR DUP dup-start dup-end dup-length\n"
    << "  IDR BRK gap-start gap-end gap-length\n"
    << "  IDR JMP gap-start gap-end gap-length\n"
    << "  IDR INV gap-start gap-end gap-length\n"
    << "  IDR SEQ gap-start gap-end gap-length prev-sequence next-sequence\n"
    << "Positions always reference the sequence with the given ID. The\n"
    << "sum of the fifth column (ignoring negative values) is the total\n"
    << "amount of inserted sequence. Summing the fifth column after removing\n"
    << "DUP features is total unique inserted sequence. Note that unaligned\n"
    << "sequence are not counted, and could represent additional \"unique\"\n"
    << "sequences. See documentation for tips on how to interpret these\n"
    << "alignment break features.\n"
    << endl;

  return;
}


//------------------------------------------------------------- PrintUsage ----
void PrintUsage (const char * s)
{
  cerr
    << "\nUSAGE: " << s << "  [options]  <deltafile>\n\n";
  return;
}