diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/prepro.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/prepro.cc	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,243 @@
+//------------------------------------------------------------------------------
+//   Programmer: Adam M Phillippy, The Institute for Genomic Research
+//         File: prepro.cc
+//         Date: 08 / 16 / 2002
+//
+//        Input: Input is a single multi-FASTA sequence file on the command
+//              line, the command line switch '-r' specifies that the input
+//             is the reference sequence. This switch tells the program to
+//            suppress all but the first header and use a different masking
+//           character than the query sequence. If it is the query sequence
+//          being input, use the '-q' option. Either -r or -q must be specified.
+//
+//       Output: Output is to stdout, and consists of each sequence in the input
+//              translated into all six reading frames. All the translations
+//             for a particular sequence are appended together and each
+//            seperated by a masking character. If the '-r' flag is specified,
+//           all but the top header are removed from the output, if the '-q'
+//          flag is specified each sequence header is left in place but the
+//         translated frames remain concatenated together without headers.
+//               In addition to translating the sequences, prepro also does
+//              a simple masking step that masks amino acids that are
+//             surrounded on either side with stop codons. The '-m len' option
+//            allows the user to specify the maximum length of these regions
+//           to be masked. For example, if '-m 5' were set and the sequence
+//          "A*AAAAA*A" appeared, the output would be "A*XXXXX*A". The 0 and
+//         Len+1 indices (although non-existent) are considered as stop codons
+//        so this masking with -m 5 could turn the beginning of a sequence
+//       "\0AAAAA*" into "\0XXXXX*" or the end "*AAAAA\0" into "*XXXXX\0".
+//
+//        Usage: prepro [-m len] -r/-q <multi-FASTA>
+//
+//------------------------------------------------------------------------------
+
+#include "tigrinc.hh"
+#include "translate.hh"
+
+//-- Output this many sequence characters per line
+#define CHARS_PER_LINE 60
+
+const long int DEFAULT_MASK_LEN = 10;
+
+const char TRANSLATE_MASK = 'X';     // translator masking character
+const char REFERENCE_MASK = 'X';     // masking character to use for reference
+const char QUERY_MASK = 'O';         // masking character to use for query
+const char STOP_MASK = 'J';          // alpha character for stop codons
+const char STOP_CHAR = '*';          // stop codon character
+
+inline void mask
+     (char * A, char mask_ch, long int x, long int y);
+
+void printHelp
+     (const char *);
+
+void printUsage
+     (const char *);
+
+
+
+
+int main
+     (int argc, char * argv[])
+{
+  bool isReference = false;
+  bool isQuery = false;
+
+  int frame;
+  int mask_len = DEFAULT_MASK_LEN;
+ 
+  long int InitSize, LenA, LentA, ct, i;
+  long int last_index;
+
+  char * A, * tA;
+  char mask_char = 0;
+  char Id [MAX_LINE];
+  char InputFileName [MAX_LINE];
+
+  FILE * InputFile;
+
+  //-- Parse the command line arguments
+  {
+    optarg = NULL;
+    int ch, errflg = 0;
+    while ( !errflg  &&  ((ch = getopt (argc, argv, "hm:q:r:")) != EOF) )
+      switch (ch)
+        {
+        case 'h' :
+          printHelp (argv[0]);
+          exit (EXIT_SUCCESS);
+          break;
+
+	case 'm' :
+	  mask_len = atoi (optarg);
+	  break;
+
+	case 'q' :
+	  strcpy (InputFileName, optarg);
+	  isQuery = true;
+	  mask_char = QUERY_MASK;
+	  break;
+
+	case 'r' :
+	  strcpy (InputFileName, optarg);
+	  isReference = true;
+	  mask_char = REFERENCE_MASK;
+	  break;
+
+        default :
+          errflg ++;
+        }
+    
+    if ( errflg > 0 || argc - optind != 0 ||
+	 (isReference  &&  isQuery)  ||  (!isReference  &&  !isQuery) )
+      {
+        printUsage (argv[0]);
+        exit (EXIT_FAILURE);
+      }
+
+    if ( mask_len < 0 )
+      {
+	fprintf (stderr,
+	  "WARNING: Invalid maximum mask length %d, ignored\n", mask_len);
+	mask_len = DEFAULT_MASK_LEN;
+      }
+  }
+
+  InputFile = File_Open (InputFileName, "r");
+
+  InitSize = INIT_SIZE;
+  A = (char *) Safe_malloc ( sizeof(char) * InitSize );
+  tA = (char *) Safe_malloc ( sizeof(char) );
+  tA [0] = '\0';
+  
+  ct = 0;
+  if ( isReference )
+    printf (">allcontigs %s\n", InputFileName);
+  while ( Read_String (InputFile, A, InitSize, Id, FALSE) )
+    {
+      LenA = strlen(A + 1);
+
+      for ( frame = 1; frame <= 6; frame ++ )
+	{
+	  if ( isQuery )
+	    printf (">%s.%d\n", Id, frame);
+
+	  //-- Translate the current frame
+	  tA = (char *) Safe_realloc (tA, sizeof(char) * ( (LenA / 3) + 2) );
+	  LentA = Translate_DNA (A, tA, frame);
+	  tA[++ LentA] = mask_char;
+	  
+	  //-- Mask the current frame
+	  last_index = 0;
+	  for ( i = 1; i <= LentA; i ++ )
+	    {
+	      if ( mask_char != TRANSLATE_MASK && tA[i] == TRANSLATE_MASK )
+		tA[i] = mask_char;
+	      else if ( tA[i] == STOP_CHAR )
+		{
+		  tA[i] = STOP_MASK;
+		  if ( i - last_index - 1 <= mask_len )
+		    mask (tA, mask_char, last_index + 1, i - 1);
+		  last_index = i;
+		}
+	    }
+	  if ( LentA - last_index - 1 <= mask_len )
+	    mask (tA, mask_char, last_index + 1, i - 1);
+	  
+	  //-- Print the current frame
+	  for ( i = 1; i <= LentA; i ++ )
+	    {
+	      fputc (tA[i], stdout);
+	      if ( ++ ct == CHARS_PER_LINE )
+		{
+		  ct = 0;
+		  fputc ('\n', stdout);
+		}
+	    }
+
+	  if ( isQuery )
+	    {
+	      if ( ct != 0 )
+		fputc ('\n', stdout);
+	      ct = 0;
+	    }
+	}
+    }
+  if ( ct != 0 )
+    fputc ('\n', stdout);
+
+  fclose(InputFile);
+
+  free(A);
+  free(tA);
+
+  return EXIT_SUCCESS;
+}
+
+
+
+
+inline void mask
+     (char * A, char mask_ch, long int x, long int y)
+
+  //  Mask sequence 'A' with 'mask_ch' from A [x...y] (inclusive)
+
+{
+  for ( ; x <= y; x ++ )
+    A[x] = mask_ch;
+}
+
+
+
+
+void printHelp
+     (const char * s)
+{
+  fprintf(stderr,
+      "\nUSAGE: %s  [options]  -r/-q <fasta>\n\n", s);
+  fprintf(stderr,
+      "-h            display help information\n"
+      "-m len        set maximum book-end masking length to 'len\n" 
+      "-q query      input is the multi-fasta query file 'query'\n"
+      "-r reference  input is the multi-fasta reference file 'reference'\n\n"
+      "  Input is one multi-fasta sequence file, EITHER '-r reference' OR\n"
+      "'-q query'. Both are not allowed.\n"
+      "  Output is to stdout, and it consists of each sequence in the\n"
+      "FASTA file translated in all six reading frames. This output is\n"
+      "different depending on whether the the input was the reference\n"
+      "or query sequence, and it is now ready to be passed to 'mummer2'\n"
+      "for the match finding step.\n\n");
+  return;
+}
+
+
+
+
+void printUsage
+     (const char * s)
+{
+  fprintf(stderr,
+      "\nUSAGE: %s  [options]  -r/-q <fasta>\n\n", s);
+  fprintf (stderr, "Try '%s -h' for more information.\n", s);
+  return;
+}