Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/prepro.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/prepro.cc Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,243 @@ +//------------------------------------------------------------------------------ +// Programmer: Adam M Phillippy, The Institute for Genomic Research +// File: prepro.cc +// Date: 08 / 16 / 2002 +// +// Input: Input is a single multi-FASTA sequence file on the command +// line, the command line switch '-r' specifies that the input +// is the reference sequence. This switch tells the program to +// suppress all but the first header and use a different masking +// character than the query sequence. If it is the query sequence +// being input, use the '-q' option. Either -r or -q must be specified. +// +// Output: Output is to stdout, and consists of each sequence in the input +// translated into all six reading frames. All the translations +// for a particular sequence are appended together and each +// seperated by a masking character. If the '-r' flag is specified, +// all but the top header are removed from the output, if the '-q' +// flag is specified each sequence header is left in place but the +// translated frames remain concatenated together without headers. +// In addition to translating the sequences, prepro also does +// a simple masking step that masks amino acids that are +// surrounded on either side with stop codons. The '-m len' option +// allows the user to specify the maximum length of these regions +// to be masked. For example, if '-m 5' were set and the sequence +// "A*AAAAA*A" appeared, the output would be "A*XXXXX*A". The 0 and +// Len+1 indices (although non-existent) are considered as stop codons +// so this masking with -m 5 could turn the beginning of a sequence +// "\0AAAAA*" into "\0XXXXX*" or the end "*AAAAA\0" into "*XXXXX\0". +// +// Usage: prepro [-m len] -r/-q <multi-FASTA> +// +//------------------------------------------------------------------------------ + +#include "tigrinc.hh" +#include "translate.hh" + +//-- Output this many sequence characters per line +#define CHARS_PER_LINE 60 + +const long int DEFAULT_MASK_LEN = 10; + +const char TRANSLATE_MASK = 'X'; // translator masking character +const char REFERENCE_MASK = 'X'; // masking character to use for reference +const char QUERY_MASK = 'O'; // masking character to use for query +const char STOP_MASK = 'J'; // alpha character for stop codons +const char STOP_CHAR = '*'; // stop codon character + +inline void mask + (char * A, char mask_ch, long int x, long int y); + +void printHelp + (const char *); + +void printUsage + (const char *); + + + + +int main + (int argc, char * argv[]) +{ + bool isReference = false; + bool isQuery = false; + + int frame; + int mask_len = DEFAULT_MASK_LEN; + + long int InitSize, LenA, LentA, ct, i; + long int last_index; + + char * A, * tA; + char mask_char = 0; + char Id [MAX_LINE]; + char InputFileName [MAX_LINE]; + + FILE * InputFile; + + //-- Parse the command line arguments + { + optarg = NULL; + int ch, errflg = 0; + while ( !errflg && ((ch = getopt (argc, argv, "hm:q:r:")) != EOF) ) + switch (ch) + { + case 'h' : + printHelp (argv[0]); + exit (EXIT_SUCCESS); + break; + + case 'm' : + mask_len = atoi (optarg); + break; + + case 'q' : + strcpy (InputFileName, optarg); + isQuery = true; + mask_char = QUERY_MASK; + break; + + case 'r' : + strcpy (InputFileName, optarg); + isReference = true; + mask_char = REFERENCE_MASK; + break; + + default : + errflg ++; + } + + if ( errflg > 0 || argc - optind != 0 || + (isReference && isQuery) || (!isReference && !isQuery) ) + { + printUsage (argv[0]); + exit (EXIT_FAILURE); + } + + if ( mask_len < 0 ) + { + fprintf (stderr, + "WARNING: Invalid maximum mask length %d, ignored\n", mask_len); + mask_len = DEFAULT_MASK_LEN; + } + } + + InputFile = File_Open (InputFileName, "r"); + + InitSize = INIT_SIZE; + A = (char *) Safe_malloc ( sizeof(char) * InitSize ); + tA = (char *) Safe_malloc ( sizeof(char) ); + tA [0] = '\0'; + + ct = 0; + if ( isReference ) + printf (">allcontigs %s\n", InputFileName); + while ( Read_String (InputFile, A, InitSize, Id, FALSE) ) + { + LenA = strlen(A + 1); + + for ( frame = 1; frame <= 6; frame ++ ) + { + if ( isQuery ) + printf (">%s.%d\n", Id, frame); + + //-- Translate the current frame + tA = (char *) Safe_realloc (tA, sizeof(char) * ( (LenA / 3) + 2) ); + LentA = Translate_DNA (A, tA, frame); + tA[++ LentA] = mask_char; + + //-- Mask the current frame + last_index = 0; + for ( i = 1; i <= LentA; i ++ ) + { + if ( mask_char != TRANSLATE_MASK && tA[i] == TRANSLATE_MASK ) + tA[i] = mask_char; + else if ( tA[i] == STOP_CHAR ) + { + tA[i] = STOP_MASK; + if ( i - last_index - 1 <= mask_len ) + mask (tA, mask_char, last_index + 1, i - 1); + last_index = i; + } + } + if ( LentA - last_index - 1 <= mask_len ) + mask (tA, mask_char, last_index + 1, i - 1); + + //-- Print the current frame + for ( i = 1; i <= LentA; i ++ ) + { + fputc (tA[i], stdout); + if ( ++ ct == CHARS_PER_LINE ) + { + ct = 0; + fputc ('\n', stdout); + } + } + + if ( isQuery ) + { + if ( ct != 0 ) + fputc ('\n', stdout); + ct = 0; + } + } + } + if ( ct != 0 ) + fputc ('\n', stdout); + + fclose(InputFile); + + free(A); + free(tA); + + return EXIT_SUCCESS; +} + + + + +inline void mask + (char * A, char mask_ch, long int x, long int y) + + // Mask sequence 'A' with 'mask_ch' from A [x...y] (inclusive) + +{ + for ( ; x <= y; x ++ ) + A[x] = mask_ch; +} + + + + +void printHelp + (const char * s) +{ + fprintf(stderr, + "\nUSAGE: %s [options] -r/-q <fasta>\n\n", s); + fprintf(stderr, + "-h display help information\n" + "-m len set maximum book-end masking length to 'len\n" + "-q query input is the multi-fasta query file 'query'\n" + "-r reference input is the multi-fasta reference file 'reference'\n\n" + " Input is one multi-fasta sequence file, EITHER '-r reference' OR\n" + "'-q query'. Both are not allowed.\n" + " Output is to stdout, and it consists of each sequence in the\n" + "FASTA file translated in all six reading frames. This output is\n" + "different depending on whether the the input was the reference\n" + "or query sequence, and it is now ready to be passed to 'mummer2'\n" + "for the match finding step.\n\n"); + return; +} + + + + +void printUsage + (const char * s) +{ + fprintf(stderr, + "\nUSAGE: %s [options] -r/-q <fasta>\n\n", s); + fprintf (stderr, "Try '%s -h' for more information.\n", s); + return; +}