jpayne@69: //------------------------------------------------------------------------------ jpayne@69: // Programmer: Adam M Phillippy, The Institute for Genomic Research jpayne@69: // File: prepro.cc jpayne@69: // Date: 08 / 16 / 2002 jpayne@69: // jpayne@69: // Input: Input is a single multi-FASTA sequence file on the command jpayne@69: // line, the command line switch '-r' specifies that the input jpayne@69: // is the reference sequence. This switch tells the program to jpayne@69: // suppress all but the first header and use a different masking jpayne@69: // character than the query sequence. If it is the query sequence jpayne@69: // being input, use the '-q' option. Either -r or -q must be specified. jpayne@69: // jpayne@69: // Output: Output is to stdout, and consists of each sequence in the input jpayne@69: // translated into all six reading frames. All the translations jpayne@69: // for a particular sequence are appended together and each jpayne@69: // seperated by a masking character. If the '-r' flag is specified, jpayne@69: // all but the top header are removed from the output, if the '-q' jpayne@69: // flag is specified each sequence header is left in place but the jpayne@69: // translated frames remain concatenated together without headers. jpayne@69: // In addition to translating the sequences, prepro also does jpayne@69: // a simple masking step that masks amino acids that are jpayne@69: // surrounded on either side with stop codons. The '-m len' option jpayne@69: // allows the user to specify the maximum length of these regions jpayne@69: // to be masked. For example, if '-m 5' were set and the sequence jpayne@69: // "A*AAAAA*A" appeared, the output would be "A*XXXXX*A". The 0 and jpayne@69: // Len+1 indices (although non-existent) are considered as stop codons jpayne@69: // so this masking with -m 5 could turn the beginning of a sequence jpayne@69: // "\0AAAAA*" into "\0XXXXX*" or the end "*AAAAA\0" into "*XXXXX\0". jpayne@69: // jpayne@69: // Usage: prepro [-m len] -r/-q jpayne@69: // jpayne@69: //------------------------------------------------------------------------------ jpayne@69: jpayne@69: #include "tigrinc.hh" jpayne@69: #include "translate.hh" jpayne@69: jpayne@69: //-- Output this many sequence characters per line jpayne@69: #define CHARS_PER_LINE 60 jpayne@69: jpayne@69: const long int DEFAULT_MASK_LEN = 10; jpayne@69: jpayne@69: const char TRANSLATE_MASK = 'X'; // translator masking character jpayne@69: const char REFERENCE_MASK = 'X'; // masking character to use for reference jpayne@69: const char QUERY_MASK = 'O'; // masking character to use for query jpayne@69: const char STOP_MASK = 'J'; // alpha character for stop codons jpayne@69: const char STOP_CHAR = '*'; // stop codon character jpayne@69: jpayne@69: inline void mask jpayne@69: (char * A, char mask_ch, long int x, long int y); jpayne@69: jpayne@69: void printHelp jpayne@69: (const char *); jpayne@69: jpayne@69: void printUsage jpayne@69: (const char *); jpayne@69: jpayne@69: jpayne@69: jpayne@69: jpayne@69: int main jpayne@69: (int argc, char * argv[]) jpayne@69: { jpayne@69: bool isReference = false; jpayne@69: bool isQuery = false; jpayne@69: jpayne@69: int frame; jpayne@69: int mask_len = DEFAULT_MASK_LEN; jpayne@69: jpayne@69: long int InitSize, LenA, LentA, ct, i; jpayne@69: long int last_index; jpayne@69: jpayne@69: char * A, * tA; jpayne@69: char mask_char = 0; jpayne@69: char Id [MAX_LINE]; jpayne@69: char InputFileName [MAX_LINE]; jpayne@69: jpayne@69: FILE * InputFile; jpayne@69: jpayne@69: //-- Parse the command line arguments jpayne@69: { jpayne@69: optarg = NULL; jpayne@69: int ch, errflg = 0; jpayne@69: while ( !errflg && ((ch = getopt (argc, argv, "hm:q:r:")) != EOF) ) jpayne@69: switch (ch) jpayne@69: { jpayne@69: case 'h' : jpayne@69: printHelp (argv[0]); jpayne@69: exit (EXIT_SUCCESS); jpayne@69: break; jpayne@69: jpayne@69: case 'm' : jpayne@69: mask_len = atoi (optarg); jpayne@69: break; jpayne@69: jpayne@69: case 'q' : jpayne@69: strcpy (InputFileName, optarg); jpayne@69: isQuery = true; jpayne@69: mask_char = QUERY_MASK; jpayne@69: break; jpayne@69: jpayne@69: case 'r' : jpayne@69: strcpy (InputFileName, optarg); jpayne@69: isReference = true; jpayne@69: mask_char = REFERENCE_MASK; jpayne@69: break; jpayne@69: jpayne@69: default : jpayne@69: errflg ++; jpayne@69: } jpayne@69: jpayne@69: if ( errflg > 0 || argc - optind != 0 || jpayne@69: (isReference && isQuery) || (!isReference && !isQuery) ) jpayne@69: { jpayne@69: printUsage (argv[0]); jpayne@69: exit (EXIT_FAILURE); jpayne@69: } jpayne@69: jpayne@69: if ( mask_len < 0 ) jpayne@69: { jpayne@69: fprintf (stderr, jpayne@69: "WARNING: Invalid maximum mask length %d, ignored\n", mask_len); jpayne@69: mask_len = DEFAULT_MASK_LEN; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: InputFile = File_Open (InputFileName, "r"); jpayne@69: jpayne@69: InitSize = INIT_SIZE; jpayne@69: A = (char *) Safe_malloc ( sizeof(char) * InitSize ); jpayne@69: tA = (char *) Safe_malloc ( sizeof(char) ); jpayne@69: tA [0] = '\0'; jpayne@69: jpayne@69: ct = 0; jpayne@69: if ( isReference ) jpayne@69: printf (">allcontigs %s\n", InputFileName); jpayne@69: while ( Read_String (InputFile, A, InitSize, Id, FALSE) ) jpayne@69: { jpayne@69: LenA = strlen(A + 1); jpayne@69: jpayne@69: for ( frame = 1; frame <= 6; frame ++ ) jpayne@69: { jpayne@69: if ( isQuery ) jpayne@69: printf (">%s.%d\n", Id, frame); jpayne@69: jpayne@69: //-- Translate the current frame jpayne@69: tA = (char *) Safe_realloc (tA, sizeof(char) * ( (LenA / 3) + 2) ); jpayne@69: LentA = Translate_DNA (A, tA, frame); jpayne@69: tA[++ LentA] = mask_char; jpayne@69: jpayne@69: //-- Mask the current frame jpayne@69: last_index = 0; jpayne@69: for ( i = 1; i <= LentA; i ++ ) jpayne@69: { jpayne@69: if ( mask_char != TRANSLATE_MASK && tA[i] == TRANSLATE_MASK ) jpayne@69: tA[i] = mask_char; jpayne@69: else if ( tA[i] == STOP_CHAR ) jpayne@69: { jpayne@69: tA[i] = STOP_MASK; jpayne@69: if ( i - last_index - 1 <= mask_len ) jpayne@69: mask (tA, mask_char, last_index + 1, i - 1); jpayne@69: last_index = i; jpayne@69: } jpayne@69: } jpayne@69: if ( LentA - last_index - 1 <= mask_len ) jpayne@69: mask (tA, mask_char, last_index + 1, i - 1); jpayne@69: jpayne@69: //-- Print the current frame jpayne@69: for ( i = 1; i <= LentA; i ++ ) jpayne@69: { jpayne@69: fputc (tA[i], stdout); jpayne@69: if ( ++ ct == CHARS_PER_LINE ) jpayne@69: { jpayne@69: ct = 0; jpayne@69: fputc ('\n', stdout); jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: if ( isQuery ) jpayne@69: { jpayne@69: if ( ct != 0 ) jpayne@69: fputc ('\n', stdout); jpayne@69: ct = 0; jpayne@69: } jpayne@69: } jpayne@69: } jpayne@69: if ( ct != 0 ) jpayne@69: fputc ('\n', stdout); jpayne@69: jpayne@69: fclose(InputFile); jpayne@69: jpayne@69: free(A); jpayne@69: free(tA); jpayne@69: jpayne@69: return EXIT_SUCCESS; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: jpayne@69: inline void mask jpayne@69: (char * A, char mask_ch, long int x, long int y) jpayne@69: jpayne@69: // Mask sequence 'A' with 'mask_ch' from A [x...y] (inclusive) jpayne@69: jpayne@69: { jpayne@69: for ( ; x <= y; x ++ ) jpayne@69: A[x] = mask_ch; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: jpayne@69: void printHelp jpayne@69: (const char * s) jpayne@69: { jpayne@69: fprintf(stderr, jpayne@69: "\nUSAGE: %s [options] -r/-q \n\n", s); jpayne@69: fprintf(stderr, jpayne@69: "-h display help information\n" jpayne@69: "-m len set maximum book-end masking length to 'len\n" jpayne@69: "-q query input is the multi-fasta query file 'query'\n" jpayne@69: "-r reference input is the multi-fasta reference file 'reference'\n\n" jpayne@69: " Input is one multi-fasta sequence file, EITHER '-r reference' OR\n" jpayne@69: "'-q query'. Both are not allowed.\n" jpayne@69: " Output is to stdout, and it consists of each sequence in the\n" jpayne@69: "FASTA file translated in all six reading frames. This output is\n" jpayne@69: "different depending on whether the the input was the reference\n" jpayne@69: "or query sequence, and it is now ready to be passed to 'mummer2'\n" jpayne@69: "for the match finding step.\n\n"); jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: jpayne@69: void printUsage jpayne@69: (const char * s) jpayne@69: { jpayne@69: fprintf(stderr, jpayne@69: "\nUSAGE: %s [options] -r/-q \n\n", s); jpayne@69: fprintf (stderr, "Try '%s -h' for more information.\n", s); jpayne@69: return; jpayne@69: }