annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/prepro.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 //------------------------------------------------------------------------------
jpayne@69 2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
jpayne@69 3 // File: prepro.cc
jpayne@69 4 // Date: 08 / 16 / 2002
jpayne@69 5 //
jpayne@69 6 // Input: Input is a single multi-FASTA sequence file on the command
jpayne@69 7 // line, the command line switch '-r' specifies that the input
jpayne@69 8 // is the reference sequence. This switch tells the program to
jpayne@69 9 // suppress all but the first header and use a different masking
jpayne@69 10 // character than the query sequence. If it is the query sequence
jpayne@69 11 // being input, use the '-q' option. Either -r or -q must be specified.
jpayne@69 12 //
jpayne@69 13 // Output: Output is to stdout, and consists of each sequence in the input
jpayne@69 14 // translated into all six reading frames. All the translations
jpayne@69 15 // for a particular sequence are appended together and each
jpayne@69 16 // seperated by a masking character. If the '-r' flag is specified,
jpayne@69 17 // all but the top header are removed from the output, if the '-q'
jpayne@69 18 // flag is specified each sequence header is left in place but the
jpayne@69 19 // translated frames remain concatenated together without headers.
jpayne@69 20 // In addition to translating the sequences, prepro also does
jpayne@69 21 // a simple masking step that masks amino acids that are
jpayne@69 22 // surrounded on either side with stop codons. The '-m len' option
jpayne@69 23 // allows the user to specify the maximum length of these regions
jpayne@69 24 // to be masked. For example, if '-m 5' were set and the sequence
jpayne@69 25 // "A*AAAAA*A" appeared, the output would be "A*XXXXX*A". The 0 and
jpayne@69 26 // Len+1 indices (although non-existent) are considered as stop codons
jpayne@69 27 // so this masking with -m 5 could turn the beginning of a sequence
jpayne@69 28 // "\0AAAAA*" into "\0XXXXX*" or the end "*AAAAA\0" into "*XXXXX\0".
jpayne@69 29 //
jpayne@69 30 // Usage: prepro [-m len] -r/-q <multi-FASTA>
jpayne@69 31 //
jpayne@69 32 //------------------------------------------------------------------------------
jpayne@69 33
jpayne@69 34 #include "tigrinc.hh"
jpayne@69 35 #include "translate.hh"
jpayne@69 36
jpayne@69 37 //-- Output this many sequence characters per line
jpayne@69 38 #define CHARS_PER_LINE 60
jpayne@69 39
jpayne@69 40 const long int DEFAULT_MASK_LEN = 10;
jpayne@69 41
jpayne@69 42 const char TRANSLATE_MASK = 'X'; // translator masking character
jpayne@69 43 const char REFERENCE_MASK = 'X'; // masking character to use for reference
jpayne@69 44 const char QUERY_MASK = 'O'; // masking character to use for query
jpayne@69 45 const char STOP_MASK = 'J'; // alpha character for stop codons
jpayne@69 46 const char STOP_CHAR = '*'; // stop codon character
jpayne@69 47
jpayne@69 48 inline void mask
jpayne@69 49 (char * A, char mask_ch, long int x, long int y);
jpayne@69 50
jpayne@69 51 void printHelp
jpayne@69 52 (const char *);
jpayne@69 53
jpayne@69 54 void printUsage
jpayne@69 55 (const char *);
jpayne@69 56
jpayne@69 57
jpayne@69 58
jpayne@69 59
jpayne@69 60 int main
jpayne@69 61 (int argc, char * argv[])
jpayne@69 62 {
jpayne@69 63 bool isReference = false;
jpayne@69 64 bool isQuery = false;
jpayne@69 65
jpayne@69 66 int frame;
jpayne@69 67 int mask_len = DEFAULT_MASK_LEN;
jpayne@69 68
jpayne@69 69 long int InitSize, LenA, LentA, ct, i;
jpayne@69 70 long int last_index;
jpayne@69 71
jpayne@69 72 char * A, * tA;
jpayne@69 73 char mask_char = 0;
jpayne@69 74 char Id [MAX_LINE];
jpayne@69 75 char InputFileName [MAX_LINE];
jpayne@69 76
jpayne@69 77 FILE * InputFile;
jpayne@69 78
jpayne@69 79 //-- Parse the command line arguments
jpayne@69 80 {
jpayne@69 81 optarg = NULL;
jpayne@69 82 int ch, errflg = 0;
jpayne@69 83 while ( !errflg && ((ch = getopt (argc, argv, "hm:q:r:")) != EOF) )
jpayne@69 84 switch (ch)
jpayne@69 85 {
jpayne@69 86 case 'h' :
jpayne@69 87 printHelp (argv[0]);
jpayne@69 88 exit (EXIT_SUCCESS);
jpayne@69 89 break;
jpayne@69 90
jpayne@69 91 case 'm' :
jpayne@69 92 mask_len = atoi (optarg);
jpayne@69 93 break;
jpayne@69 94
jpayne@69 95 case 'q' :
jpayne@69 96 strcpy (InputFileName, optarg);
jpayne@69 97 isQuery = true;
jpayne@69 98 mask_char = QUERY_MASK;
jpayne@69 99 break;
jpayne@69 100
jpayne@69 101 case 'r' :
jpayne@69 102 strcpy (InputFileName, optarg);
jpayne@69 103 isReference = true;
jpayne@69 104 mask_char = REFERENCE_MASK;
jpayne@69 105 break;
jpayne@69 106
jpayne@69 107 default :
jpayne@69 108 errflg ++;
jpayne@69 109 }
jpayne@69 110
jpayne@69 111 if ( errflg > 0 || argc - optind != 0 ||
jpayne@69 112 (isReference && isQuery) || (!isReference && !isQuery) )
jpayne@69 113 {
jpayne@69 114 printUsage (argv[0]);
jpayne@69 115 exit (EXIT_FAILURE);
jpayne@69 116 }
jpayne@69 117
jpayne@69 118 if ( mask_len < 0 )
jpayne@69 119 {
jpayne@69 120 fprintf (stderr,
jpayne@69 121 "WARNING: Invalid maximum mask length %d, ignored\n", mask_len);
jpayne@69 122 mask_len = DEFAULT_MASK_LEN;
jpayne@69 123 }
jpayne@69 124 }
jpayne@69 125
jpayne@69 126 InputFile = File_Open (InputFileName, "r");
jpayne@69 127
jpayne@69 128 InitSize = INIT_SIZE;
jpayne@69 129 A = (char *) Safe_malloc ( sizeof(char) * InitSize );
jpayne@69 130 tA = (char *) Safe_malloc ( sizeof(char) );
jpayne@69 131 tA [0] = '\0';
jpayne@69 132
jpayne@69 133 ct = 0;
jpayne@69 134 if ( isReference )
jpayne@69 135 printf (">allcontigs %s\n", InputFileName);
jpayne@69 136 while ( Read_String (InputFile, A, InitSize, Id, FALSE) )
jpayne@69 137 {
jpayne@69 138 LenA = strlen(A + 1);
jpayne@69 139
jpayne@69 140 for ( frame = 1; frame <= 6; frame ++ )
jpayne@69 141 {
jpayne@69 142 if ( isQuery )
jpayne@69 143 printf (">%s.%d\n", Id, frame);
jpayne@69 144
jpayne@69 145 //-- Translate the current frame
jpayne@69 146 tA = (char *) Safe_realloc (tA, sizeof(char) * ( (LenA / 3) + 2) );
jpayne@69 147 LentA = Translate_DNA (A, tA, frame);
jpayne@69 148 tA[++ LentA] = mask_char;
jpayne@69 149
jpayne@69 150 //-- Mask the current frame
jpayne@69 151 last_index = 0;
jpayne@69 152 for ( i = 1; i <= LentA; i ++ )
jpayne@69 153 {
jpayne@69 154 if ( mask_char != TRANSLATE_MASK && tA[i] == TRANSLATE_MASK )
jpayne@69 155 tA[i] = mask_char;
jpayne@69 156 else if ( tA[i] == STOP_CHAR )
jpayne@69 157 {
jpayne@69 158 tA[i] = STOP_MASK;
jpayne@69 159 if ( i - last_index - 1 <= mask_len )
jpayne@69 160 mask (tA, mask_char, last_index + 1, i - 1);
jpayne@69 161 last_index = i;
jpayne@69 162 }
jpayne@69 163 }
jpayne@69 164 if ( LentA - last_index - 1 <= mask_len )
jpayne@69 165 mask (tA, mask_char, last_index + 1, i - 1);
jpayne@69 166
jpayne@69 167 //-- Print the current frame
jpayne@69 168 for ( i = 1; i <= LentA; i ++ )
jpayne@69 169 {
jpayne@69 170 fputc (tA[i], stdout);
jpayne@69 171 if ( ++ ct == CHARS_PER_LINE )
jpayne@69 172 {
jpayne@69 173 ct = 0;
jpayne@69 174 fputc ('\n', stdout);
jpayne@69 175 }
jpayne@69 176 }
jpayne@69 177
jpayne@69 178 if ( isQuery )
jpayne@69 179 {
jpayne@69 180 if ( ct != 0 )
jpayne@69 181 fputc ('\n', stdout);
jpayne@69 182 ct = 0;
jpayne@69 183 }
jpayne@69 184 }
jpayne@69 185 }
jpayne@69 186 if ( ct != 0 )
jpayne@69 187 fputc ('\n', stdout);
jpayne@69 188
jpayne@69 189 fclose(InputFile);
jpayne@69 190
jpayne@69 191 free(A);
jpayne@69 192 free(tA);
jpayne@69 193
jpayne@69 194 return EXIT_SUCCESS;
jpayne@69 195 }
jpayne@69 196
jpayne@69 197
jpayne@69 198
jpayne@69 199
jpayne@69 200 inline void mask
jpayne@69 201 (char * A, char mask_ch, long int x, long int y)
jpayne@69 202
jpayne@69 203 // Mask sequence 'A' with 'mask_ch' from A [x...y] (inclusive)
jpayne@69 204
jpayne@69 205 {
jpayne@69 206 for ( ; x <= y; x ++ )
jpayne@69 207 A[x] = mask_ch;
jpayne@69 208 }
jpayne@69 209
jpayne@69 210
jpayne@69 211
jpayne@69 212
jpayne@69 213 void printHelp
jpayne@69 214 (const char * s)
jpayne@69 215 {
jpayne@69 216 fprintf(stderr,
jpayne@69 217 "\nUSAGE: %s [options] -r/-q <fasta>\n\n", s);
jpayne@69 218 fprintf(stderr,
jpayne@69 219 "-h display help information\n"
jpayne@69 220 "-m len set maximum book-end masking length to 'len\n"
jpayne@69 221 "-q query input is the multi-fasta query file 'query'\n"
jpayne@69 222 "-r reference input is the multi-fasta reference file 'reference'\n\n"
jpayne@69 223 " Input is one multi-fasta sequence file, EITHER '-r reference' OR\n"
jpayne@69 224 "'-q query'. Both are not allowed.\n"
jpayne@69 225 " Output is to stdout, and it consists of each sequence in the\n"
jpayne@69 226 "FASTA file translated in all six reading frames. This output is\n"
jpayne@69 227 "different depending on whether the the input was the reference\n"
jpayne@69 228 "or query sequence, and it is now ready to be passed to 'mummer2'\n"
jpayne@69 229 "for the match finding step.\n\n");
jpayne@69 230 return;
jpayne@69 231 }
jpayne@69 232
jpayne@69 233
jpayne@69 234
jpayne@69 235
jpayne@69 236 void printUsage
jpayne@69 237 (const char * s)
jpayne@69 238 {
jpayne@69 239 fprintf(stderr,
jpayne@69 240 "\nUSAGE: %s [options] -r/-q <fasta>\n\n", s);
jpayne@69 241 fprintf (stderr, "Try '%s -h' for more information.\n", s);
jpayne@69 242 return;
jpayne@69 243 }