jpayne@69
|
1 //------------------------------------------------------------------------------
|
jpayne@69
|
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
|
jpayne@69
|
3 // File: prepro.cc
|
jpayne@69
|
4 // Date: 08 / 16 / 2002
|
jpayne@69
|
5 //
|
jpayne@69
|
6 // Input: Input is a single multi-FASTA sequence file on the command
|
jpayne@69
|
7 // line, the command line switch '-r' specifies that the input
|
jpayne@69
|
8 // is the reference sequence. This switch tells the program to
|
jpayne@69
|
9 // suppress all but the first header and use a different masking
|
jpayne@69
|
10 // character than the query sequence. If it is the query sequence
|
jpayne@69
|
11 // being input, use the '-q' option. Either -r or -q must be specified.
|
jpayne@69
|
12 //
|
jpayne@69
|
13 // Output: Output is to stdout, and consists of each sequence in the input
|
jpayne@69
|
14 // translated into all six reading frames. All the translations
|
jpayne@69
|
15 // for a particular sequence are appended together and each
|
jpayne@69
|
16 // seperated by a masking character. If the '-r' flag is specified,
|
jpayne@69
|
17 // all but the top header are removed from the output, if the '-q'
|
jpayne@69
|
18 // flag is specified each sequence header is left in place but the
|
jpayne@69
|
19 // translated frames remain concatenated together without headers.
|
jpayne@69
|
20 // In addition to translating the sequences, prepro also does
|
jpayne@69
|
21 // a simple masking step that masks amino acids that are
|
jpayne@69
|
22 // surrounded on either side with stop codons. The '-m len' option
|
jpayne@69
|
23 // allows the user to specify the maximum length of these regions
|
jpayne@69
|
24 // to be masked. For example, if '-m 5' were set and the sequence
|
jpayne@69
|
25 // "A*AAAAA*A" appeared, the output would be "A*XXXXX*A". The 0 and
|
jpayne@69
|
26 // Len+1 indices (although non-existent) are considered as stop codons
|
jpayne@69
|
27 // so this masking with -m 5 could turn the beginning of a sequence
|
jpayne@69
|
28 // "\0AAAAA*" into "\0XXXXX*" or the end "*AAAAA\0" into "*XXXXX\0".
|
jpayne@69
|
29 //
|
jpayne@69
|
30 // Usage: prepro [-m len] -r/-q <multi-FASTA>
|
jpayne@69
|
31 //
|
jpayne@69
|
32 //------------------------------------------------------------------------------
|
jpayne@69
|
33
|
jpayne@69
|
34 #include "tigrinc.hh"
|
jpayne@69
|
35 #include "translate.hh"
|
jpayne@69
|
36
|
jpayne@69
|
37 //-- Output this many sequence characters per line
|
jpayne@69
|
38 #define CHARS_PER_LINE 60
|
jpayne@69
|
39
|
jpayne@69
|
40 const long int DEFAULT_MASK_LEN = 10;
|
jpayne@69
|
41
|
jpayne@69
|
42 const char TRANSLATE_MASK = 'X'; // translator masking character
|
jpayne@69
|
43 const char REFERENCE_MASK = 'X'; // masking character to use for reference
|
jpayne@69
|
44 const char QUERY_MASK = 'O'; // masking character to use for query
|
jpayne@69
|
45 const char STOP_MASK = 'J'; // alpha character for stop codons
|
jpayne@69
|
46 const char STOP_CHAR = '*'; // stop codon character
|
jpayne@69
|
47
|
jpayne@69
|
48 inline void mask
|
jpayne@69
|
49 (char * A, char mask_ch, long int x, long int y);
|
jpayne@69
|
50
|
jpayne@69
|
51 void printHelp
|
jpayne@69
|
52 (const char *);
|
jpayne@69
|
53
|
jpayne@69
|
54 void printUsage
|
jpayne@69
|
55 (const char *);
|
jpayne@69
|
56
|
jpayne@69
|
57
|
jpayne@69
|
58
|
jpayne@69
|
59
|
jpayne@69
|
60 int main
|
jpayne@69
|
61 (int argc, char * argv[])
|
jpayne@69
|
62 {
|
jpayne@69
|
63 bool isReference = false;
|
jpayne@69
|
64 bool isQuery = false;
|
jpayne@69
|
65
|
jpayne@69
|
66 int frame;
|
jpayne@69
|
67 int mask_len = DEFAULT_MASK_LEN;
|
jpayne@69
|
68
|
jpayne@69
|
69 long int InitSize, LenA, LentA, ct, i;
|
jpayne@69
|
70 long int last_index;
|
jpayne@69
|
71
|
jpayne@69
|
72 char * A, * tA;
|
jpayne@69
|
73 char mask_char = 0;
|
jpayne@69
|
74 char Id [MAX_LINE];
|
jpayne@69
|
75 char InputFileName [MAX_LINE];
|
jpayne@69
|
76
|
jpayne@69
|
77 FILE * InputFile;
|
jpayne@69
|
78
|
jpayne@69
|
79 //-- Parse the command line arguments
|
jpayne@69
|
80 {
|
jpayne@69
|
81 optarg = NULL;
|
jpayne@69
|
82 int ch, errflg = 0;
|
jpayne@69
|
83 while ( !errflg && ((ch = getopt (argc, argv, "hm:q:r:")) != EOF) )
|
jpayne@69
|
84 switch (ch)
|
jpayne@69
|
85 {
|
jpayne@69
|
86 case 'h' :
|
jpayne@69
|
87 printHelp (argv[0]);
|
jpayne@69
|
88 exit (EXIT_SUCCESS);
|
jpayne@69
|
89 break;
|
jpayne@69
|
90
|
jpayne@69
|
91 case 'm' :
|
jpayne@69
|
92 mask_len = atoi (optarg);
|
jpayne@69
|
93 break;
|
jpayne@69
|
94
|
jpayne@69
|
95 case 'q' :
|
jpayne@69
|
96 strcpy (InputFileName, optarg);
|
jpayne@69
|
97 isQuery = true;
|
jpayne@69
|
98 mask_char = QUERY_MASK;
|
jpayne@69
|
99 break;
|
jpayne@69
|
100
|
jpayne@69
|
101 case 'r' :
|
jpayne@69
|
102 strcpy (InputFileName, optarg);
|
jpayne@69
|
103 isReference = true;
|
jpayne@69
|
104 mask_char = REFERENCE_MASK;
|
jpayne@69
|
105 break;
|
jpayne@69
|
106
|
jpayne@69
|
107 default :
|
jpayne@69
|
108 errflg ++;
|
jpayne@69
|
109 }
|
jpayne@69
|
110
|
jpayne@69
|
111 if ( errflg > 0 || argc - optind != 0 ||
|
jpayne@69
|
112 (isReference && isQuery) || (!isReference && !isQuery) )
|
jpayne@69
|
113 {
|
jpayne@69
|
114 printUsage (argv[0]);
|
jpayne@69
|
115 exit (EXIT_FAILURE);
|
jpayne@69
|
116 }
|
jpayne@69
|
117
|
jpayne@69
|
118 if ( mask_len < 0 )
|
jpayne@69
|
119 {
|
jpayne@69
|
120 fprintf (stderr,
|
jpayne@69
|
121 "WARNING: Invalid maximum mask length %d, ignored\n", mask_len);
|
jpayne@69
|
122 mask_len = DEFAULT_MASK_LEN;
|
jpayne@69
|
123 }
|
jpayne@69
|
124 }
|
jpayne@69
|
125
|
jpayne@69
|
126 InputFile = File_Open (InputFileName, "r");
|
jpayne@69
|
127
|
jpayne@69
|
128 InitSize = INIT_SIZE;
|
jpayne@69
|
129 A = (char *) Safe_malloc ( sizeof(char) * InitSize );
|
jpayne@69
|
130 tA = (char *) Safe_malloc ( sizeof(char) );
|
jpayne@69
|
131 tA [0] = '\0';
|
jpayne@69
|
132
|
jpayne@69
|
133 ct = 0;
|
jpayne@69
|
134 if ( isReference )
|
jpayne@69
|
135 printf (">allcontigs %s\n", InputFileName);
|
jpayne@69
|
136 while ( Read_String (InputFile, A, InitSize, Id, FALSE) )
|
jpayne@69
|
137 {
|
jpayne@69
|
138 LenA = strlen(A + 1);
|
jpayne@69
|
139
|
jpayne@69
|
140 for ( frame = 1; frame <= 6; frame ++ )
|
jpayne@69
|
141 {
|
jpayne@69
|
142 if ( isQuery )
|
jpayne@69
|
143 printf (">%s.%d\n", Id, frame);
|
jpayne@69
|
144
|
jpayne@69
|
145 //-- Translate the current frame
|
jpayne@69
|
146 tA = (char *) Safe_realloc (tA, sizeof(char) * ( (LenA / 3) + 2) );
|
jpayne@69
|
147 LentA = Translate_DNA (A, tA, frame);
|
jpayne@69
|
148 tA[++ LentA] = mask_char;
|
jpayne@69
|
149
|
jpayne@69
|
150 //-- Mask the current frame
|
jpayne@69
|
151 last_index = 0;
|
jpayne@69
|
152 for ( i = 1; i <= LentA; i ++ )
|
jpayne@69
|
153 {
|
jpayne@69
|
154 if ( mask_char != TRANSLATE_MASK && tA[i] == TRANSLATE_MASK )
|
jpayne@69
|
155 tA[i] = mask_char;
|
jpayne@69
|
156 else if ( tA[i] == STOP_CHAR )
|
jpayne@69
|
157 {
|
jpayne@69
|
158 tA[i] = STOP_MASK;
|
jpayne@69
|
159 if ( i - last_index - 1 <= mask_len )
|
jpayne@69
|
160 mask (tA, mask_char, last_index + 1, i - 1);
|
jpayne@69
|
161 last_index = i;
|
jpayne@69
|
162 }
|
jpayne@69
|
163 }
|
jpayne@69
|
164 if ( LentA - last_index - 1 <= mask_len )
|
jpayne@69
|
165 mask (tA, mask_char, last_index + 1, i - 1);
|
jpayne@69
|
166
|
jpayne@69
|
167 //-- Print the current frame
|
jpayne@69
|
168 for ( i = 1; i <= LentA; i ++ )
|
jpayne@69
|
169 {
|
jpayne@69
|
170 fputc (tA[i], stdout);
|
jpayne@69
|
171 if ( ++ ct == CHARS_PER_LINE )
|
jpayne@69
|
172 {
|
jpayne@69
|
173 ct = 0;
|
jpayne@69
|
174 fputc ('\n', stdout);
|
jpayne@69
|
175 }
|
jpayne@69
|
176 }
|
jpayne@69
|
177
|
jpayne@69
|
178 if ( isQuery )
|
jpayne@69
|
179 {
|
jpayne@69
|
180 if ( ct != 0 )
|
jpayne@69
|
181 fputc ('\n', stdout);
|
jpayne@69
|
182 ct = 0;
|
jpayne@69
|
183 }
|
jpayne@69
|
184 }
|
jpayne@69
|
185 }
|
jpayne@69
|
186 if ( ct != 0 )
|
jpayne@69
|
187 fputc ('\n', stdout);
|
jpayne@69
|
188
|
jpayne@69
|
189 fclose(InputFile);
|
jpayne@69
|
190
|
jpayne@69
|
191 free(A);
|
jpayne@69
|
192 free(tA);
|
jpayne@69
|
193
|
jpayne@69
|
194 return EXIT_SUCCESS;
|
jpayne@69
|
195 }
|
jpayne@69
|
196
|
jpayne@69
|
197
|
jpayne@69
|
198
|
jpayne@69
|
199
|
jpayne@69
|
200 inline void mask
|
jpayne@69
|
201 (char * A, char mask_ch, long int x, long int y)
|
jpayne@69
|
202
|
jpayne@69
|
203 // Mask sequence 'A' with 'mask_ch' from A [x...y] (inclusive)
|
jpayne@69
|
204
|
jpayne@69
|
205 {
|
jpayne@69
|
206 for ( ; x <= y; x ++ )
|
jpayne@69
|
207 A[x] = mask_ch;
|
jpayne@69
|
208 }
|
jpayne@69
|
209
|
jpayne@69
|
210
|
jpayne@69
|
211
|
jpayne@69
|
212
|
jpayne@69
|
213 void printHelp
|
jpayne@69
|
214 (const char * s)
|
jpayne@69
|
215 {
|
jpayne@69
|
216 fprintf(stderr,
|
jpayne@69
|
217 "\nUSAGE: %s [options] -r/-q <fasta>\n\n", s);
|
jpayne@69
|
218 fprintf(stderr,
|
jpayne@69
|
219 "-h display help information\n"
|
jpayne@69
|
220 "-m len set maximum book-end masking length to 'len\n"
|
jpayne@69
|
221 "-q query input is the multi-fasta query file 'query'\n"
|
jpayne@69
|
222 "-r reference input is the multi-fasta reference file 'reference'\n\n"
|
jpayne@69
|
223 " Input is one multi-fasta sequence file, EITHER '-r reference' OR\n"
|
jpayne@69
|
224 "'-q query'. Both are not allowed.\n"
|
jpayne@69
|
225 " Output is to stdout, and it consists of each sequence in the\n"
|
jpayne@69
|
226 "FASTA file translated in all six reading frames. This output is\n"
|
jpayne@69
|
227 "different depending on whether the the input was the reference\n"
|
jpayne@69
|
228 "or query sequence, and it is now ready to be passed to 'mummer2'\n"
|
jpayne@69
|
229 "for the match finding step.\n\n");
|
jpayne@69
|
230 return;
|
jpayne@69
|
231 }
|
jpayne@69
|
232
|
jpayne@69
|
233
|
jpayne@69
|
234
|
jpayne@69
|
235
|
jpayne@69
|
236 void printUsage
|
jpayne@69
|
237 (const char * s)
|
jpayne@69
|
238 {
|
jpayne@69
|
239 fprintf(stderr,
|
jpayne@69
|
240 "\nUSAGE: %s [options] -r/-q <fasta>\n\n", s);
|
jpayne@69
|
241 fprintf (stderr, "Try '%s -h' for more information.\n", s);
|
jpayne@69
|
242 return;
|
jpayne@69
|
243 }
|