Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/prepro.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 //------------------------------------------------------------------------------ | |
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research | |
3 // File: prepro.cc | |
4 // Date: 08 / 16 / 2002 | |
5 // | |
6 // Input: Input is a single multi-FASTA sequence file on the command | |
7 // line, the command line switch '-r' specifies that the input | |
8 // is the reference sequence. This switch tells the program to | |
9 // suppress all but the first header and use a different masking | |
10 // character than the query sequence. If it is the query sequence | |
11 // being input, use the '-q' option. Either -r or -q must be specified. | |
12 // | |
13 // Output: Output is to stdout, and consists of each sequence in the input | |
14 // translated into all six reading frames. All the translations | |
15 // for a particular sequence are appended together and each | |
16 // separated by a masking character. If the '-r' flag is specified, | |
17 // all but the top header are removed from the output; if the '-q' | |
18 // flag is specified each translated frame is written as its own | |
19 // record under the header '>id.frame' (frame = 1..6). | |
20 // In addition to translating the sequences, prepro also does | |
21 // a simple masking step that masks amino acids that are | |
22 // surrounded on either side with stop codons. The '-m len' option | |
23 // allows the user to specify the maximum length of these regions | |
24 // to be masked. For example, if '-m 5' were set and the sequence | |
25 // "A*AAAAA*A" appeared, the output would be "A*XXXXX*A". The 0 and | |
26 // Len+1 indices (although non-existent) are considered as stop codons | |
27 // so this masking with -m 5 could turn the beginning of a sequence | |
28 // "\0AAAAA*" into "\0XXXXX*" or the end "*AAAAA\0" into "*XXXXX\0". | |
29 // | |
30 // Usage: prepro [-m len] -r/-q <multi-FASTA> | |
31 // | |
32 //------------------------------------------------------------------------------ | |
33 | |
34 #include "tigrinc.hh" | |
35 #include "translate.hh" | |
36 | |
//-- Number of sequence characters written per output line
#define CHARS_PER_LINE 60

//-- Mask length used when '-m' is absent or rejected
const long int DEFAULT_MASK_LEN = 10;

//-- Special characters appearing in the translated output
const char TRANSLATE_MASK = 'X';    // masking character coming from the translator
const char REFERENCE_MASK = 'X';    // masking character for the reference input
const char QUERY_MASK     = 'O';    // masking character for the query input
const char STOP_MASK      = 'J';    // alphabetic replacement for stop codons
const char STOP_CHAR      = '*';    // stop codon character

//-- Overwrite A [x...y] (inclusive) with mask_ch
inline void mask (char * A, char mask_ch, long int x, long int y);

//-- Print the full help text; 's' is the program name
void printHelp (const char * s);

//-- Print the short usage text; 's' is the program name
void printUsage (const char * s);
56 | |
57 | |
58 | |
59 | |
60 int main | |
61 (int argc, char * argv[]) | |
62 { | |
63 bool isReference = false; | |
64 bool isQuery = false; | |
65 | |
66 int frame; | |
67 int mask_len = DEFAULT_MASK_LEN; | |
68 | |
69 long int InitSize, LenA, LentA, ct, i; | |
70 long int last_index; | |
71 | |
72 char * A, * tA; | |
73 char mask_char = 0; | |
74 char Id [MAX_LINE]; | |
75 char InputFileName [MAX_LINE]; | |
76 | |
77 FILE * InputFile; | |
78 | |
79 //-- Parse the command line arguments | |
80 { | |
81 optarg = NULL; | |
82 int ch, errflg = 0; | |
83 while ( !errflg && ((ch = getopt (argc, argv, "hm:q:r:")) != EOF) ) | |
84 switch (ch) | |
85 { | |
86 case 'h' : | |
87 printHelp (argv[0]); | |
88 exit (EXIT_SUCCESS); | |
89 break; | |
90 | |
91 case 'm' : | |
92 mask_len = atoi (optarg); | |
93 break; | |
94 | |
95 case 'q' : | |
96 strcpy (InputFileName, optarg); | |
97 isQuery = true; | |
98 mask_char = QUERY_MASK; | |
99 break; | |
100 | |
101 case 'r' : | |
102 strcpy (InputFileName, optarg); | |
103 isReference = true; | |
104 mask_char = REFERENCE_MASK; | |
105 break; | |
106 | |
107 default : | |
108 errflg ++; | |
109 } | |
110 | |
111 if ( errflg > 0 || argc - optind != 0 || | |
112 (isReference && isQuery) || (!isReference && !isQuery) ) | |
113 { | |
114 printUsage (argv[0]); | |
115 exit (EXIT_FAILURE); | |
116 } | |
117 | |
118 if ( mask_len < 0 ) | |
119 { | |
120 fprintf (stderr, | |
121 "WARNING: Invalid maximum mask length %d, ignored\n", mask_len); | |
122 mask_len = DEFAULT_MASK_LEN; | |
123 } | |
124 } | |
125 | |
126 InputFile = File_Open (InputFileName, "r"); | |
127 | |
128 InitSize = INIT_SIZE; | |
129 A = (char *) Safe_malloc ( sizeof(char) * InitSize ); | |
130 tA = (char *) Safe_malloc ( sizeof(char) ); | |
131 tA [0] = '\0'; | |
132 | |
133 ct = 0; | |
134 if ( isReference ) | |
135 printf (">allcontigs %s\n", InputFileName); | |
136 while ( Read_String (InputFile, A, InitSize, Id, FALSE) ) | |
137 { | |
138 LenA = strlen(A + 1); | |
139 | |
140 for ( frame = 1; frame <= 6; frame ++ ) | |
141 { | |
142 if ( isQuery ) | |
143 printf (">%s.%d\n", Id, frame); | |
144 | |
145 //-- Translate the current frame | |
146 tA = (char *) Safe_realloc (tA, sizeof(char) * ( (LenA / 3) + 2) ); | |
147 LentA = Translate_DNA (A, tA, frame); | |
148 tA[++ LentA] = mask_char; | |
149 | |
150 //-- Mask the current frame | |
151 last_index = 0; | |
152 for ( i = 1; i <= LentA; i ++ ) | |
153 { | |
154 if ( mask_char != TRANSLATE_MASK && tA[i] == TRANSLATE_MASK ) | |
155 tA[i] = mask_char; | |
156 else if ( tA[i] == STOP_CHAR ) | |
157 { | |
158 tA[i] = STOP_MASK; | |
159 if ( i - last_index - 1 <= mask_len ) | |
160 mask (tA, mask_char, last_index + 1, i - 1); | |
161 last_index = i; | |
162 } | |
163 } | |
164 if ( LentA - last_index - 1 <= mask_len ) | |
165 mask (tA, mask_char, last_index + 1, i - 1); | |
166 | |
167 //-- Print the current frame | |
168 for ( i = 1; i <= LentA; i ++ ) | |
169 { | |
170 fputc (tA[i], stdout); | |
171 if ( ++ ct == CHARS_PER_LINE ) | |
172 { | |
173 ct = 0; | |
174 fputc ('\n', stdout); | |
175 } | |
176 } | |
177 | |
178 if ( isQuery ) | |
179 { | |
180 if ( ct != 0 ) | |
181 fputc ('\n', stdout); | |
182 ct = 0; | |
183 } | |
184 } | |
185 } | |
186 if ( ct != 0 ) | |
187 fputc ('\n', stdout); | |
188 | |
189 fclose(InputFile); | |
190 | |
191 free(A); | |
192 free(tA); | |
193 | |
194 return EXIT_SUCCESS; | |
195 } | |
196 | |
197 | |
198 | |
199 | |
inline void mask
     (char * A, char mask_ch, long int x, long int y)

     //  Overwrite every character of 'A' whose index lies in the inclusive
     //  range [x, y] with 'mask_ch'. Nothing is written when x > y.

{
  long int pos = x;

  while ( pos <= y )
    {
      A[pos] = mask_ch;
      pos ++;
    }
}
209 | |
210 | |
211 | |
212 | |
void printHelp
     (const char * s)

     //  Write the full help text to stderr. 's' is the program name shown in
     //  the USAGE line.

{
  fprintf(stderr,
          "\nUSAGE: %s [options] -r/-q <fasta>\n\n", s);
  fprintf(stderr,
          "-h display help information\n"
          "-m len set maximum book-end masking length to 'len'\n"
          "-q query input is the multi-fasta query file 'query'\n"
          "-r reference input is the multi-fasta reference file 'reference'\n\n"
          " Input is one multi-fasta sequence file, EITHER '-r reference' OR\n"
          "'-q query'. Both are not allowed.\n"
          " Output is to stdout, and it consists of each sequence in the\n"
          "FASTA file translated in all six reading frames. This output is\n"
          "different depending on whether the input was the reference\n"
          "or query sequence, and it is now ready to be passed to 'mummer2'\n"
          "for the match finding step.\n\n");
}
232 | |
233 | |
234 | |
235 | |
void printUsage
     (const char * s)

     //  Write the one-line usage summary plus a pointer to '-h' to stderr.
     //  's' is the program name inserted into both lines.

{
  fprintf (stderr,
           "\nUSAGE: %s [options] -r/-q <fasta>\n\n"
           "Try '%s -h' for more information.\n",
           s, s);
}
243 } |