comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/prepro.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 //------------------------------------------------------------------------------
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
3 // File: prepro.cc
4 // Date: 08 / 16 / 2002
5 //
6 // Input: Input is a single multi-FASTA sequence file on the command
7 // line, the command line switch '-r' specifies that the input
8 // is the reference sequence. This switch tells the program to
9 // suppress all but the first header and use a different masking
10 // character than the query sequence. If it is the query sequence
11 // being input, use the '-q' option. Either -r or -q must be specified.
12 //
13 // Output: Output is to stdout, and consists of each sequence in the input
14 // translated into all six reading frames. All the translations
15 // for a particular sequence are appended together and each
16 // separated by a masking character. If the '-r' flag is specified,
17 // all but the top header are removed from the output, if the '-q'
18 // flag is specified each sequence header is left in place but the
19 // translated frames remain concatenated together without headers.
20 // In addition to translating the sequences, prepro also does
21 // a simple masking step that masks amino acids that are
22 // surrounded on either side with stop codons. The '-m len' option
23 // allows the user to specify the maximum length of these regions
24 // to be masked. For example, if '-m 5' were set and the sequence
25 // "A*AAAAA*A" appeared, the output would be "A*XXXXX*A". The 0 and
26 // Len+1 indices (although non-existent) are considered as stop codons
27 // so this masking with -m 5 could turn the beginning of a sequence
28 // "\0AAAAA*" into "\0XXXXX*" or the end "*AAAAA\0" into "*XXXXX\0".
29 //
30 // Usage: prepro [-m len] -r/-q <multi-FASTA>
31 //
32 //------------------------------------------------------------------------------
33
34 #include "tigrinc.hh"
35 #include "translate.hh"
36
//-- Width, in sequence characters, of each printed output line
#define CHARS_PER_LINE 60

//-- Mask length main() starts with before any '-m' option is applied
const long int DEFAULT_MASK_LEN = 10;

//-- Single-character symbols used while rewriting translated sequence
const char TRANSLATE_MASK = 'X',     // masking character emitted by the translator
           REFERENCE_MASK = 'X',     // masking character written for a reference
           QUERY_MASK     = 'O',     // masking character written for a query
           STOP_MASK      = 'J',     // alphabetic stand-in for a stop codon
           STOP_CHAR      = '*';     // stop codon character in the translation
47
//-- Overwrite A [x...y] (inclusive) with 'mask_ch'
inline void mask (char * A, char mask_ch, long int x, long int y);

//-- Write the full help text for program name 's' to stderr
void printHelp (const char * s);

//-- Write the short usage reminder for program name 's' to stderr
void printUsage (const char * s);
56
57
58
59
60 int main
61 (int argc, char * argv[])
62 {
63 bool isReference = false;
64 bool isQuery = false;
65
66 int frame;
67 int mask_len = DEFAULT_MASK_LEN;
68
69 long int InitSize, LenA, LentA, ct, i;
70 long int last_index;
71
72 char * A, * tA;
73 char mask_char = 0;
74 char Id [MAX_LINE];
75 char InputFileName [MAX_LINE];
76
77 FILE * InputFile;
78
79 //-- Parse the command line arguments
80 {
81 optarg = NULL;
82 int ch, errflg = 0;
83 while ( !errflg && ((ch = getopt (argc, argv, "hm:q:r:")) != EOF) )
84 switch (ch)
85 {
86 case 'h' :
87 printHelp (argv[0]);
88 exit (EXIT_SUCCESS);
89 break;
90
91 case 'm' :
92 mask_len = atoi (optarg);
93 break;
94
95 case 'q' :
96 strcpy (InputFileName, optarg);
97 isQuery = true;
98 mask_char = QUERY_MASK;
99 break;
100
101 case 'r' :
102 strcpy (InputFileName, optarg);
103 isReference = true;
104 mask_char = REFERENCE_MASK;
105 break;
106
107 default :
108 errflg ++;
109 }
110
111 if ( errflg > 0 || argc - optind != 0 ||
112 (isReference && isQuery) || (!isReference && !isQuery) )
113 {
114 printUsage (argv[0]);
115 exit (EXIT_FAILURE);
116 }
117
118 if ( mask_len < 0 )
119 {
120 fprintf (stderr,
121 "WARNING: Invalid maximum mask length %d, ignored\n", mask_len);
122 mask_len = DEFAULT_MASK_LEN;
123 }
124 }
125
126 InputFile = File_Open (InputFileName, "r");
127
128 InitSize = INIT_SIZE;
129 A = (char *) Safe_malloc ( sizeof(char) * InitSize );
130 tA = (char *) Safe_malloc ( sizeof(char) );
131 tA [0] = '\0';
132
133 ct = 0;
134 if ( isReference )
135 printf (">allcontigs %s\n", InputFileName);
136 while ( Read_String (InputFile, A, InitSize, Id, FALSE) )
137 {
138 LenA = strlen(A + 1);
139
140 for ( frame = 1; frame <= 6; frame ++ )
141 {
142 if ( isQuery )
143 printf (">%s.%d\n", Id, frame);
144
145 //-- Translate the current frame
146 tA = (char *) Safe_realloc (tA, sizeof(char) * ( (LenA / 3) + 2) );
147 LentA = Translate_DNA (A, tA, frame);
148 tA[++ LentA] = mask_char;
149
150 //-- Mask the current frame
151 last_index = 0;
152 for ( i = 1; i <= LentA; i ++ )
153 {
154 if ( mask_char != TRANSLATE_MASK && tA[i] == TRANSLATE_MASK )
155 tA[i] = mask_char;
156 else if ( tA[i] == STOP_CHAR )
157 {
158 tA[i] = STOP_MASK;
159 if ( i - last_index - 1 <= mask_len )
160 mask (tA, mask_char, last_index + 1, i - 1);
161 last_index = i;
162 }
163 }
164 if ( LentA - last_index - 1 <= mask_len )
165 mask (tA, mask_char, last_index + 1, i - 1);
166
167 //-- Print the current frame
168 for ( i = 1; i <= LentA; i ++ )
169 {
170 fputc (tA[i], stdout);
171 if ( ++ ct == CHARS_PER_LINE )
172 {
173 ct = 0;
174 fputc ('\n', stdout);
175 }
176 }
177
178 if ( isQuery )
179 {
180 if ( ct != 0 )
181 fputc ('\n', stdout);
182 ct = 0;
183 }
184 }
185 }
186 if ( ct != 0 )
187 fputc ('\n', stdout);
188
189 fclose(InputFile);
190
191 free(A);
192 free(tA);
193
194 return EXIT_SUCCESS;
195 }
196
197
198
199
inline void mask
     (char * A, char mask_ch, long int x, long int y)

  //  Overwrite every element of 'A' from index 'x' through index 'y'
  //  (both inclusive) with 'mask_ch'. Nothing is written when x > y.

{
  long int  pos = x;

  while ( pos <= y )
    {
      A[pos] = mask_ch;
      pos ++;
    }
}
209
210
211
212
void printHelp
     (const char * s)

  //  Write the full help text to stderr, using 's' as the program name
  //  in the usage line.

{
  fprintf(stderr,
          "\nUSAGE: %s [options] -r/-q <fasta>\n\n", s);
  fprintf(stderr,
          "-h display help information\n"
          "-m len set maximum book-end masking length to 'len'\n"
          "-q query input is the multi-fasta query file 'query'\n"
          "-r reference input is the multi-fasta reference file 'reference'\n\n"
          " Input is one multi-fasta sequence file, EITHER '-r reference' OR\n"
          "'-q query'. Both are not allowed.\n"
          " Output is to stdout, and it consists of each sequence in the\n"
          "FASTA file translated in all six reading frames. This output is\n"
          "different depending on whether the input was the reference\n"
          "or query sequence, and it is now ready to be passed to 'mummer2'\n"
          "for the match finding step.\n\n");
  return;
}
232
233
234
235
void printUsage
     (const char * s)

  //  Write the short usage reminder to stderr: the usage line followed
  //  by a pointer to the '-h' option, both using 's' as the program name.

{
  fprintf (stderr,
           "\nUSAGE: %s [options] -r/-q <fasta>\n\n"
           "Try '%s -h' for more information.\n",
           s, s);
}