jpayne@69
|
1 /*************************************************
|
jpayne@69
|
2 * Module: CombineMUMs.cc
|
jpayne@69
|
3 * Author: Art Delcher
|
jpayne@69
|
4 * Description:
|
jpayne@69
|
5 * Take clusters of MUMs (from mgaps program) and do alignments
|
jpayne@69
|
6 * to combine them into matches.
|
jpayne@69
|
7 *
|
jpayne@69
|
8 */
|
jpayne@69
|
9
|
jpayne@69
|
10
|
jpayne@69
|
11 #include "tigrinc.hh"
|
jpayne@69
|
12 #include <string>
|
jpayne@69
|
13 using namespace std;
|
jpayne@69
|
14
|
jpayne@69
|
15
|
jpayne@69
|
16 // Constants
|
jpayne@69
|
17
|
jpayne@69
|
18 #define BRANCH_PT_MATCH_VALUE 0.272
|
jpayne@69
|
19 // Value to add for a match in finding branch points
|
jpayne@69
|
20 // 1.20 was the calculated value for 6% vs 35% error discrimination
|
jpayne@69
|
21 // Converting to integers didn't make it faster
|
jpayne@69
|
22 #define BRANCH_PT_ERROR_VALUE -0.728
|
jpayne@69
|
23 // Value to add for a mismatch in finding branch points
|
jpayne@69
|
24 // -2.19 was the calculated value for 6% vs 35% error discrimination
|
jpayne@69
|
25 // Converting to integers didn't make it faster
|
jpayne@69
|
26 #define DEFAULT_ERROR_FILE_NAME "witherrors.gaps"
|
jpayne@69
|
27 // Name of file produced that matches the input gaps file
|
jpayne@69
|
28 // but with an extra column showing the number of errors in
|
jpayne@69
|
29 // each gap
|
jpayne@69
|
30 #define DEFAULT_PAD 10
|
jpayne@69
|
31 // Default number of matching bases to show at beginning
|
jpayne@69
|
32 // and end of alignments
|
jpayne@69
|
33 #define EDIT_DIST_PROB_BOUND 1e-4
|
jpayne@69
|
34 // Probability limit to "band" edit-distance calculation
|
jpayne@69
|
35 // Determines NORMAL_DISTRIB_THOLD
|
jpayne@69
|
36 #define ERRORS_FOR_FREE 1
|
jpayne@69
|
37 // The number of errors that are ignored in setting probability
|
jpayne@69
|
38 // bound for terminating alignment extensions in edit distance
|
jpayne@69
|
39 // calculations
|
jpayne@69
|
40 #define EXPANSION_FACTOR 1.4
|
jpayne@69
|
41 // Factor by which to grow memory when realloc'ing
|
jpayne@69
|
42 #define GIVE_UP_LEN 200
|
jpayne@69
|
43 // Stop alignment when go this many bases past max-score value
|
jpayne@69
|
44 #define INITIAL_BUFFER_SIZE 10000
|
jpayne@69
|
45 // Initial number of bytes to use for sequence strings
|
jpayne@69
|
46 #define MATCH_SCORE 1.0
|
jpayne@69
|
47 // Score to add for match in finding length of alignment
|
jpayne@69
|
48 #define MAX_ERROR_RATE 0.06
|
jpayne@69
|
49 // The largest error allowed in overlaps
|
jpayne@69
|
50 #define MAX_FRAG_LEN 1024
|
jpayne@69
|
51 // The longest fragment allowed
|
jpayne@69
|
52 #define MAX_ERRORS 500
|
jpayne@69
|
53 // Most errors in any edit distance computation
|
jpayne@69
|
54 #define MAX_EXTENSION 10000
|
jpayne@69
|
55 // Maximum extension match out from a match
|
jpayne@69
|
56 #define MAX_HDR_LEN 10000
|
jpayne@69
|
57 // Longest allowable fasta header line
|
jpayne@69
|
58 #define MAX_MEMORY_STORE 50000
|
jpayne@69
|
59 // The most fragments allowed in memory store
|
jpayne@69
|
60 #define MIN_BRANCH_END_DIST 20
|
jpayne@69
|
61 // Branch points must be at least this many bases from the
|
jpayne@69
|
62 // end of the fragment to be reported
|
jpayne@69
|
63 #define MIN_BRANCH_TAIL_SLOPE 0.20
|
jpayne@69
|
64 // Branch point tails must fall off from the max by at least
|
jpayne@69
|
65 // this rate
|
jpayne@69
|
66 #define MIN_MATCH_LEN 40
|
jpayne@69
|
67 // Number of bases in match region in order to count it
|
jpayne@69
|
68 #define MISMATCH_SCORE -3.0
|
jpayne@69
|
69 // Score to add for non-match in finding length of alignment
|
jpayne@69
|
70 #define NORMAL_DISTRIB_THOLD 3.62
|
jpayne@69
|
71 // Determined by EDIT_DIST_PROB_BOUND
|
jpayne@69
|
72 #define REALLY_VERBOSE 0
|
jpayne@69
|
73 // If 1 prints tons of stuff
|
jpayne@69
|
74 #define VERBOSE 0
|
jpayne@69
|
75 // If 1 prints lots of stuff
|
jpayne@69
|
76
|
jpayne@69
|
77
|
jpayne@69
|
78
|
jpayne@69
|
79 // Type definitions
|
jpayne@69
|
80
|
jpayne@69
|
81 typedef struct s_Cover_t
|
jpayne@69
|
82 {
|
jpayne@69
|
83 long int lo, hi;
|
jpayne@69
|
84 struct s_Cover_t * next;
|
jpayne@69
|
85 } Cover_t;
|
jpayne@69
|
86
|
jpayne@69
|
87
|
jpayne@69
|
88
|
jpayne@69
|
89 // Globals
|
jpayne@69
|
90
|
jpayne@69
|
91 float Branch_Pt_Match_Value = BRANCH_PT_MATCH_VALUE;
|
jpayne@69
|
92 // Score used for matches to locate branch points, i.e.,
|
jpayne@69
|
93 // where alignment score peaks
|
jpayne@69
|
94 float Branch_Pt_Error_Value = BRANCH_PT_ERROR_VALUE;
|
jpayne@69
|
95 // Score used for mismatches to locate branch points, i.e.,
|
jpayne@69
|
96 int Consec_Non_ACGT = 0;
|
jpayne@69
|
97 // Stop alignments when encounter at least this many non-acgt characters.
|
jpayne@69
|
98 int * Edit_Array [MAX_ERRORS];
|
jpayne@69
|
99 // Use for alignment calculation. Points into Edit_Space .
|
jpayne@69
|
100 int Edit_Match_Limit [MAX_ERRORS] = {0};
|
jpayne@69
|
101 // This array [e] is the minimum value of Edit_Array [e] [d]
|
jpayne@69
|
102 // to be worth pursuing in edit-distance computations between guides
|
jpayne@69
|
103 int Edit_Space [(MAX_ERRORS + 4) * MAX_ERRORS];
|
jpayne@69
|
104 // Memory used by alignment calculation
|
jpayne@69
|
105 int Error_Bound [MAX_FRAG_LEN + 1];
|
jpayne@69
|
106 // This array [i] is the maximum number of errors allowed
|
jpayne@69
|
107 // in a match between sequences of length i , which is
|
jpayne@69
|
108 // i * MAXERROR_RATE .
|
jpayne@69
|
109 char * Error_File_Name = DEFAULT_ERROR_FILE_NAME;
|
jpayne@69
|
110 // Name of file to write gaps listing with # errors in each gap
|
jpayne@69
|
111 int Fill_Ct = 0;
|
jpayne@69
|
112 // Number of non-acgt bases in ref sequence
|
jpayne@69
|
113 char * Gaps_File_Path = NULL;
|
jpayne@69
|
114 // Name of file produced by mgaps program
|
jpayne@69
|
115 char * Match_File_Path = NULL;
|
jpayne@69
|
116 // Name of multifasta file of sequences to compare against the reference
|
jpayne@69
|
117 // sequence
|
jpayne@69
|
118 int UserScoring = FALSE;
|
jpayne@69
|
119 // If TRUE, then user specified a percent ID cutoff and scoring
|
jpayne@69
|
120 // is adjusted to enforce this in extensions
|
jpayne@69
|
121 int Nucleotides_Only = FALSE;
|
jpayne@69
|
122 // If TRUE , then only acgt's can match
|
jpayne@69
|
123 bool Only_Difference_Positions = false;
|
jpayne@69
|
124 // If true , then output just positions of difference instead of
|
jpayne@69
|
125 // alignments between exact matches
|
jpayne@69
|
126 int Output_Cover_Files = TRUE;
|
jpayne@69
|
127 // If TRUE , output files showing coverage of each genome.
|
jpayne@69
|
128 double Percent_ID;
|
jpayne@69
|
129 // Desired identity rate of matches to be found
|
jpayne@69
|
130 // Expressed as a fraction, not really a percent
|
jpayne@69
|
131 char * Query = NULL;
|
jpayne@69
|
132 // The query sequence
|
jpayne@69
|
133 long int Query_Len;
|
jpayne@69
|
134 // The length of the query sequence
|
jpayne@69
|
135 char * Query_Suffix = "Query";
|
jpayne@69
|
136 // Suffix for query tag
|
jpayne@69
|
137 char * Ref = NULL;
|
jpayne@69
|
138 // The reference sequence
|
jpayne@69
|
139 char * Ref_File_Path = NULL;
|
jpayne@69
|
140 // Name of (single) fasta file of reference sequence
|
jpayne@69
|
141 long int Ref_Len;
|
jpayne@69
|
142 // The length of the reference sequence
|
jpayne@69
|
143 long int Ref_Size;
|
jpayne@69
|
144 // The size of the reference sequence buffer
|
jpayne@69
|
145 char * Ref_Suffix = "Ref";
|
jpayne@69
|
146 // Suffix for reference tag
|
jpayne@69
|
147 int Show_Differences = FALSE;
|
jpayne@69
|
148 // If TRUE then show differences in all alignments
|
jpayne@69
|
149 int Tag_From_Fasta_Line = FALSE;
|
jpayne@69
|
150 // If TRUE then use fasta tag from ref & query sequences as
|
jpayne@69
|
151 // 1st & 2nd column, resp., on match line to identify matches
|
jpayne@69
|
152 int Verbose = 0;
|
jpayne@69
|
153 // Controls printing of extra debuggin information
|
jpayne@69
|
154
|
jpayne@69
|
155
|
jpayne@69
|
156 bool Global_Debug_Flag = false;
|
jpayne@69
|
157
|
jpayne@69
|
158
|
jpayne@69
|
159
|
jpayne@69
|
160 // Functions
|
jpayne@69
|
161
|
jpayne@69
|
162 void Add_Coverage
|
jpayne@69
|
163 (Cover_t * * list, long int lo, long int hi);
|
jpayne@69
|
164 int Binomial_Bound
|
jpayne@69
|
165 (int, double, int, double);
|
jpayne@69
|
166 void Display_Alignment
|
jpayne@69
|
167 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
|
jpayne@69
|
168 void Display_Alignment_With_Pad
|
jpayne@69
|
169 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
|
jpayne@69
|
170 char * left_pad [3], char * right_pad [3], int pad_len);
|
jpayne@69
|
171 void Display_Difference_Positions
|
jpayne@69
|
172 (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
|
jpayne@69
|
173 int b_incr, int delta [], int delta_ct, const char * a_tag,
|
jpayne@69
|
174 const char * b_tag);
|
jpayne@69
|
175 int Extend_Backward
|
jpayne@69
|
176 (long int * ref_lo, long int * query_lo);
|
jpayne@69
|
177 int Extend_Forward
|
jpayne@69
|
178 (long int * ref_hi, long int * query_hi);
|
jpayne@69
|
179 void Initialize_Globals
|
jpayne@69
|
180 (void);
|
jpayne@69
|
181 FILE * Local_File_Open
|
jpayne@69
|
182 (const char * filename, const char * mode);
|
jpayne@69
|
183 int Max_int
|
jpayne@69
|
184 (int a, int b);
|
jpayne@69
|
185 int Min_int
|
jpayne@69
|
186 (int a, int b);
|
jpayne@69
|
187 void Parse_Command_Line
|
jpayne@69
|
188 (int argc, char * argv []);
|
jpayne@69
|
189 int Prefix_Edit_Dist
|
jpayne@69
|
190 (char A [], int m, char T [], int n, int Error_Limit,
|
jpayne@69
|
191 int * A_End, int * T_End, int * Match_To_End,
|
jpayne@69
|
192 int Delta [MAX_ERRORS], int * Delta_Len, int extending);
|
jpayne@69
|
193 int Read_String
|
jpayne@69
|
194 (FILE * fp, char * * T, long int * Size, char header []);
|
jpayne@69
|
195 void Rev_Complement
|
jpayne@69
|
196 (char * s);
|
jpayne@69
|
197 void Rev_Display_Alignment
|
jpayne@69
|
198 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
|
jpayne@69
|
199 int Rev_Prefix_Edit_Dist
|
jpayne@69
|
200 (char A [], int m, char T [], int n, int Error_Limit,
|
jpayne@69
|
201 int * A_End, int * T_End, int * Match_To_End,
|
jpayne@69
|
202 int Delta [MAX_ERRORS], int * Delta_Len, int extending);
|
jpayne@69
|
203 void Rev_Show_Diffs
|
jpayne@69
|
204 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
|
jpayne@69
|
205 long int a_ref, long int b_ref);
|
jpayne@69
|
206 void Set_Deltas
|
jpayne@69
|
207 (int delta [], int * delta_len, int row, int d, int e);
|
jpayne@69
|
208 static void Set_Left_Pad
|
jpayne@69
|
209 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len);
|
jpayne@69
|
210 static void Set_Right_Pad
|
jpayne@69
|
211 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len);
|
jpayne@69
|
212 void Show_Coverage
|
jpayne@69
|
213 (Cover_t * list, char * filename, char * tag, char * seq);
|
jpayne@69
|
214 void Show_Diffs
|
jpayne@69
|
215 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
|
jpayne@69
|
216 long int a_ref, long int b_ref);
|
jpayne@69
|
217 int Sign
|
jpayne@69
|
218 (int a);
|
jpayne@69
|
219 void Usage
|
jpayne@69
|
220 (char * command);
|
jpayne@69
|
221
|
jpayne@69
|
222
|
jpayne@69
|
223
|
jpayne@69
|
224 int main
|
jpayne@69
|
225 (int argc, char * argv [])
|
jpayne@69
|
226
|
jpayne@69
|
227 {
|
jpayne@69
|
228 FILE * fp, * match_fp, * error_fp;
|
jpayne@69
|
229 char ref_header [MAX_HDR_LEN];
|
jpayne@69
|
230 char header [MAX_HDR_LEN], ref_tag [MAX_HDR_LEN], query_tag [MAX_HDR_LEN];
|
jpayne@69
|
231 char line [MAX_LINE], filename [MAX_LINE];
|
jpayne@69
|
232 char * left_pad [3], * right_pad [3];
|
jpayne@69
|
233 long int ref_pos, ref_lo, ref_hi;
|
jpayne@69
|
234 long int query_pos, query_lo, query_hi, query_size;
|
jpayne@69
|
235 long int match_len, prior_match_len = 0, max_len;
|
jpayne@69
|
236 double error = 0.0;
|
jpayne@69
|
237 long int total_errors = 0;
|
jpayne@69
|
238 int match_ct = 0;
|
jpayne@69
|
239 double match_total = 0.0;
|
jpayne@69
|
240 int is_forward = FALSE;
|
jpayne@69
|
241 int first_match = TRUE;
|
jpayne@69
|
242 Cover_t * ref_cover_list = NULL;
|
jpayne@69
|
243 Cover_t * query_cover_list = NULL;
|
jpayne@69
|
244 char * p;
|
jpayne@69
|
245 int i;
|
jpayne@69
|
246
|
jpayne@69
|
247 Parse_Command_Line (argc, argv);
|
jpayne@69
|
248
|
jpayne@69
|
249 if(UserScoring){
|
jpayne@69
|
250 // percentID = mismatch penalty / (match bonus + mismatch penalty)
|
jpayne@69
|
251 // or, more compactly, p = mm / (m+mm)
|
jpayne@69
|
252 // Since we require that m+mm = 1, m=1-mm
|
jpayne@69
|
253 // and so we get the trivial mm = p
|
jpayne@69
|
254 //
|
jpayne@69
|
255 // This means that Branch_Pt_Error_Value = -p
|
jpayne@69
|
256 // and then Branch_Pt_Match_Value = 1-p
|
jpayne@69
|
257 //
|
jpayne@69
|
258 // Make it so:
|
jpayne@69
|
259 Branch_Pt_Error_Value = - Percent_ID;
|
jpayne@69
|
260 Branch_Pt_Match_Value = 1.0 - Percent_ID;
|
jpayne@69
|
261 }
|
jpayne@69
|
262
|
jpayne@69
|
263 Initialize_Globals ();
|
jpayne@69
|
264
|
jpayne@69
|
265 p = strrchr (Ref_File_Path, '/');
|
jpayne@69
|
266 if (p == NULL)
|
jpayne@69
|
267 strcpy (ref_tag, Ref_File_Path);
|
jpayne@69
|
268 else
|
jpayne@69
|
269 strcpy (ref_tag, p + 1);
|
jpayne@69
|
270 p = strrchr (ref_tag, '.');
|
jpayne@69
|
271 if (p != NULL)
|
jpayne@69
|
272 (* p) = '\0';
|
jpayne@69
|
273 strcat (ref_tag, Ref_Suffix);
|
jpayne@69
|
274
|
jpayne@69
|
275 p = strrchr (Match_File_Path, '/');
|
jpayne@69
|
276 if (p == NULL)
|
jpayne@69
|
277 strcpy (query_tag, Match_File_Path);
|
jpayne@69
|
278 else
|
jpayne@69
|
279 strcpy (query_tag, p + 1);
|
jpayne@69
|
280 p = strrchr (query_tag, '.');
|
jpayne@69
|
281 if (p != NULL)
|
jpayne@69
|
282 (* p) = '\0';
|
jpayne@69
|
283 strcat (query_tag, Query_Suffix);
|
jpayne@69
|
284
|
jpayne@69
|
285 fp = Local_File_Open (Ref_File_Path, "r");
|
jpayne@69
|
286 Ref_Size = 100000;
|
jpayne@69
|
287 Ref = (char *) Safe_malloc (Ref_Size);
|
jpayne@69
|
288
|
jpayne@69
|
289 assert (Read_String (fp, & Ref, & Ref_Size, ref_header));
|
jpayne@69
|
290 if (Tag_From_Fasta_Line)
|
jpayne@69
|
291 strcpy (ref_tag, strtok (ref_header, " \t\n>"));
|
jpayne@69
|
292 fclose (fp);
|
jpayne@69
|
293
|
jpayne@69
|
294 Ref_Len = strlen (Ref + 1);
|
jpayne@69
|
295 Ref = (char *) Safe_realloc (Ref, 1 + Ref_Len);
|
jpayne@69
|
296 for (i = 1; i <= Ref_Len; i ++)
|
jpayne@69
|
297 switch (Ref [i])
|
jpayne@69
|
298 {
|
jpayne@69
|
299 case 'a' :
|
jpayne@69
|
300 case 'c' :
|
jpayne@69
|
301 case 'g' :
|
jpayne@69
|
302 case 't' :
|
jpayne@69
|
303 break;
|
jpayne@69
|
304 default :
|
jpayne@69
|
305 if (Nucleotides_Only)
|
jpayne@69
|
306 Ref [i] = '2';
|
jpayne@69
|
307 Fill_Ct ++;
|
jpayne@69
|
308 }
|
jpayne@69
|
309
|
jpayne@69
|
310 for (i = 0; i < 3; i ++)
|
jpayne@69
|
311 {
|
jpayne@69
|
312 left_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
|
jpayne@69
|
313 right_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
|
jpayne@69
|
314 }
|
jpayne@69
|
315
|
jpayne@69
|
316 fp = Local_File_Open (Gaps_File_Path, "r");
|
jpayne@69
|
317 match_fp = Local_File_Open (Match_File_Path, "r");
|
jpayne@69
|
318 query_size = 100000;
|
jpayne@69
|
319 Query = (char *) Safe_malloc (query_size);
|
jpayne@69
|
320
|
jpayne@69
|
321 error_fp = Local_File_Open (Error_File_Name, "w");
|
jpayne@69
|
322
|
jpayne@69
|
323 while (fgets (line, MAX_LINE, fp) != NULL)
|
jpayne@69
|
324 {
|
jpayne@69
|
325 int line_len = strlen (line);
|
jpayne@69
|
326
|
jpayne@69
|
327 if (line [0] == '>')
|
jpayne@69
|
328 {
|
jpayne@69
|
329 if (! first_match)
|
jpayne@69
|
330 {
|
jpayne@69
|
331 total_errors += Extend_Forward (& ref_hi, & query_hi);
|
jpayne@69
|
332 max_len = 1 + ref_hi - ref_lo;
|
jpayne@69
|
333 if (1 + abs (query_hi - query_lo) > max_len)
|
jpayne@69
|
334 max_len = 1 + abs (query_hi - query_lo);
|
jpayne@69
|
335 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
|
jpayne@69
|
336 if (! Only_Difference_Positions)
|
jpayne@69
|
337 printf
|
jpayne@69
|
338 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
|
jpayne@69
|
339 ref_lo, ref_hi,
|
jpayne@69
|
340 is_forward ? query_lo : 1 + Query_Len - query_lo,
|
jpayne@69
|
341 is_forward ? query_hi : 1 + Query_Len - query_hi,
|
jpayne@69
|
342 total_errors, max_len, 100.0 * error);
|
jpayne@69
|
343
|
jpayne@69
|
344 match_ct ++;
|
jpayne@69
|
345 match_total += 1 + ref_hi - ref_lo;
|
jpayne@69
|
346 total_errors = 0;
|
jpayne@69
|
347 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
|
jpayne@69
|
348 if (is_forward)
|
jpayne@69
|
349 Add_Coverage (& query_cover_list, query_lo, query_hi);
|
jpayne@69
|
350 else
|
jpayne@69
|
351 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
|
jpayne@69
|
352 1 + Query_Len - query_lo);
|
jpayne@69
|
353 }
|
jpayne@69
|
354 if (Tag_From_Fasta_Line)
|
jpayne@69
|
355 {
|
jpayne@69
|
356 char buff [MAX_LINE];
|
jpayne@69
|
357 char * p;
|
jpayne@69
|
358
|
jpayne@69
|
359 strcpy (buff, line + 1);
|
jpayne@69
|
360 p = strtok (buff, " >\t\n");
|
jpayne@69
|
361 if (strlen (p) > 0)
|
jpayne@69
|
362 strcpy (query_tag, p);
|
jpayne@69
|
363 else
|
jpayne@69
|
364 {
|
jpayne@69
|
365 fprintf (stderr, "No tag on line %s", line);
|
jpayne@69
|
366 strcpy (query_tag, "???");
|
jpayne@69
|
367 }
|
jpayne@69
|
368 }
|
jpayne@69
|
369 is_forward = ! is_forward;
|
jpayne@69
|
370 if (is_forward)
|
jpayne@69
|
371 {
|
jpayne@69
|
372 assert (Read_String (match_fp, & Query, & query_size, header));
|
jpayne@69
|
373 Query_Len = strlen (Query + 1);
|
jpayne@69
|
374 if (Nucleotides_Only)
|
jpayne@69
|
375 {
|
jpayne@69
|
376 for (i = 1; i <= Query_Len; i ++)
|
jpayne@69
|
377 switch (Query [i])
|
jpayne@69
|
378 {
|
jpayne@69
|
379 case 'a' :
|
jpayne@69
|
380 case 'c' :
|
jpayne@69
|
381 case 'g' :
|
jpayne@69
|
382 case 't' :
|
jpayne@69
|
383 break;
|
jpayne@69
|
384 default :
|
jpayne@69
|
385 Query [i] = '1';
|
jpayne@69
|
386 }
|
jpayne@69
|
387 }
|
jpayne@69
|
388 }
|
jpayne@69
|
389 else
|
jpayne@69
|
390 Rev_Complement (Query + 1);
|
jpayne@69
|
391 first_match = TRUE;
|
jpayne@69
|
392 printf ("%s", line);
|
jpayne@69
|
393 fprintf (error_fp, "%s", line);
|
jpayne@69
|
394 }
|
jpayne@69
|
395 else if (line [0] == '#')
|
jpayne@69
|
396 {
|
jpayne@69
|
397 if (! first_match)
|
jpayne@69
|
398 {
|
jpayne@69
|
399 total_errors += Extend_Forward (& ref_hi, & query_hi);
|
jpayne@69
|
400 max_len = 1 + ref_hi - ref_lo;
|
jpayne@69
|
401 if (1 + abs (query_hi - query_lo) > max_len)
|
jpayne@69
|
402 max_len = 1 + abs (query_hi - query_lo);
|
jpayne@69
|
403 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
|
jpayne@69
|
404 if (! Only_Difference_Positions)
|
jpayne@69
|
405 printf
|
jpayne@69
|
406 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
|
jpayne@69
|
407 ref_lo, ref_hi,
|
jpayne@69
|
408 is_forward ? query_lo : 1 + Query_Len - query_lo,
|
jpayne@69
|
409 is_forward ? query_hi : 1 + Query_Len - query_hi,
|
jpayne@69
|
410 total_errors, max_len, 100.0 * error);
|
jpayne@69
|
411
|
jpayne@69
|
412 match_ct ++;
|
jpayne@69
|
413 match_total += 1 + ref_hi - ref_lo;
|
jpayne@69
|
414 total_errors = 0;
|
jpayne@69
|
415 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
|
jpayne@69
|
416 if (is_forward)
|
jpayne@69
|
417 Add_Coverage (& query_cover_list, query_lo, query_hi);
|
jpayne@69
|
418 else
|
jpayne@69
|
419 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
|
jpayne@69
|
420 1 + Query_Len - query_lo);
|
jpayne@69
|
421 }
|
jpayne@69
|
422 first_match = TRUE;
|
jpayne@69
|
423 printf ("%s", line);
|
jpayne@69
|
424 fprintf (error_fp, "%s", line);
|
jpayne@69
|
425 }
|
jpayne@69
|
426 else
|
jpayne@69
|
427 {
|
jpayne@69
|
428 assert (sscanf (line, "%ld %ld %ld",
|
jpayne@69
|
429 & ref_pos, & query_pos, & match_len) == 3);
|
jpayne@69
|
430 if (first_match)
|
jpayne@69
|
431 {
|
jpayne@69
|
432 ref_lo = ref_pos;
|
jpayne@69
|
433 query_lo = query_pos;
|
jpayne@69
|
434 total_errors += Extend_Backward (& ref_lo, & query_lo);
|
jpayne@69
|
435 if (! Only_Difference_Positions)
|
jpayne@69
|
436 printf ("%s", line);
|
jpayne@69
|
437 if (line_len > 0)
|
jpayne@69
|
438 line [line_len - 1] = '\0';
|
jpayne@69
|
439 fprintf (error_fp, "%s %7s\n", line, "-");
|
jpayne@69
|
440 }
|
jpayne@69
|
441 else
|
jpayne@69
|
442 {
|
jpayne@69
|
443 int errors;
|
jpayne@69
|
444 int a_len, b_len;
|
jpayne@69
|
445 int a_end, b_end, match_to_end;
|
jpayne@69
|
446 int delta [MAX_ERRORS], delta_len;
|
jpayne@69
|
447
|
jpayne@69
|
448 a_len = ref_pos - ref_hi - 1;
|
jpayne@69
|
449 b_len = query_pos - query_hi - 1;
|
jpayne@69
|
450
|
jpayne@69
|
451 errors = Prefix_Edit_Dist
|
jpayne@69
|
452 (Ref + ref_hi + 1, a_len,
|
jpayne@69
|
453 Query + query_hi + 1, b_len,
|
jpayne@69
|
454 MAX_ERRORS - 1, & a_end, & b_end,
|
jpayne@69
|
455 & match_to_end, delta, & delta_len, FALSE);
|
jpayne@69
|
456 if (Show_Differences)
|
jpayne@69
|
457 Show_Diffs (Ref + ref_hi + 1, a_end,
|
jpayne@69
|
458 Query + query_hi + 1, b_end,
|
jpayne@69
|
459 delta, delta_len, ref_hi + 1, query_hi + 1);
|
jpayne@69
|
460
|
jpayne@69
|
461 if (! Only_Difference_Positions)
|
jpayne@69
|
462 printf ("%s", line);
|
jpayne@69
|
463
|
jpayne@69
|
464 Set_Left_Pad (left_pad, ref_hi, query_hi, prior_match_len,
|
jpayne@69
|
465 DEFAULT_PAD);
|
jpayne@69
|
466 Set_Right_Pad (right_pad, ref_pos, query_pos, match_len,
|
jpayne@69
|
467 DEFAULT_PAD);
|
jpayne@69
|
468
|
jpayne@69
|
469 if (a_end == 0 && b_end == 0)
|
jpayne@69
|
470 {
|
jpayne@69
|
471 if (! Only_Difference_Positions)
|
jpayne@69
|
472 printf (" No extension\n");
|
jpayne@69
|
473
|
jpayne@69
|
474 if (line_len > 0)
|
jpayne@69
|
475 line [line_len - 1] = '\0';
|
jpayne@69
|
476 fprintf (error_fp, "%s %7s\n", line, "-");
|
jpayne@69
|
477 }
|
jpayne@69
|
478 else if (Only_Difference_Positions)
|
jpayne@69
|
479 {
|
jpayne@69
|
480 int q_start, b_incr;
|
jpayne@69
|
481
|
jpayne@69
|
482 if (is_forward)
|
jpayne@69
|
483 {
|
jpayne@69
|
484 q_start = query_hi + 1;
|
jpayne@69
|
485 b_incr = 1;
|
jpayne@69
|
486 }
|
jpayne@69
|
487 else
|
jpayne@69
|
488 {
|
jpayne@69
|
489 q_start = Query_Len - query_hi;
|
jpayne@69
|
490 b_incr = -1;
|
jpayne@69
|
491 }
|
jpayne@69
|
492 Display_Difference_Positions
|
jpayne@69
|
493 (Ref + ref_hi + 1, ref_hi + 1, a_end,
|
jpayne@69
|
494 Query + query_hi + 1, q_start, b_end,
|
jpayne@69
|
495 b_incr, delta, delta_len, ref_tag,
|
jpayne@69
|
496 query_tag);
|
jpayne@69
|
497 }
|
jpayne@69
|
498 else
|
jpayne@69
|
499 {
|
jpayne@69
|
500 printf (" Errors = %d\n", errors);
|
jpayne@69
|
501 if (! Only_Difference_Positions)
|
jpayne@69
|
502 Display_Alignment_With_Pad
|
jpayne@69
|
503 (Ref + ref_hi + 1, a_end,
|
jpayne@69
|
504 Query + query_hi + 1, b_end,
|
jpayne@69
|
505 delta, delta_len, left_pad, right_pad,
|
jpayne@69
|
506 DEFAULT_PAD);
|
jpayne@69
|
507 if (line_len > 0)
|
jpayne@69
|
508 line [line_len - 1] = '\0';
|
jpayne@69
|
509 fprintf (error_fp, "%s %7d\n", line, errors);
|
jpayne@69
|
510 }
|
jpayne@69
|
511
|
jpayne@69
|
512 total_errors += errors;
|
jpayne@69
|
513
|
jpayne@69
|
514 if (! match_to_end)
|
jpayne@69
|
515 {
|
jpayne@69
|
516 ref_hi += a_end;
|
jpayne@69
|
517 query_hi += b_end;
|
jpayne@69
|
518 max_len = 1 + ref_hi - ref_lo;
|
jpayne@69
|
519 if (1 + abs (query_hi - query_lo) > max_len)
|
jpayne@69
|
520 max_len = 1 + abs (query_hi - query_lo);
|
jpayne@69
|
521 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
|
jpayne@69
|
522 if (! Only_Difference_Positions)
|
jpayne@69
|
523 printf (
|
jpayne@69
|
524 "Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
|
jpayne@69
|
525 ref_lo, ref_hi,
|
jpayne@69
|
526 is_forward ? query_lo : 1 + Query_Len - query_lo,
|
jpayne@69
|
527 is_forward ? query_hi : 1 + Query_Len - query_hi,
|
jpayne@69
|
528 total_errors, max_len, 100.0 * error);
|
jpayne@69
|
529
|
jpayne@69
|
530 match_ct ++;
|
jpayne@69
|
531 match_total += 1 + ref_hi - ref_lo;
|
jpayne@69
|
532 total_errors = 0;
|
jpayne@69
|
533 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
|
jpayne@69
|
534 if (is_forward)
|
jpayne@69
|
535 Add_Coverage (& query_cover_list, query_lo, query_hi);
|
jpayne@69
|
536 else
|
jpayne@69
|
537 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
|
jpayne@69
|
538 1 + Query_Len - query_lo);
|
jpayne@69
|
539 ref_lo = ref_pos;
|
jpayne@69
|
540 query_lo = query_pos;
|
jpayne@69
|
541 total_errors += Extend_Backward (& ref_lo, & query_lo);
|
jpayne@69
|
542 }
|
jpayne@69
|
543 }
|
jpayne@69
|
544
|
jpayne@69
|
545 ref_hi = ref_pos + match_len - 1;
|
jpayne@69
|
546 query_hi = query_pos + match_len - 1;
|
jpayne@69
|
547 prior_match_len = match_len;
|
jpayne@69
|
548 first_match = FALSE;
|
jpayne@69
|
549 }
|
jpayne@69
|
550 }
|
jpayne@69
|
551
|
jpayne@69
|
552 if (! first_match)
|
jpayne@69
|
553 {
|
jpayne@69
|
554 total_errors += Extend_Forward (& ref_hi, & query_hi);
|
jpayne@69
|
555 max_len = 1 + ref_hi - ref_lo;
|
jpayne@69
|
556 if (1 + abs (query_hi - query_lo) > max_len)
|
jpayne@69
|
557 max_len = 1 + abs (query_hi - query_lo);
|
jpayne@69
|
558 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
|
jpayne@69
|
559 if (! Only_Difference_Positions)
|
jpayne@69
|
560 printf
|
jpayne@69
|
561 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
|
jpayne@69
|
562 ref_lo, ref_hi,
|
jpayne@69
|
563 is_forward ? query_lo : 1 + Query_Len - query_lo,
|
jpayne@69
|
564 is_forward ? query_hi : 1 + Query_Len - query_hi,
|
jpayne@69
|
565 total_errors, max_len, 100.0 * error);
|
jpayne@69
|
566
|
jpayne@69
|
567 match_ct ++;
|
jpayne@69
|
568 match_total += 1 + ref_hi - ref_lo;
|
jpayne@69
|
569 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
|
jpayne@69
|
570 if (is_forward)
|
jpayne@69
|
571 Add_Coverage (& query_cover_list, query_lo, query_hi);
|
jpayne@69
|
572 else
|
jpayne@69
|
573 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
|
jpayne@69
|
574 1 + Query_Len - query_lo);
|
jpayne@69
|
575 }
|
jpayne@69
|
576
|
jpayne@69
|
577 fprintf (stderr, " Ref len = %ld\n", Ref_Len);
|
jpayne@69
|
578 fprintf (stderr, " acgt's = %ld\n", Ref_Len - Fill_Ct);
|
jpayne@69
|
579 fprintf (stderr, " Non acgt's = %d\n", Fill_Ct);
|
jpayne@69
|
580 fprintf (stderr, " Number of matches = %d\n", match_ct);
|
jpayne@69
|
581 fprintf (stderr, "Sum of match bases = %.0f\n", match_total);
|
jpayne@69
|
582 fprintf (stderr, " Avg match bases = %.0f\n",
|
jpayne@69
|
583 match_ct == 0 ? 0.0 : match_total / match_ct);
|
jpayne@69
|
584
|
jpayne@69
|
585 fclose (error_fp);
|
jpayne@69
|
586
|
jpayne@69
|
587 if (Output_Cover_Files)
|
jpayne@69
|
588 {
|
jpayne@69
|
589 strcpy (filename, ref_tag);
|
jpayne@69
|
590 strcat (filename, ".cover");
|
jpayne@69
|
591 Show_Coverage (ref_cover_list, filename, ref_tag, Ref);
|
jpayne@69
|
592
|
jpayne@69
|
593 strcpy (filename, query_tag);
|
jpayne@69
|
594 strcat (filename, ".cover");
|
jpayne@69
|
595 Rev_Complement (Query + 1);
|
jpayne@69
|
596 Show_Coverage (query_cover_list, filename, query_tag, Query);
|
jpayne@69
|
597 }
|
jpayne@69
|
598
|
jpayne@69
|
599 return 0;
|
jpayne@69
|
600 }
|
jpayne@69
|
601
|
jpayne@69
|
602
|
jpayne@69
|
603
|
jpayne@69
|
604 void Add_Coverage
|
jpayne@69
|
605 (Cover_t * * list, long int lo, long int hi)
|
jpayne@69
|
606
|
jpayne@69
|
607 // Add lo .. hi to list of regions covered in (* list) .
|
jpayne@69
|
608 // Combine nodes when appropriate.
|
jpayne@69
|
609
|
jpayne@69
|
610 {
|
jpayne@69
|
611 Cover_t * new_node, * p, * prev = NULL;
|
jpayne@69
|
612
|
jpayne@69
|
613 if ((* list) == NULL || hi + 1 < (* list) -> lo)
|
jpayne@69
|
614 {
|
jpayne@69
|
615 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
|
jpayne@69
|
616 new_node -> lo = lo;
|
jpayne@69
|
617 new_node -> hi = hi;
|
jpayne@69
|
618 new_node -> next = (* list);
|
jpayne@69
|
619 (* list) = new_node;
|
jpayne@69
|
620 return;
|
jpayne@69
|
621 }
|
jpayne@69
|
622
|
jpayne@69
|
623 for (p = (* list); p != NULL && lo - 1 > p -> hi; p = p -> next)
|
jpayne@69
|
624 prev = p;
|
jpayne@69
|
625
|
jpayne@69
|
626 if (p == NULL || hi + 1 < p -> lo)
|
jpayne@69
|
627 { // insert between or on end
|
jpayne@69
|
628 assert (prev != NULL);
|
jpayne@69
|
629 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
|
jpayne@69
|
630 new_node -> lo = lo;
|
jpayne@69
|
631 new_node -> hi = hi;
|
jpayne@69
|
632 new_node -> next = p;
|
jpayne@69
|
633 prev -> next = new_node;
|
jpayne@69
|
634 return;
|
jpayne@69
|
635 }
|
jpayne@69
|
636
|
jpayne@69
|
637 if (lo < p -> lo)
|
jpayne@69
|
638 p -> lo = lo;
|
jpayne@69
|
639 while (p -> next != NULL && hi + 1 >= p -> next -> lo)
|
jpayne@69
|
640 {
|
jpayne@69
|
641 Cover_t * save;
|
jpayne@69
|
642
|
jpayne@69
|
643 p -> hi = p -> next -> hi;
|
jpayne@69
|
644 save = p -> next;
|
jpayne@69
|
645 p -> next = save -> next;
|
jpayne@69
|
646 free (save);
|
jpayne@69
|
647 }
|
jpayne@69
|
648 if (hi > p -> hi)
|
jpayne@69
|
649 p -> hi = hi;
|
jpayne@69
|
650
|
jpayne@69
|
651 return;
|
jpayne@69
|
652 }
|
jpayne@69
|
653
|
jpayne@69
|
654
|
jpayne@69
|
655
|
jpayne@69
|
656 int Binomial_Bound
|
jpayne@69
|
657 (int e, double p, int Start, double Limit)
|
jpayne@69
|
658
|
jpayne@69
|
659 // Return the smallest n >= Start s.t.
|
jpayne@69
|
660 // prob [>= e errors in n binomial trials (p = error prob)]
|
jpayne@69
|
661 // > Limit
|
jpayne@69
|
662
|
jpayne@69
|
663 {
|
jpayne@69
|
664 double Normal_Z, Mu_Power, Factorial, Poisson_Coeff;
|
jpayne@69
|
665 double q, Sum, P_Power, Q_Power, X;
|
jpayne@69
|
666 int k, n, Bin_Coeff, Ct;
|
jpayne@69
|
667
|
jpayne@69
|
668 q = 1.0 - p;
|
jpayne@69
|
669 if (Start < e)
|
jpayne@69
|
670 Start = e;
|
jpayne@69
|
671
|
jpayne@69
|
672 for (n = Start; n < MAX_FRAG_LEN; n ++)
|
jpayne@69
|
673 {
|
jpayne@69
|
674 if (n <= 35)
|
jpayne@69
|
675 {
|
jpayne@69
|
676 Sum = 0.0;
|
jpayne@69
|
677 Bin_Coeff = 1;
|
jpayne@69
|
678 Ct = 0;
|
jpayne@69
|
679 P_Power = 1.0;
|
jpayne@69
|
680 Q_Power = pow (q, (double) n);
|
jpayne@69
|
681
|
jpayne@69
|
682 for (k = 0; k < e && 1.0 - Sum > Limit; k ++)
|
jpayne@69
|
683 {
|
jpayne@69
|
684 X = Bin_Coeff * P_Power * Q_Power;
|
jpayne@69
|
685 Sum += X;
|
jpayne@69
|
686 Bin_Coeff *= n - Ct;
|
jpayne@69
|
687 Bin_Coeff /= ++ Ct;
|
jpayne@69
|
688 P_Power *= p;
|
jpayne@69
|
689 Q_Power /= q;
|
jpayne@69
|
690 }
|
jpayne@69
|
691 if (1.0 - Sum > Limit)
|
jpayne@69
|
692 return n;
|
jpayne@69
|
693 }
|
jpayne@69
|
694 else
|
jpayne@69
|
695 {
|
jpayne@69
|
696 Normal_Z = (e - 0.5 - n * p) / sqrt ((double)(n * p * q));
|
jpayne@69
|
697 if (Normal_Z <= NORMAL_DISTRIB_THOLD)
|
jpayne@69
|
698 return n;
|
jpayne@69
|
699 Sum = 0.0;
|
jpayne@69
|
700 Mu_Power = 1.0;
|
jpayne@69
|
701 Factorial = 1.0;
|
jpayne@69
|
702 Poisson_Coeff = exp (- (double) n * p);
|
jpayne@69
|
703 for (k = 0; k < e; k ++)
|
jpayne@69
|
704 {
|
jpayne@69
|
705 Sum += Mu_Power * Poisson_Coeff / Factorial;
|
jpayne@69
|
706 Mu_Power *= n * p;
|
jpayne@69
|
707 Factorial *= k + 1;
|
jpayne@69
|
708 }
|
jpayne@69
|
709 if (1.0 - Sum > Limit)
|
jpayne@69
|
710 return n;
|
jpayne@69
|
711 }
|
jpayne@69
|
712 }
|
jpayne@69
|
713
|
jpayne@69
|
714 return MAX_FRAG_LEN;
|
jpayne@69
|
715 }
|
jpayne@69
|
716
|
jpayne@69
|
717
|
jpayne@69
|
718
|
jpayne@69
|
719 #define DISPLAY_WIDTH 60
|
jpayne@69
|
720
|
jpayne@69
|
721 void Display_Alignment
|
jpayne@69
|
722 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
|
jpayne@69
|
723
|
jpayne@69
|
724 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
|
jpayne@69
|
725 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
|
jpayne@69
|
726
|
jpayne@69
|
727 {
|
jpayne@69
|
728 int i, j, k, m, top_len, bottom_len;
|
jpayne@69
|
729 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
|
jpayne@69
|
730 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
|
jpayne@69
|
731
|
jpayne@69
|
732 i = j = top_len = bottom_len = 0;
|
jpayne@69
|
733 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
734 {
|
jpayne@69
|
735 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
736 {
|
jpayne@69
|
737 top [top_len ++] = a [i ++];
|
jpayne@69
|
738 j ++;
|
jpayne@69
|
739 }
|
jpayne@69
|
740 if (delta [k] < 0)
|
jpayne@69
|
741 {
|
jpayne@69
|
742 top [top_len ++] = '-';
|
jpayne@69
|
743 j ++;
|
jpayne@69
|
744 }
|
jpayne@69
|
745 else
|
jpayne@69
|
746 {
|
jpayne@69
|
747 top [top_len ++] = a [i ++];
|
jpayne@69
|
748 }
|
jpayne@69
|
749 }
|
jpayne@69
|
750 while (i < a_len && j < b_len)
|
jpayne@69
|
751 {
|
jpayne@69
|
752 top [top_len ++] = a [i ++];
|
jpayne@69
|
753 j ++;
|
jpayne@69
|
754 }
|
jpayne@69
|
755 top [top_len] = '\0';
|
jpayne@69
|
756
|
jpayne@69
|
757
|
jpayne@69
|
758 i = j = 0;
|
jpayne@69
|
759 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
760 {
|
jpayne@69
|
761 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
762 {
|
jpayne@69
|
763 bottom [bottom_len ++] = b [j ++];
|
jpayne@69
|
764 i ++;
|
jpayne@69
|
765 }
|
jpayne@69
|
766 if (delta [k] > 0)
|
jpayne@69
|
767 {
|
jpayne@69
|
768 bottom [bottom_len ++] = '-';
|
jpayne@69
|
769 i ++;
|
jpayne@69
|
770 }
|
jpayne@69
|
771 else
|
jpayne@69
|
772 {
|
jpayne@69
|
773 bottom [bottom_len ++] = b [j ++];
|
jpayne@69
|
774 }
|
jpayne@69
|
775 }
|
jpayne@69
|
776 while (j < b_len && i < a_len)
|
jpayne@69
|
777 {
|
jpayne@69
|
778 bottom [bottom_len ++] = b [j ++];
|
jpayne@69
|
779 i ++;
|
jpayne@69
|
780 }
|
jpayne@69
|
781 bottom [bottom_len] = '\0';
|
jpayne@69
|
782
|
jpayne@69
|
783
|
jpayne@69
|
784 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
|
jpayne@69
|
785 {
|
jpayne@69
|
786 printf ("A: ");
|
jpayne@69
|
787 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
|
jpayne@69
|
788 putchar (top [i + j]);
|
jpayne@69
|
789 putchar ('\n');
|
jpayne@69
|
790 printf ("B: ");
|
jpayne@69
|
791 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
|
jpayne@69
|
792 putchar (bottom [i + j]);
|
jpayne@69
|
793 putchar ('\n');
|
jpayne@69
|
794 printf (" ");
|
jpayne@69
|
795 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
|
jpayne@69
|
796 j ++)
|
jpayne@69
|
797 if (top [i + j] != ' ' && bottom [i + j] != ' '
|
jpayne@69
|
798 && top [i + j] != bottom [i + j])
|
jpayne@69
|
799 putchar ('^');
|
jpayne@69
|
800 else
|
jpayne@69
|
801 putchar (' ');
|
jpayne@69
|
802 putchar ('\n');
|
jpayne@69
|
803 }
|
jpayne@69
|
804
|
jpayne@69
|
805 return;
|
jpayne@69
|
806 }
|
jpayne@69
|
807
|
jpayne@69
|
808
|
jpayne@69
|
809
|
jpayne@69
|
810 void Display_Alignment_With_Pad
|
jpayne@69
|
811 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
|
jpayne@69
|
812 char * left_pad [3], char * right_pad [3], int pad_len)
|
jpayne@69
|
813
|
jpayne@69
|
814 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
|
jpayne@69
|
815 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
|
jpayne@69
|
816 // Attach pad strings in left_pad to the beginning of the alignment
|
jpayne@69
|
817 // and the strings in right_pad to the end of the alignment.
|
jpayne@69
|
818 // pad_len is the width of the pad strings
|
jpayne@69
|
819
|
jpayne@69
|
820 {
|
jpayne@69
|
821 int i, j, k, m, top_len, bottom_len;
|
jpayne@69
|
822 #if 0
|
jpayne@69
|
823 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
|
jpayne@69
|
824 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
|
jpayne@69
|
825 #else
|
jpayne@69
|
826 string top, bottom;
|
jpayne@69
|
827 #endif
|
jpayne@69
|
828
|
jpayne@69
|
829 for (k = 0; k < pad_len; k ++)
|
jpayne@69
|
830 {
|
jpayne@69
|
831 top . push_back (left_pad [0] [k]);
|
jpayne@69
|
832 bottom . push_back (left_pad [1] [k]);
|
jpayne@69
|
833 }
|
jpayne@69
|
834
|
jpayne@69
|
835 i = j = 0;
|
jpayne@69
|
836 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
837 {
|
jpayne@69
|
838 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
839 {
|
jpayne@69
|
840 top . push_back (a [i ++]);
|
jpayne@69
|
841 j ++;
|
jpayne@69
|
842 }
|
jpayne@69
|
843 if (delta [k] < 0)
|
jpayne@69
|
844 {
|
jpayne@69
|
845 top . push_back ('-');
|
jpayne@69
|
846 j ++;
|
jpayne@69
|
847 }
|
jpayne@69
|
848 else
|
jpayne@69
|
849 {
|
jpayne@69
|
850 top . push_back (a [i ++]);
|
jpayne@69
|
851 }
|
jpayne@69
|
852 }
|
jpayne@69
|
853 while (i < a_len && j < b_len)
|
jpayne@69
|
854 {
|
jpayne@69
|
855 top . push_back (a [i ++]);
|
jpayne@69
|
856 j ++;
|
jpayne@69
|
857 }
|
jpayne@69
|
858
|
jpayne@69
|
859 i = j = 0;
|
jpayne@69
|
860 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
861 {
|
jpayne@69
|
862 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
863 {
|
jpayne@69
|
864 bottom . push_back (b [j ++]);
|
jpayne@69
|
865 i ++;
|
jpayne@69
|
866 }
|
jpayne@69
|
867 if (delta [k] > 0)
|
jpayne@69
|
868 {
|
jpayne@69
|
869 bottom . push_back ('-');
|
jpayne@69
|
870 i ++;
|
jpayne@69
|
871 }
|
jpayne@69
|
872 else
|
jpayne@69
|
873 {
|
jpayne@69
|
874 bottom . push_back (b [j ++]);
|
jpayne@69
|
875 }
|
jpayne@69
|
876 }
|
jpayne@69
|
877 while (j < b_len && i < a_len)
|
jpayne@69
|
878 {
|
jpayne@69
|
879 bottom . push_back (b [j ++]);
|
jpayne@69
|
880 i ++;
|
jpayne@69
|
881 }
|
jpayne@69
|
882
|
jpayne@69
|
883 for (k = 0; k < pad_len; k ++)
|
jpayne@69
|
884 {
|
jpayne@69
|
885 top . push_back (right_pad [0] [k]);
|
jpayne@69
|
886 bottom . push_back (right_pad [1] [k]);
|
jpayne@69
|
887 }
|
jpayne@69
|
888
|
jpayne@69
|
889 top_len = top . length ();
|
jpayne@69
|
890 bottom_len = bottom . length ();
|
jpayne@69
|
891
|
jpayne@69
|
892 if (top_len != bottom_len)
|
jpayne@69
|
893 {
|
jpayne@69
|
894 fprintf (stderr, "ERROR: Alignment length mismatch top = %d bottom = %d\n",
|
jpayne@69
|
895 top_len, bottom_len);
|
jpayne@69
|
896 exit (EXIT_FAILURE);
|
jpayne@69
|
897 }
|
jpayne@69
|
898
|
jpayne@69
|
899 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
|
jpayne@69
|
900 {
|
jpayne@69
|
901 printf ("A: ");
|
jpayne@69
|
902 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
|
jpayne@69
|
903 putchar (top [i + j]);
|
jpayne@69
|
904 putchar ('\n');
|
jpayne@69
|
905 printf ("B: ");
|
jpayne@69
|
906 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
|
jpayne@69
|
907 putchar (bottom [i + j]);
|
jpayne@69
|
908 putchar ('\n');
|
jpayne@69
|
909 printf (" ");
|
jpayne@69
|
910 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
|
jpayne@69
|
911 j ++)
|
jpayne@69
|
912 if (i + j < pad_len)
|
jpayne@69
|
913 putchar (left_pad [2] [i + j]);
|
jpayne@69
|
914 else if (top_len - i - j <= pad_len)
|
jpayne@69
|
915 putchar (right_pad [2] [i + j - (top_len - pad_len)]);
|
jpayne@69
|
916 else if (top [i + j] != ' ' && bottom [i + j] != ' '
|
jpayne@69
|
917 && top [i + j] != bottom [i + j])
|
jpayne@69
|
918 putchar ('^');
|
jpayne@69
|
919 else
|
jpayne@69
|
920 putchar (' ');
|
jpayne@69
|
921 putchar ('\n');
|
jpayne@69
|
922 }
|
jpayne@69
|
923
|
jpayne@69
|
924 return;
|
jpayne@69
|
925 }
|
jpayne@69
|
926
|
jpayne@69
|
927
|
jpayne@69
|
928
|
jpayne@69
|
929 void Display_Difference_Positions
|
jpayne@69
|
930 (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
|
jpayne@69
|
931 int b_incr, int delta [], int delta_ct, const char * a_tag,
|
jpayne@69
|
932 const char * b_tag)
|
jpayne@69
|
933
|
jpayne@69
|
934 // Show (to stdout ) the positions indicated in delta [0 .. (delta_ct - 1)]
|
jpayne@69
|
935 // between strings a [0 .. (a_len - 1)] and
|
jpayne@69
|
936 // b [0 .. (b_len - 1))] . The positions of the starts of the strings
|
jpayne@69
|
937 // are at a_start and b_start , respectively, and b_incr indicates
|
jpayne@69
|
938 // if the b postions increase or decrease. b_incr must be 1 or -1.
|
jpayne@69
|
939 // Use a_tag and b_tag to label the positions.
|
jpayne@69
|
940
|
jpayne@69
|
941 {
|
jpayne@69
|
942 int i, j, k, m;
|
jpayne@69
|
943
|
jpayne@69
|
944 assert (b_incr == 1 || b_incr == -1);
|
jpayne@69
|
945
|
jpayne@69
|
946 i = j = 0;
|
jpayne@69
|
947 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
948 {
|
jpayne@69
|
949 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
950 {
|
jpayne@69
|
951 if (a [i] != b [j])
|
jpayne@69
|
952 printf ("%-15s %-15s %8d %8d %c %c\n",
|
jpayne@69
|
953 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
|
jpayne@69
|
954 i ++;
|
jpayne@69
|
955 j ++;
|
jpayne@69
|
956 }
|
jpayne@69
|
957 if (delta [k] < 0)
|
jpayne@69
|
958 {
|
jpayne@69
|
959 printf ("%-15s %-15s %8d %8d %c %c\n",
|
jpayne@69
|
960 a_tag, b_tag, a_start + i, b_start + j * b_incr, '-', b [j]);
|
jpayne@69
|
961 j ++;
|
jpayne@69
|
962 }
|
jpayne@69
|
963 else
|
jpayne@69
|
964 {
|
jpayne@69
|
965 if (b_incr > 0)
|
jpayne@69
|
966 printf ("%-15s %-15s %8d %8d %c %c\n",
|
jpayne@69
|
967 a_tag, b_tag, a_start + i, b_start + j, a [i], '-');
|
jpayne@69
|
968 else
|
jpayne@69
|
969 printf ("%-15s %-15s %8d %8d %c %c\n",
|
jpayne@69
|
970 a_tag, b_tag, a_start + i, b_start - j + 1, a [i], '-');
|
jpayne@69
|
971 i ++;
|
jpayne@69
|
972 }
|
jpayne@69
|
973 }
|
jpayne@69
|
974 while (i < a_len && j < b_len)
|
jpayne@69
|
975 {
|
jpayne@69
|
976 if (a [i] != b [j])
|
jpayne@69
|
977 printf ("%-15s %-15s %8d %8d %c %c\n",
|
jpayne@69
|
978 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
|
jpayne@69
|
979 i ++;
|
jpayne@69
|
980 j ++;
|
jpayne@69
|
981 }
|
jpayne@69
|
982
|
jpayne@69
|
983 return;
|
jpayne@69
|
984 }
|
jpayne@69
|
985
|
jpayne@69
|
986
|
jpayne@69
|
987
|
jpayne@69
|
988 int Extend_Backward
|
jpayne@69
|
989 (long int * ref_lo, long int * query_lo)
|
jpayne@69
|
990
|
jpayne@69
|
991 // Do edit distance off end of match beginning at (* ref_lo) and
|
jpayne@69
|
992 // (* query_lo) in the reference and query sequences, resp.,
|
jpayne@69
|
993 // in the reverse direction.
|
jpayne@69
|
994 // Return the number of errors in the extension and set (* ref_hi)
|
jpayne@69
|
995 // and (* query_hi) to the end of the extension.
|
jpayne@69
|
996
|
jpayne@69
|
997 {
|
jpayne@69
|
998 int ct = 0, errors, sum = 0;
|
jpayne@69
|
999 int a_len, b_len;
|
jpayne@69
|
1000 int a_end, b_end, match_to_end;
|
jpayne@69
|
1001 int delta [MAX_ERRORS], delta_len;
|
jpayne@69
|
1002
|
jpayne@69
|
1003 do
|
jpayne@69
|
1004 {
|
jpayne@69
|
1005 a_len = (* ref_lo) - 1;
|
jpayne@69
|
1006 if (a_len > MAX_EXTENSION)
|
jpayne@69
|
1007 a_len = MAX_EXTENSION;
|
jpayne@69
|
1008 b_len = (* query_lo) - 1;
|
jpayne@69
|
1009 if (b_len > MAX_EXTENSION)
|
jpayne@69
|
1010 b_len = MAX_EXTENSION;
|
jpayne@69
|
1011
|
jpayne@69
|
1012 errors = Rev_Prefix_Edit_Dist
|
jpayne@69
|
1013 (Ref + (* ref_lo) - 1, a_len,
|
jpayne@69
|
1014 Query + (* query_lo) - 1, b_len,
|
jpayne@69
|
1015 MAX_ERRORS - 1, & a_end, & b_end,
|
jpayne@69
|
1016 & match_to_end, delta, & delta_len, TRUE);
|
jpayne@69
|
1017 if (Show_Differences)
|
jpayne@69
|
1018 Rev_Show_Diffs
|
jpayne@69
|
1019 (Ref + (* ref_lo - 1), a_end,
|
jpayne@69
|
1020 Query + (* query_lo - 1), b_end,
|
jpayne@69
|
1021 delta, delta_len, (* ref_lo) - 1, (* query_lo) - 1);
|
jpayne@69
|
1022 if (Verbose > 0)
|
jpayne@69
|
1023 {
|
jpayne@69
|
1024 printf ("revextend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n",
|
jpayne@69
|
1025 ++ ct, (* ref_lo) - 1, (* query_lo) - 1, a_len, b_len,
|
jpayne@69
|
1026 errors, match_to_end ? 'T' : 'F', a_end, b_end);
|
jpayne@69
|
1027 Rev_Display_Alignment
|
jpayne@69
|
1028 (Ref + (* ref_lo) - 1, a_end, Query + (* query_lo) - 1, b_end,
|
jpayne@69
|
1029 delta, delta_len);
|
jpayne@69
|
1030 }
|
jpayne@69
|
1031
|
jpayne@69
|
1032 (* ref_lo) -= a_end;
|
jpayne@69
|
1033 (* query_lo) -= b_end;
|
jpayne@69
|
1034 sum += errors;
|
jpayne@69
|
1035 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
|
jpayne@69
|
1036
|
jpayne@69
|
1037 return sum;
|
jpayne@69
|
1038 }
|
jpayne@69
|
1039
|
jpayne@69
|
1040
|
jpayne@69
|
1041
|
jpayne@69
|
1042 int Extend_Forward
|
jpayne@69
|
1043 (long int * ref_hi, long int * query_hi)
|
jpayne@69
|
1044
|
jpayne@69
|
1045 // Do edit distance off end of match ending at (* ref_hi) and
|
jpayne@69
|
1046 // (* query_hi) in the reference and query sequences, resp.
|
jpayne@69
|
1047 // Return the number of errors in the extension and set (* ref_hi)
|
jpayne@69
|
1048 // and (* query_hi) to the end of the extension.
|
jpayne@69
|
1049
|
jpayne@69
|
1050 {
|
jpayne@69
|
1051 int ct = 0, errors, sum = 0;
|
jpayne@69
|
1052 int a_end, b_end, match_to_end;
|
jpayne@69
|
1053 int a_len, b_len;
|
jpayne@69
|
1054 int delta [MAX_ERRORS], delta_len;
|
jpayne@69
|
1055
|
jpayne@69
|
1056 do
|
jpayne@69
|
1057 {
|
jpayne@69
|
1058 a_len = Ref_Len - (* ref_hi);
|
jpayne@69
|
1059 if (a_len > MAX_EXTENSION)
|
jpayne@69
|
1060 a_len = MAX_EXTENSION;
|
jpayne@69
|
1061 b_len = Query_Len - (* query_hi);
|
jpayne@69
|
1062 if (b_len > MAX_EXTENSION)
|
jpayne@69
|
1063 b_len = MAX_EXTENSION;
|
jpayne@69
|
1064
|
jpayne@69
|
1065 errors = Prefix_Edit_Dist
|
jpayne@69
|
1066 (Ref + (* ref_hi) + 1, a_len,
|
jpayne@69
|
1067 Query + (* query_hi) + 1, b_len,
|
jpayne@69
|
1068 MAX_ERRORS - 1, & a_end, & b_end,
|
jpayne@69
|
1069 & match_to_end, delta, & delta_len, TRUE);
|
jpayne@69
|
1070 if (Show_Differences)
|
jpayne@69
|
1071 Show_Diffs (Ref + (* ref_hi) + 1, a_end,
|
jpayne@69
|
1072 Query + (* query_hi) + 1, b_end,
|
jpayne@69
|
1073 delta, delta_len, (* ref_hi) + 1, (* query_hi) + 1);
|
jpayne@69
|
1074 if (Verbose > 0)
|
jpayne@69
|
1075 {
|
jpayne@69
|
1076 printf ("extend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n",
|
jpayne@69
|
1077 ++ ct, (* ref_hi) + 1, (* query_hi) + 1, a_len, b_len,
|
jpayne@69
|
1078 errors, match_to_end ? 'T' : 'F', a_end, b_end);
|
jpayne@69
|
1079 Display_Alignment (Ref + (* ref_hi) + 1, a_end,
|
jpayne@69
|
1080 Query + (* query_hi) + 1, b_end,
|
jpayne@69
|
1081 delta, delta_len);
|
jpayne@69
|
1082 }
|
jpayne@69
|
1083
|
jpayne@69
|
1084 (* ref_hi) += a_end;
|
jpayne@69
|
1085 (* query_hi) += b_end;
|
jpayne@69
|
1086 sum += errors;
|
jpayne@69
|
1087 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
|
jpayne@69
|
1088
|
jpayne@69
|
1089 return sum;
|
jpayne@69
|
1090 }
|
jpayne@69
|
1091
|
jpayne@69
|
1092
|
jpayne@69
|
1093 void Initialize_Globals
|
jpayne@69
|
1094 (void)
|
jpayne@69
|
1095
|
jpayne@69
|
1096 // Initialize global variables used in this program
|
jpayne@69
|
1097
|
jpayne@69
|
1098 {
|
jpayne@69
|
1099 int i, offset, del;
|
jpayne@69
|
1100 int e, start;
|
jpayne@69
|
1101
|
jpayne@69
|
1102 offset = 2;
|
jpayne@69
|
1103 del = 6;
|
jpayne@69
|
1104 for (i = 0; i < MAX_ERRORS; i ++)
|
jpayne@69
|
1105 {
|
jpayne@69
|
1106 Edit_Array [i] = Edit_Space + offset;
|
jpayne@69
|
1107 offset += del;
|
jpayne@69
|
1108 del += 2;
|
jpayne@69
|
1109 }
|
jpayne@69
|
1110
|
jpayne@69
|
1111
|
jpayne@69
|
1112 for (i = 0; i <= ERRORS_FOR_FREE; i ++)
|
jpayne@69
|
1113 Edit_Match_Limit [i] = 0;
|
jpayne@69
|
1114
|
jpayne@69
|
1115 start = 1;
|
jpayne@69
|
1116 for (e = ERRORS_FOR_FREE + 1; e < MAX_ERRORS; e ++)
|
jpayne@69
|
1117 {
|
jpayne@69
|
1118 start = Binomial_Bound (e - ERRORS_FOR_FREE, MAX_ERROR_RATE,
|
jpayne@69
|
1119 start, EDIT_DIST_PROB_BOUND);
|
jpayne@69
|
1120 Edit_Match_Limit [e] = start - 1;
|
jpayne@69
|
1121 assert (Edit_Match_Limit [e] >= Edit_Match_Limit [e - 1]);
|
jpayne@69
|
1122 }
|
jpayne@69
|
1123
|
jpayne@69
|
1124 for (i = 0; i <= MAX_FRAG_LEN; i ++)
|
jpayne@69
|
1125 Error_Bound [i] = Max_int (10, 1 + (int) (2 * i * MAX_ERROR_RATE));
|
jpayne@69
|
1126
|
jpayne@69
|
1127 return;
|
jpayne@69
|
1128 }
|
jpayne@69
|
1129
|
jpayne@69
|
1130
|
jpayne@69
|
1131
|
jpayne@69
|
1132 FILE * Local_File_Open
|
jpayne@69
|
1133 (const char * filename, const char * mode)
|
jpayne@69
|
1134
|
jpayne@69
|
1135 /* Open Filename in Mode and return a pointer to its control
|
jpayne@69
|
1136 * block. If fail, print a message and exit. */
|
jpayne@69
|
1137
|
jpayne@69
|
1138 {
|
jpayne@69
|
1139 FILE * fp;
|
jpayne@69
|
1140
|
jpayne@69
|
1141 fp = fopen (filename, mode);
|
jpayne@69
|
1142 if (fp == NULL)
|
jpayne@69
|
1143 {
|
jpayne@69
|
1144 fprintf (stderr, "ERROR %d: Could not open file %s \n",
|
jpayne@69
|
1145 errno, filename);
|
jpayne@69
|
1146 perror (strerror (errno));
|
jpayne@69
|
1147 exit (EXIT_FAILURE);
|
jpayne@69
|
1148 }
|
jpayne@69
|
1149
|
jpayne@69
|
1150 return fp;
|
jpayne@69
|
1151 }
|
jpayne@69
|
1152
|
jpayne@69
|
1153
|
jpayne@69
|
1154
|
jpayne@69
|
1155 int Max_int
|
jpayne@69
|
1156 (int a, int b)
|
jpayne@69
|
1157
|
jpayne@69
|
1158 // Return the larger of a and b .
|
jpayne@69
|
1159
|
jpayne@69
|
1160 {
|
jpayne@69
|
1161 if (a < b)
|
jpayne@69
|
1162 return b;
|
jpayne@69
|
1163
|
jpayne@69
|
1164 return a;
|
jpayne@69
|
1165 }
|
jpayne@69
|
1166
|
jpayne@69
|
1167
|
jpayne@69
|
1168
|
jpayne@69
|
1169 int Min_int
|
jpayne@69
|
1170 (int a, int b)
|
jpayne@69
|
1171
|
jpayne@69
|
1172 // Return the smaller of a and b .
|
jpayne@69
|
1173
|
jpayne@69
|
1174 {
|
jpayne@69
|
1175 if (a < b)
|
jpayne@69
|
1176 return a;
|
jpayne@69
|
1177
|
jpayne@69
|
1178 return b;
|
jpayne@69
|
1179 }
|
jpayne@69
|
1180
|
jpayne@69
|
1181
|
jpayne@69
|
1182
|
jpayne@69
|
1183 void Parse_Command_Line
|
jpayne@69
|
1184 (int argc, char * argv [])
|
jpayne@69
|
1185
|
jpayne@69
|
1186 // Get options and parameters from command line with argc
|
jpayne@69
|
1187 // arguments in argv [0 .. (argc - 1)] .
|
jpayne@69
|
1188
|
jpayne@69
|
1189 {
|
jpayne@69
|
1190 int ch, errflg = FALSE;
|
jpayne@69
|
1191
|
jpayne@69
|
1192 optarg = NULL;
|
jpayne@69
|
1193
|
jpayne@69
|
1194 while (! errflg
|
jpayne@69
|
1195 && ((ch = getopt (argc, argv, "De:nN:q:r:Stv:xW:")) != EOF))
|
jpayne@69
|
1196 switch (ch)
|
jpayne@69
|
1197 {
|
jpayne@69
|
1198 case 'D' :
|
jpayne@69
|
1199 Only_Difference_Positions = true;
|
jpayne@69
|
1200 break;
|
jpayne@69
|
1201
|
jpayne@69
|
1202 case 'e' :
|
jpayne@69
|
1203 Percent_ID = 1.0 - atof (optarg);
|
jpayne@69
|
1204 UserScoring = TRUE;
|
jpayne@69
|
1205 break;
|
jpayne@69
|
1206
|
jpayne@69
|
1207 case 'n' :
|
jpayne@69
|
1208 Nucleotides_Only = TRUE;
|
jpayne@69
|
1209 break;
|
jpayne@69
|
1210
|
jpayne@69
|
1211 case 'N' :
|
jpayne@69
|
1212 Consec_Non_ACGT = strtol (optarg, NULL, 10);
|
jpayne@69
|
1213 break;
|
jpayne@69
|
1214
|
jpayne@69
|
1215 case 'q' :
|
jpayne@69
|
1216 Query_Suffix = optarg;
|
jpayne@69
|
1217 break;
|
jpayne@69
|
1218
|
jpayne@69
|
1219 case 'r' :
|
jpayne@69
|
1220 Ref_Suffix = optarg;
|
jpayne@69
|
1221 break;
|
jpayne@69
|
1222
|
jpayne@69
|
1223 case 'S' :
|
jpayne@69
|
1224 Show_Differences = TRUE;
|
jpayne@69
|
1225 break;
|
jpayne@69
|
1226
|
jpayne@69
|
1227 case 't' :
|
jpayne@69
|
1228 Tag_From_Fasta_Line = TRUE;
|
jpayne@69
|
1229 break;
|
jpayne@69
|
1230
|
jpayne@69
|
1231 case 'v' :
|
jpayne@69
|
1232 Verbose = strtol (optarg, NULL, 10);
|
jpayne@69
|
1233 break;
|
jpayne@69
|
1234
|
jpayne@69
|
1235 case 'x' :
|
jpayne@69
|
1236 Output_Cover_Files = FALSE;
|
jpayne@69
|
1237 break;
|
jpayne@69
|
1238
|
jpayne@69
|
1239 case 'W' :
|
jpayne@69
|
1240 Error_File_Name = optarg;
|
jpayne@69
|
1241 break;
|
jpayne@69
|
1242
|
jpayne@69
|
1243 case '?' :
|
jpayne@69
|
1244 fprintf (stderr, "Unrecognized option -%c\n", optopt);
|
jpayne@69
|
1245
|
jpayne@69
|
1246 default :
|
jpayne@69
|
1247 errflg = TRUE;
|
jpayne@69
|
1248 }
|
jpayne@69
|
1249
|
jpayne@69
|
1250 if (errflg || optind != argc - 3)
|
jpayne@69
|
1251 {
|
jpayne@69
|
1252 Usage (argv [0]);
|
jpayne@69
|
1253 exit (EXIT_FAILURE);
|
jpayne@69
|
1254 }
|
jpayne@69
|
1255
|
jpayne@69
|
1256 Ref_File_Path = argv [optind ++];
|
jpayne@69
|
1257
|
jpayne@69
|
1258 Match_File_Path = argv [optind ++];
|
jpayne@69
|
1259
|
jpayne@69
|
1260 Gaps_File_Path = argv [optind ++];
|
jpayne@69
|
1261
|
jpayne@69
|
1262 return;
|
jpayne@69
|
1263 }
|
jpayne@69
|
1264
|
jpayne@69
|
1265
|
jpayne@69
|
1266
|
jpayne@69
|
1267 int Prefix_Edit_Dist
|
jpayne@69
|
1268 (char A [], int m, char T [], int n, int Error_Limit,
|
jpayne@69
|
1269 int * A_End, int * T_End, int * Match_To_End,
|
jpayne@69
|
1270 int Delta [MAX_ERRORS], int * Delta_Len, int extending)
|
jpayne@69
|
1271
|
jpayne@69
|
1272 // Return the minimum number of changes (inserts, deletes, replacements)
|
jpayne@69
|
1273 // needed to match string A [0 .. (m-1)] with a prefix of string
|
jpayne@69
|
1274 // T [0 .. (n-1)] if it's not more than Error_Limit .
|
jpayne@69
|
1275 // Put delta description of alignment in Delta and set
|
jpayne@69
|
1276 // (* Delta_Len) to the number of entries there if it's a complete
|
jpayne@69
|
1277 // match.
|
jpayne@69
|
1278 // Set A_End and T_End to the rightmost positions where the
|
jpayne@69
|
1279 // alignment ended in A and T , respectively.
|
jpayne@69
|
1280 // Set Match_To_End true if the match extended to the end
|
jpayne@69
|
1281 // of at least one string; otherwise, set it false to indicate
|
jpayne@69
|
1282 // a branch point.
|
jpayne@69
|
1283 // extending indicates whether the match is for a fixed region (FALSE)
|
jpayne@69
|
1284 // or is extending off the end of a match as far as possible (TRUE)
|
jpayne@69
|
1285
|
jpayne@69
|
1286 {
|
jpayne@69
|
1287 double Score, Max_Score;
|
jpayne@69
|
1288 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
|
jpayne@69
|
1289 int Best_d, Best_e, Longest, Row;
|
jpayne@69
|
1290 int Left, Right;
|
jpayne@69
|
1291 int d, e, i, j, shorter;
|
jpayne@69
|
1292
|
jpayne@69
|
1293 // assert (m <= n);
|
jpayne@69
|
1294 Best_d = Best_e = Longest = 0;
|
jpayne@69
|
1295 (* Delta_Len) = 0;
|
jpayne@69
|
1296
|
jpayne@69
|
1297 if (Consec_Non_ACGT > 0)
|
jpayne@69
|
1298 {
|
jpayne@69
|
1299 int ct;
|
jpayne@69
|
1300
|
jpayne@69
|
1301 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++)
|
jpayne@69
|
1302 {
|
jpayne@69
|
1303 switch (A [i])
|
jpayne@69
|
1304 {
|
jpayne@69
|
1305 case 'a' :
|
jpayne@69
|
1306 case 'c' :
|
jpayne@69
|
1307 case 'g' :
|
jpayne@69
|
1308 case 't' :
|
jpayne@69
|
1309 ct = 0;
|
jpayne@69
|
1310 break;
|
jpayne@69
|
1311 default :
|
jpayne@69
|
1312 ct ++;
|
jpayne@69
|
1313 }
|
jpayne@69
|
1314 }
|
jpayne@69
|
1315 if (ct >= Consec_Non_ACGT)
|
jpayne@69
|
1316 {
|
jpayne@69
|
1317 m = i - ct;
|
jpayne@69
|
1318 extending = TRUE;
|
jpayne@69
|
1319 }
|
jpayne@69
|
1320
|
jpayne@69
|
1321 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++)
|
jpayne@69
|
1322 {
|
jpayne@69
|
1323 switch (T [i])
|
jpayne@69
|
1324 {
|
jpayne@69
|
1325 case 'a' :
|
jpayne@69
|
1326 case 'c' :
|
jpayne@69
|
1327 case 'g' :
|
jpayne@69
|
1328 case 't' :
|
jpayne@69
|
1329 ct = 0;
|
jpayne@69
|
1330 break;
|
jpayne@69
|
1331 default :
|
jpayne@69
|
1332 ct ++;
|
jpayne@69
|
1333 }
|
jpayne@69
|
1334 }
|
jpayne@69
|
1335 if (ct >= Consec_Non_ACGT)
|
jpayne@69
|
1336 {
|
jpayne@69
|
1337 n = i - ct;
|
jpayne@69
|
1338 extending = TRUE;
|
jpayne@69
|
1339 }
|
jpayne@69
|
1340 }
|
jpayne@69
|
1341
|
jpayne@69
|
1342 shorter = Min_int (m, n);
|
jpayne@69
|
1343 for (Row = 0; Row < shorter && A [Row] == T [Row]; Row ++)
|
jpayne@69
|
1344 ;
|
jpayne@69
|
1345
|
jpayne@69
|
1346 Edit_Array [0] [0] = Row;
|
jpayne@69
|
1347
|
jpayne@69
|
1348 if (Row == m && Row == n) // Exact match
|
jpayne@69
|
1349 {
|
jpayne@69
|
1350 (* Delta_Len) = 0;
|
jpayne@69
|
1351 (* A_End) = (* T_End) = Row;
|
jpayne@69
|
1352 (* Match_To_End) = ! extending;
|
jpayne@69
|
1353 return 0;
|
jpayne@69
|
1354 }
|
jpayne@69
|
1355
|
jpayne@69
|
1356 Left = Right = 0;
|
jpayne@69
|
1357 Max_Score = Row * Branch_Pt_Match_Value;
|
jpayne@69
|
1358 Max_Score_Len = Row;
|
jpayne@69
|
1359 Max_Score_Best_d = 0;
|
jpayne@69
|
1360 Max_Score_Best_e = 0;
|
jpayne@69
|
1361
|
jpayne@69
|
1362 for (e = 1; e <= Error_Limit; e ++)
|
jpayne@69
|
1363 {
|
jpayne@69
|
1364 int cutoff;
|
jpayne@69
|
1365
|
jpayne@69
|
1366 Left = Max_int (Left - 1, -e);
|
jpayne@69
|
1367 Right = Min_int (Right + 1, e);
|
jpayne@69
|
1368 Edit_Array [e - 1] [Left] = -2;
|
jpayne@69
|
1369 Edit_Array [e - 1] [Left - 1] = -2;
|
jpayne@69
|
1370 Edit_Array [e - 1] [Right] = -2;
|
jpayne@69
|
1371 Edit_Array [e - 1] [Right + 1] = -2;
|
jpayne@69
|
1372
|
jpayne@69
|
1373 for (d = Left; d <= Right; d ++)
|
jpayne@69
|
1374 {
|
jpayne@69
|
1375 Row = 1 + Edit_Array [e - 1] [d];
|
jpayne@69
|
1376 if ((j = Edit_Array [e - 1] [d - 1]) > Row)
|
jpayne@69
|
1377 Row = j;
|
jpayne@69
|
1378 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
|
jpayne@69
|
1379 Row = j;
|
jpayne@69
|
1380 while (Row < m && Row + d < n
|
jpayne@69
|
1381 && A [Row] == T [Row + d])
|
jpayne@69
|
1382 Row ++;
|
jpayne@69
|
1383
|
jpayne@69
|
1384 Edit_Array [e] [d] = Row;
|
jpayne@69
|
1385
|
jpayne@69
|
1386 if (Row == m && Row + d == n)
|
jpayne@69
|
1387 {
|
jpayne@69
|
1388 if (extending)
|
jpayne@69
|
1389 {
|
jpayne@69
|
1390 for (j = Left; j <= d; j ++)
|
jpayne@69
|
1391 if (Edit_Array [e] [j] > Longest)
|
jpayne@69
|
1392 {
|
jpayne@69
|
1393 Best_d = j;
|
jpayne@69
|
1394 Longest = Edit_Array [e] [j];
|
jpayne@69
|
1395 }
|
jpayne@69
|
1396 Score = Longest * Branch_Pt_Match_Value - e;
|
jpayne@69
|
1397 if (Score < Max_Score)
|
jpayne@69
|
1398 {
|
jpayne@69
|
1399 (* A_End) = Max_Score_Len;
|
jpayne@69
|
1400 (* T_End) = Max_Score_Len + Max_Score_Best_d;
|
jpayne@69
|
1401 Set_Deltas (Delta, Delta_Len, Max_Score_Len,
|
jpayne@69
|
1402 Max_Score_Best_d, Max_Score_Best_e);
|
jpayne@69
|
1403 (* Match_To_End) = FALSE;
|
jpayne@69
|
1404 return Max_Score_Best_e;
|
jpayne@69
|
1405 }
|
jpayne@69
|
1406 }
|
jpayne@69
|
1407
|
jpayne@69
|
1408 (* A_End) = Row; // One past last align position
|
jpayne@69
|
1409 (* T_End) = Row + d;
|
jpayne@69
|
1410 Set_Deltas (Delta, Delta_Len, Row, d, e);
|
jpayne@69
|
1411 (* Match_To_End) = ! extending;
|
jpayne@69
|
1412 return e;
|
jpayne@69
|
1413 }
|
jpayne@69
|
1414 }
|
jpayne@69
|
1415
|
jpayne@69
|
1416 cutoff = Longest - GIVE_UP_LEN;
|
jpayne@69
|
1417 while (Left <= Right && Edit_Array [e] [Left] < cutoff)
|
jpayne@69
|
1418 Left ++;
|
jpayne@69
|
1419 if (Left > Right)
|
jpayne@69
|
1420 break;
|
jpayne@69
|
1421 while (Right > Left && Edit_Array [e] [Right] < cutoff)
|
jpayne@69
|
1422 Right --;
|
jpayne@69
|
1423
|
jpayne@69
|
1424 assert (Left <= Right);
|
jpayne@69
|
1425
|
jpayne@69
|
1426 for (d = Left; d <= Right; d ++)
|
jpayne@69
|
1427 if (Edit_Array [e] [d] > Longest)
|
jpayne@69
|
1428 {
|
jpayne@69
|
1429 Best_d = d;
|
jpayne@69
|
1430 Best_e = e;
|
jpayne@69
|
1431 Longest = Edit_Array [e] [d];
|
jpayne@69
|
1432 }
|
jpayne@69
|
1433
|
jpayne@69
|
1434 Score = Longest * Branch_Pt_Match_Value - e;
|
jpayne@69
|
1435 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
|
jpayne@69
|
1436 if (Score > Max_Score)
|
jpayne@69
|
1437 {
|
jpayne@69
|
1438 Max_Score = Score;
|
jpayne@69
|
1439 Max_Score_Len = Longest;
|
jpayne@69
|
1440 Max_Score_Best_d = Best_d;
|
jpayne@69
|
1441 Max_Score_Best_e = Best_e;
|
jpayne@69
|
1442 }
|
jpayne@69
|
1443
|
jpayne@69
|
1444 if (Longest - Max_Score_Len >= GIVE_UP_LEN)
|
jpayne@69
|
1445 break;
|
jpayne@69
|
1446 }
|
jpayne@69
|
1447
|
jpayne@69
|
1448 (* A_End) = Max_Score_Len;
|
jpayne@69
|
1449 (* T_End) = Max_Score_Len + Max_Score_Best_d;
|
jpayne@69
|
1450 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
|
jpayne@69
|
1451 (* Match_To_End) = FALSE;
|
jpayne@69
|
1452
|
jpayne@69
|
1453 return Max_Score_Best_e;
|
jpayne@69
|
1454 }
|
jpayne@69
|
1455
|
jpayne@69
|
1456
|
jpayne@69
|
1457
|
jpayne@69
|
1458 int Read_String
|
jpayne@69
|
1459 (FILE * fp, char * * T, long int * Size, char header [])
|
jpayne@69
|
1460
|
jpayne@69
|
1461 /* Read next string from fp (assuming FASTA format) into (* T) [1 ..]
|
jpayne@69
|
1462 * which has Size characters. Allocate extra memory if needed
|
jpayne@69
|
1463 * and adjust Size accordingly. Return TRUE if successful, FALSE
|
jpayne@69
|
1464 * otherwise (e.g., EOF). Set header to the contents of the FASTA
|
jpayne@69
|
1465 * header line. */
|
jpayne@69
|
1466
|
jpayne@69
|
1467 {
|
jpayne@69
|
1468 char Line [MAX_LINE];
|
jpayne@69
|
1469 long int Len;
|
jpayne@69
|
1470 int Ch, Ct;
|
jpayne@69
|
1471
|
jpayne@69
|
1472 if (feof (fp))
|
jpayne@69
|
1473 return FALSE;
|
jpayne@69
|
1474
|
jpayne@69
|
1475 while ((Ch = fgetc (fp)) != EOF && Ch != '>')
|
jpayne@69
|
1476 ;
|
jpayne@69
|
1477
|
jpayne@69
|
1478 if (Ch != '>')
|
jpayne@69
|
1479 return FALSE;
|
jpayne@69
|
1480
|
jpayne@69
|
1481 fgets (Line, MAX_LINE, fp);
|
jpayne@69
|
1482 Len = strlen (Line);
|
jpayne@69
|
1483 assert (Len > 0 && Line [Len - 1] == '\n');
|
jpayne@69
|
1484 Line [Len - 1] = '\0';
|
jpayne@69
|
1485 strcpy (header, Line);
|
jpayne@69
|
1486
|
jpayne@69
|
1487
|
jpayne@69
|
1488 Ct = 0;
|
jpayne@69
|
1489 Len = 0;
|
jpayne@69
|
1490 while ((Ch = fgetc (fp)) != EOF && Ch != '>')
|
jpayne@69
|
1491 {
|
jpayne@69
|
1492 if (isspace (Ch))
|
jpayne@69
|
1493 continue;
|
jpayne@69
|
1494
|
jpayne@69
|
1495 Ct ++;
|
jpayne@69
|
1496
|
jpayne@69
|
1497 if (Ct >= (* Size) - 1)
|
jpayne@69
|
1498 {
|
jpayne@69
|
1499 (* Size) = (long int) ((* Size) * EXPANSION_FACTOR);
|
jpayne@69
|
1500 (* T) = (char *) Safe_realloc ((* T), (* Size));
|
jpayne@69
|
1501 }
|
jpayne@69
|
1502 Ch = tolower (Ch);
|
jpayne@69
|
1503 if (! isalpha (Ch))
|
jpayne@69
|
1504 {
|
jpayne@69
|
1505 fprintf (stderr, "Unexpected character `%c\' in string %s\n",
|
jpayne@69
|
1506 Ch, header);
|
jpayne@69
|
1507 Ch = 'x';
|
jpayne@69
|
1508 }
|
jpayne@69
|
1509 (* T) [++ Len] = Ch;
|
jpayne@69
|
1510 }
|
jpayne@69
|
1511
|
jpayne@69
|
1512 (* T) [Len + 1] = '\0';
|
jpayne@69
|
1513 if (Ch == '>')
|
jpayne@69
|
1514 ungetc (Ch, fp);
|
jpayne@69
|
1515
|
jpayne@69
|
1516 return TRUE;
|
jpayne@69
|
1517 }
|
jpayne@69
|
1518
|
jpayne@69
|
1519
|
jpayne@69
|
1520
|
jpayne@69
|
1521 void Rev_Complement
|
jpayne@69
|
1522 (char * s)
|
jpayne@69
|
1523
|
jpayne@69
|
1524 /* Set string s to its DNA reverse complement. */
|
jpayne@69
|
1525
|
jpayne@69
|
1526 {
|
jpayne@69
|
1527 char ch;
|
jpayne@69
|
1528 int i, j, len;
|
jpayne@69
|
1529
|
jpayne@69
|
1530 len = strlen (s);
|
jpayne@69
|
1531
|
jpayne@69
|
1532 for (i = 0, j = len - 1; i < j; i ++, j --)
|
jpayne@69
|
1533 {
|
jpayne@69
|
1534 ch = Complement (s [i]);
|
jpayne@69
|
1535 s [i] = Complement (s [j]);
|
jpayne@69
|
1536 s [j] = ch;
|
jpayne@69
|
1537 }
|
jpayne@69
|
1538
|
jpayne@69
|
1539 if (i == j)
|
jpayne@69
|
1540 s [i] = Complement (s [i]);
|
jpayne@69
|
1541
|
jpayne@69
|
1542 return;
|
jpayne@69
|
1543 }
|
jpayne@69
|
1544
|
jpayne@69
|
1545
|
jpayne@69
|
1546
|
jpayne@69
|
1547 void Rev_Display_Alignment
|
jpayne@69
|
1548 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
|
jpayne@69
|
1549
|
jpayne@69
|
1550 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
|
jpayne@69
|
1551 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
|
jpayne@69
|
1552 // in the reverse direction.
|
jpayne@69
|
1553
|
jpayne@69
|
1554 {
|
jpayne@69
|
1555 int i, j, k, m, top_len, bottom_len;
|
jpayne@69
|
1556 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
|
jpayne@69
|
1557 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
|
jpayne@69
|
1558
|
jpayne@69
|
1559 i = j = top_len = bottom_len = 0;
|
jpayne@69
|
1560 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
1561 {
|
jpayne@69
|
1562 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
1563 {
|
jpayne@69
|
1564 top [top_len ++] = a [- i ++];
|
jpayne@69
|
1565 j ++;
|
jpayne@69
|
1566 }
|
jpayne@69
|
1567 if (delta [k] < 0)
|
jpayne@69
|
1568 {
|
jpayne@69
|
1569 top [top_len ++] = '-';
|
jpayne@69
|
1570 j ++;
|
jpayne@69
|
1571 }
|
jpayne@69
|
1572 else
|
jpayne@69
|
1573 {
|
jpayne@69
|
1574 top [top_len ++] = a [- i ++];
|
jpayne@69
|
1575 }
|
jpayne@69
|
1576 }
|
jpayne@69
|
1577 while (i < a_len && j < b_len)
|
jpayne@69
|
1578 {
|
jpayne@69
|
1579 top [top_len ++] = a [- i ++];
|
jpayne@69
|
1580 j ++;
|
jpayne@69
|
1581 }
|
jpayne@69
|
1582 top [top_len] = '\0';
|
jpayne@69
|
1583
|
jpayne@69
|
1584
|
jpayne@69
|
1585 i = j = 0;
|
jpayne@69
|
1586 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
1587 {
|
jpayne@69
|
1588 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
1589 {
|
jpayne@69
|
1590 bottom [bottom_len ++] = b [- j ++];
|
jpayne@69
|
1591 i ++;
|
jpayne@69
|
1592 }
|
jpayne@69
|
1593 if (delta [k] > 0)
|
jpayne@69
|
1594 {
|
jpayne@69
|
1595 bottom [bottom_len ++] = '-';
|
jpayne@69
|
1596 i ++;
|
jpayne@69
|
1597 }
|
jpayne@69
|
1598 else
|
jpayne@69
|
1599 {
|
jpayne@69
|
1600 bottom [bottom_len ++] = b [- j ++];
|
jpayne@69
|
1601 }
|
jpayne@69
|
1602 }
|
jpayne@69
|
1603 while (j < b_len && i < a_len)
|
jpayne@69
|
1604 {
|
jpayne@69
|
1605 bottom [bottom_len ++] = b [- j ++];
|
jpayne@69
|
1606 i ++;
|
jpayne@69
|
1607 }
|
jpayne@69
|
1608 bottom [bottom_len] = '\0';
|
jpayne@69
|
1609
|
jpayne@69
|
1610
|
jpayne@69
|
1611 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
|
jpayne@69
|
1612 {
|
jpayne@69
|
1613 putchar ('\n');
|
jpayne@69
|
1614 printf ("A: ");
|
jpayne@69
|
1615 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
|
jpayne@69
|
1616 putchar (top [i + j]);
|
jpayne@69
|
1617 putchar ('\n');
|
jpayne@69
|
1618 printf ("B: ");
|
jpayne@69
|
1619 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
|
jpayne@69
|
1620 putchar (bottom [i + j]);
|
jpayne@69
|
1621 putchar ('\n');
|
jpayne@69
|
1622 printf (" ");
|
jpayne@69
|
1623 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
|
jpayne@69
|
1624 j ++)
|
jpayne@69
|
1625 if (top [i + j] != ' ' && bottom [i + j] != ' '
|
jpayne@69
|
1626 && top [i + j] != bottom [i + j])
|
jpayne@69
|
1627 putchar ('^');
|
jpayne@69
|
1628 else
|
jpayne@69
|
1629 putchar (' ');
|
jpayne@69
|
1630 putchar ('\n');
|
jpayne@69
|
1631 }
|
jpayne@69
|
1632
|
jpayne@69
|
1633 return;
|
jpayne@69
|
1634 }
|
jpayne@69
|
1635
|
jpayne@69
|
1636
|
jpayne@69
|
1637
|
jpayne@69
|
1638 int Rev_Prefix_Edit_Dist
|
jpayne@69
|
1639 (char A [], int m, char T [], int n, int Error_Limit,
|
jpayne@69
|
1640 int * A_End, int * T_End, int * Match_To_End,
|
jpayne@69
|
1641 int Delta [MAX_ERRORS], int * Delta_Len, int extending)
|
jpayne@69
|
1642
|
jpayne@69
|
1643 // Return the minimum number of changes (inserts, deletes, replacements)
|
jpayne@69
|
1644 // needed to match string A [0 .. -(m-1)] with a prefix of string
|
jpayne@69
|
1645 // T [0 .. -(n-1)] if it's not more than Error_Limit .
|
jpayne@69
|
1646 // Note: Match is reverse direction, right to left.
|
jpayne@69
|
1647 // Put delta description of alignment in Delta and set
|
jpayne@69
|
1648 // (* Delta_Len) to the number of entries there if it's a complete
|
jpayne@69
|
1649 // match.
|
jpayne@69
|
1650 // Set A_End and T_End to the rightmost positions where the
|
jpayne@69
|
1651 // alignment ended in A and T , respectively.
|
jpayne@69
|
1652 // Set Match_To_End true if the match extended to the end
|
jpayne@69
|
1653 // of at least one string; otherwise, set it false to indicate
|
jpayne@69
|
1654 // a branch point.
|
jpayne@69
|
1655 // extending indicates whether the match is for a fixed region (FALSE)
|
jpayne@69
|
1656 // or is extending off the end of a match as far as possible (TRUE)
|
jpayne@69
|
1657
|
jpayne@69
|
1658 {
|
jpayne@69
|
1659 double Score, Max_Score;
|
jpayne@69
|
1660 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
|
jpayne@69
|
1661 int Best_d, Best_e, Longest, Row;
|
jpayne@69
|
1662 int Left, Right;
|
jpayne@69
|
1663 int d, e, i, j, shorter;
|
jpayne@69
|
1664
|
jpayne@69
|
1665 // assert (m <= n);
|
jpayne@69
|
1666 Best_d = Best_e = Longest = 0;
|
jpayne@69
|
1667 (* Delta_Len) = 0;
|
jpayne@69
|
1668
|
jpayne@69
|
1669 if (Consec_Non_ACGT > 0)
|
jpayne@69
|
1670 {
|
jpayne@69
|
1671 int ct;
|
jpayne@69
|
1672
|
jpayne@69
|
1673 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++)
|
jpayne@69
|
1674 {
|
jpayne@69
|
1675 switch (A [- i])
|
jpayne@69
|
1676 {
|
jpayne@69
|
1677 case 'a' :
|
jpayne@69
|
1678 case 'c' :
|
jpayne@69
|
1679 case 'g' :
|
jpayne@69
|
1680 case 't' :
|
jpayne@69
|
1681 ct = 0;
|
jpayne@69
|
1682 break;
|
jpayne@69
|
1683 default :
|
jpayne@69
|
1684 ct ++;
|
jpayne@69
|
1685 }
|
jpayne@69
|
1686 }
|
jpayne@69
|
1687 if (ct >= Consec_Non_ACGT)
|
jpayne@69
|
1688 {
|
jpayne@69
|
1689 m = i - ct;
|
jpayne@69
|
1690 extending = TRUE;
|
jpayne@69
|
1691 }
|
jpayne@69
|
1692
|
jpayne@69
|
1693 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++)
|
jpayne@69
|
1694 {
|
jpayne@69
|
1695 switch (T [- i])
|
jpayne@69
|
1696 {
|
jpayne@69
|
1697 case 'a' :
|
jpayne@69
|
1698 case 'c' :
|
jpayne@69
|
1699 case 'g' :
|
jpayne@69
|
1700 case 't' :
|
jpayne@69
|
1701 ct = 0;
|
jpayne@69
|
1702 break;
|
jpayne@69
|
1703 default :
|
jpayne@69
|
1704 ct ++;
|
jpayne@69
|
1705 }
|
jpayne@69
|
1706 }
|
jpayne@69
|
1707 if (ct >= Consec_Non_ACGT)
|
jpayne@69
|
1708 {
|
jpayne@69
|
1709 n = i - ct;
|
jpayne@69
|
1710 extending = TRUE;
|
jpayne@69
|
1711 }
|
jpayne@69
|
1712 }
|
jpayne@69
|
1713
|
jpayne@69
|
1714 shorter = Min_int (m, n);
|
jpayne@69
|
1715 for (Row = 0; Row < shorter && A [- Row] == T [- Row]; Row ++)
|
jpayne@69
|
1716 ;
|
jpayne@69
|
1717
|
jpayne@69
|
1718 Edit_Array [0] [0] = Row;
|
jpayne@69
|
1719
|
jpayne@69
|
1720 if (Row == m && Row == n) // Exact match
|
jpayne@69
|
1721 {
|
jpayne@69
|
1722 (* Delta_Len) = 0;
|
jpayne@69
|
1723 (* A_End) = (* T_End) = Row;
|
jpayne@69
|
1724 (* Match_To_End) = ! extending;
|
jpayne@69
|
1725 return 0;
|
jpayne@69
|
1726 }
|
jpayne@69
|
1727
|
jpayne@69
|
1728 Left = Right = 0;
|
jpayne@69
|
1729 Max_Score = Row * Branch_Pt_Match_Value;
|
jpayne@69
|
1730 Max_Score_Len = Row;
|
jpayne@69
|
1731 Max_Score_Best_d = 0;
|
jpayne@69
|
1732 Max_Score_Best_e = 0;
|
jpayne@69
|
1733
|
jpayne@69
|
1734 for (e = 1; e <= Error_Limit; e ++)
|
jpayne@69
|
1735 {
|
jpayne@69
|
1736 int cutoff;
|
jpayne@69
|
1737
|
jpayne@69
|
1738 Left = Max_int (Left - 1, -e);
|
jpayne@69
|
1739 Right = Min_int (Right + 1, e);
|
jpayne@69
|
1740 Edit_Array [e - 1] [Left] = -2;
|
jpayne@69
|
1741 Edit_Array [e - 1] [Left - 1] = -2;
|
jpayne@69
|
1742 Edit_Array [e - 1] [Right] = -2;
|
jpayne@69
|
1743 Edit_Array [e - 1] [Right + 1] = -2;
|
jpayne@69
|
1744
|
jpayne@69
|
1745 for (d = Left; d <= Right; d ++)
|
jpayne@69
|
1746 {
|
jpayne@69
|
1747 Row = 1 + Edit_Array [e - 1] [d];
|
jpayne@69
|
1748 if ((j = Edit_Array [e - 1] [d - 1]) > Row)
|
jpayne@69
|
1749 Row = j;
|
jpayne@69
|
1750 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
|
jpayne@69
|
1751 Row = j;
|
jpayne@69
|
1752 while (Row < m && Row + d < n
|
jpayne@69
|
1753 && A [- Row] == T [- Row - d])
|
jpayne@69
|
1754 Row ++;
|
jpayne@69
|
1755
|
jpayne@69
|
1756 Edit_Array [e] [d] = Row;
|
jpayne@69
|
1757
|
jpayne@69
|
1758 if (Row == m && Row + d == n)
|
jpayne@69
|
1759 {
|
jpayne@69
|
1760 if (extending)
|
jpayne@69
|
1761 {
|
jpayne@69
|
1762 for (j = Left; j <= d; j ++)
|
jpayne@69
|
1763 if (Edit_Array [e] [j] > Longest)
|
jpayne@69
|
1764 {
|
jpayne@69
|
1765 Best_d = j;
|
jpayne@69
|
1766 Longest = Edit_Array [e] [j];
|
jpayne@69
|
1767 }
|
jpayne@69
|
1768 Score = Longest * Branch_Pt_Match_Value - e;
|
jpayne@69
|
1769 if (Score < Max_Score)
|
jpayne@69
|
1770 {
|
jpayne@69
|
1771 (* A_End) = Max_Score_Len;
|
jpayne@69
|
1772 (* T_End) = Max_Score_Len + Max_Score_Best_d;
|
jpayne@69
|
1773 Set_Deltas (Delta, Delta_Len, Max_Score_Len,
|
jpayne@69
|
1774 Max_Score_Best_d, Max_Score_Best_e);
|
jpayne@69
|
1775 (* Match_To_End) = FALSE;
|
jpayne@69
|
1776 return Max_Score_Best_e;
|
jpayne@69
|
1777 }
|
jpayne@69
|
1778 }
|
jpayne@69
|
1779
|
jpayne@69
|
1780 (* A_End) = Row; // One past last align position
|
jpayne@69
|
1781 (* T_End) = Row + d;
|
jpayne@69
|
1782 Set_Deltas (Delta, Delta_Len, Row, d, e);
|
jpayne@69
|
1783 (* Match_To_End) = ! extending;
|
jpayne@69
|
1784 return e;
|
jpayne@69
|
1785 }
|
jpayne@69
|
1786 }
|
jpayne@69
|
1787
|
jpayne@69
|
1788 cutoff = Longest - GIVE_UP_LEN;
|
jpayne@69
|
1789 while (Left <= Right && Edit_Array [e] [Left] < cutoff)
|
jpayne@69
|
1790 Left ++;
|
jpayne@69
|
1791 if (Left > Right)
|
jpayne@69
|
1792 break;
|
jpayne@69
|
1793 while (Right > Left && Edit_Array [e] [Right] < cutoff)
|
jpayne@69
|
1794 Right --;
|
jpayne@69
|
1795 assert (Left <= Right);
|
jpayne@69
|
1796
|
jpayne@69
|
1797 for (d = Left; d <= Right; d ++)
|
jpayne@69
|
1798 if (Edit_Array [e] [d] > Longest)
|
jpayne@69
|
1799 {
|
jpayne@69
|
1800 Best_d = d;
|
jpayne@69
|
1801 Best_e = e;
|
jpayne@69
|
1802 Longest = Edit_Array [e] [d];
|
jpayne@69
|
1803 }
|
jpayne@69
|
1804
|
jpayne@69
|
1805 Score = Longest * Branch_Pt_Match_Value - e;
|
jpayne@69
|
1806 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
|
jpayne@69
|
1807 if (Score > Max_Score)
|
jpayne@69
|
1808 {
|
jpayne@69
|
1809 Max_Score = Score;
|
jpayne@69
|
1810 Max_Score_Len = Longest;
|
jpayne@69
|
1811 Max_Score_Best_d = Best_d;
|
jpayne@69
|
1812 Max_Score_Best_e = Best_e;
|
jpayne@69
|
1813 }
|
jpayne@69
|
1814
|
jpayne@69
|
1815
|
jpayne@69
|
1816 if (Longest - Max_Score_Len >= GIVE_UP_LEN)
|
jpayne@69
|
1817 break;
|
jpayne@69
|
1818 }
|
jpayne@69
|
1819
|
jpayne@69
|
1820 (* A_End) = Max_Score_Len;
|
jpayne@69
|
1821 (* T_End) = Max_Score_Len + Max_Score_Best_d;
|
jpayne@69
|
1822 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
|
jpayne@69
|
1823 (* Match_To_End) = FALSE;
|
jpayne@69
|
1824
|
jpayne@69
|
1825 return Max_Score_Best_e;
|
jpayne@69
|
1826 }
|
jpayne@69
|
1827
|
jpayne@69
|
1828
|
jpayne@69
|
1829
|
jpayne@69
|
1830 void Rev_Show_Diffs
|
jpayne@69
|
1831 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
|
jpayne@69
|
1832 long int a_ref, long int b_ref)
|
jpayne@69
|
1833
|
jpayne@69
|
1834 // Show (to stdout ) the differences
|
jpayne@69
|
1835 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
|
jpayne@69
|
1836 // encoded in delta [0 .. (delta_ct - 1)]
|
jpayne@69
|
1837 // in the reverse direction. a_ref is the position of the beginning
|
jpayne@69
|
1838 // of string a . b_ref is the offset to the start of
|
jpayne@69
|
1839 // string b .
|
jpayne@69
|
1840
|
jpayne@69
|
1841 {
|
jpayne@69
|
1842 int i, j, k, m;
|
jpayne@69
|
1843
|
jpayne@69
|
1844 i = j = 0;
|
jpayne@69
|
1845 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
1846 {
|
jpayne@69
|
1847 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
1848 {
|
jpayne@69
|
1849 if (a [- i] != b [- j])
|
jpayne@69
|
1850 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
|
jpayne@69
|
1851 i ++;
|
jpayne@69
|
1852 j ++;
|
jpayne@69
|
1853 }
|
jpayne@69
|
1854 if (delta [k] < 0)
|
jpayne@69
|
1855 {
|
jpayne@69
|
1856 printf ("%8ld %8ld: ins %c\n", a_ref - i, b_ref - j, b [- j]);
|
jpayne@69
|
1857 j ++;
|
jpayne@69
|
1858 }
|
jpayne@69
|
1859 else
|
jpayne@69
|
1860 {
|
jpayne@69
|
1861 printf ("%8ld %8ld: del %c\n", a_ref - i, b_ref - j, a [- i]);
|
jpayne@69
|
1862 i ++;
|
jpayne@69
|
1863 }
|
jpayne@69
|
1864 }
|
jpayne@69
|
1865 while (i < a_len && j < b_len)
|
jpayne@69
|
1866 {
|
jpayne@69
|
1867 if (a [- i] != b [- j])
|
jpayne@69
|
1868 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
|
jpayne@69
|
1869 i ++;
|
jpayne@69
|
1870 j ++;
|
jpayne@69
|
1871 }
|
jpayne@69
|
1872
|
jpayne@69
|
1873 return;
|
jpayne@69
|
1874 }
|
jpayne@69
|
1875
|
jpayne@69
|
1876
|
jpayne@69
|
1877
|
jpayne@69
|
1878 void Set_Deltas
|
jpayne@69
|
1879 (int delta [], int * delta_len, int row, int d, int e)
|
jpayne@69
|
1880
|
jpayne@69
|
1881 // Set delta to values that show the alignment recorded in
|
jpayne@69
|
1882 // global Edit_Array . Set (* delta_len) to the number of
|
jpayne@69
|
1883 // entries set. row is the length of the match in the first
|
jpayne@69
|
1884 // string. d is the offset at the end of the match between
|
jpayne@69
|
1885 // the first and second string. e is the number of errors.
|
jpayne@69
|
1886
|
jpayne@69
|
1887 {
|
jpayne@69
|
1888 int delta_stack [MAX_ERRORS];
|
jpayne@69
|
1889 int from, last, max;
|
jpayne@69
|
1890 int i, j, k;
|
jpayne@69
|
1891
|
jpayne@69
|
1892 last = row;
|
jpayne@69
|
1893 (* delta_len) = 0;
|
jpayne@69
|
1894
|
jpayne@69
|
1895 for (k = e; k > 0; k --)
|
jpayne@69
|
1896 {
|
jpayne@69
|
1897 from = d;
|
jpayne@69
|
1898 max = 1 + Edit_Array [k - 1] [d];
|
jpayne@69
|
1899 if ((j = Edit_Array [k - 1] [d - 1]) > max)
|
jpayne@69
|
1900 {
|
jpayne@69
|
1901 from = d - 1;
|
jpayne@69
|
1902 max = j;
|
jpayne@69
|
1903 }
|
jpayne@69
|
1904 if ((j = 1 + Edit_Array [k - 1] [d + 1]) > max)
|
jpayne@69
|
1905 {
|
jpayne@69
|
1906 from = d + 1;
|
jpayne@69
|
1907 max = j;
|
jpayne@69
|
1908 }
|
jpayne@69
|
1909 if (from == d - 1)
|
jpayne@69
|
1910 {
|
jpayne@69
|
1911 delta_stack [(* delta_len) ++] = max - last - 1;
|
jpayne@69
|
1912 d --;
|
jpayne@69
|
1913 last = Edit_Array [k - 1] [from];
|
jpayne@69
|
1914 }
|
jpayne@69
|
1915 else if (from == d + 1)
|
jpayne@69
|
1916 {
|
jpayne@69
|
1917 delta_stack [(* delta_len) ++] = last - (max - 1);
|
jpayne@69
|
1918 d ++;
|
jpayne@69
|
1919 last = Edit_Array [k - 1] [from];
|
jpayne@69
|
1920 }
|
jpayne@69
|
1921 }
|
jpayne@69
|
1922 delta_stack [(* delta_len) ++] = last + 1;
|
jpayne@69
|
1923
|
jpayne@69
|
1924 k = 0;
|
jpayne@69
|
1925 for (i = (* delta_len) - 1; i > 0; i --)
|
jpayne@69
|
1926 delta [k ++]
|
jpayne@69
|
1927 = abs (delta_stack [i]) * Sign (delta_stack [i - 1]);
|
jpayne@69
|
1928 (* delta_len) --;
|
jpayne@69
|
1929
|
jpayne@69
|
1930 return;
|
jpayne@69
|
1931 }
|
jpayne@69
|
1932
|
jpayne@69
|
1933
|
jpayne@69
|
1934
|
jpayne@69
|
1935 static void Set_Left_Pad
|
jpayne@69
|
1936 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len)
|
jpayne@69
|
1937
|
jpayne@69
|
1938 // Set pad to a triple of strings, each of length pad_len ,
|
jpayne@69
|
1939 // representing a MUM of length mum_len ending at r_hi in Ref
|
jpayne@69
|
1940 // and q_hi in Query . Each string in pad must have been allocated
|
jpayne@69
|
1941 // at least pad_len + 1 bytes of memory. The purpose is to serve as
|
jpayne@69
|
1942 // the left-end padding for an alignment with this MUM as its left
|
jpayne@69
|
1943 // boundary.
|
jpayne@69
|
1944
|
jpayne@69
|
1945 {
|
jpayne@69
|
1946 int i;
|
jpayne@69
|
1947
|
jpayne@69
|
1948 for (i = 0; i < 3; i ++)
|
jpayne@69
|
1949 pad [i] [pad_len] = '\0';
|
jpayne@69
|
1950
|
jpayne@69
|
1951 for (i = 0; i < pad_len; i ++)
|
jpayne@69
|
1952 {
|
jpayne@69
|
1953 pad [0] [pad_len - i - 1] = r_hi - i > 0 ? Ref [r_hi - i] : '.';
|
jpayne@69
|
1954 pad [1] [pad_len - i - 1] = q_hi - i > 0 ? Query [q_hi - i] : '.';
|
jpayne@69
|
1955 pad [2] [pad_len - i - 1] = i < mum_len ? '=' : ' ';
|
jpayne@69
|
1956 }
|
jpayne@69
|
1957
|
jpayne@69
|
1958 return;
|
jpayne@69
|
1959 }
|
jpayne@69
|
1960
|
jpayne@69
|
1961
|
jpayne@69
|
1962
|
jpayne@69
|
1963 static void Set_Right_Pad
|
jpayne@69
|
1964 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len)
|
jpayne@69
|
1965
|
jpayne@69
|
1966 // Set pad to a triple of strings, each of length pad_len ,
|
jpayne@69
|
1967 // representing a MUM of length mum_len beginning at r_lo in Ref
|
jpayne@69
|
1968 // and q_lo in Query . Each string in pad must have been allocated
|
jpayne@69
|
1969 // at least pad_len + 1 bytes of memory. The purpose is to serve as
|
jpayne@69
|
1970 // the right-end padding for an alignment with this MUM as its right
|
jpayne@69
|
1971 // boundary.
|
jpayne@69
|
1972
|
jpayne@69
|
1973 {
|
jpayne@69
|
1974 int i;
|
jpayne@69
|
1975
|
jpayne@69
|
1976 for (i = 0; i < 3; i ++)
|
jpayne@69
|
1977 pad [i] [pad_len] = '\0';
|
jpayne@69
|
1978
|
jpayne@69
|
1979 for (i = 0; i < pad_len; i ++)
|
jpayne@69
|
1980 {
|
jpayne@69
|
1981 pad [0] [i] = r_lo + i <= Ref_Len ? Ref [r_lo + i] : '.';
|
jpayne@69
|
1982 pad [1] [i] = q_lo + i <= Query_Len ? Query [q_lo + i] : '.';
|
jpayne@69
|
1983 pad [2] [i] = i < mum_len ? '=' : ' ';
|
jpayne@69
|
1984 }
|
jpayne@69
|
1985
|
jpayne@69
|
1986 return;
|
jpayne@69
|
1987 }
|
jpayne@69
|
1988
|
jpayne@69
|
1989
|
jpayne@69
|
1990
|
jpayne@69
|
1991 void Show_Coverage
|
jpayne@69
|
1992 (Cover_t * list, char * filename, char * tag, char * seq)
|
jpayne@69
|
1993
|
jpayne@69
|
1994 // Print to stderr summary stats of regions in list .
|
jpayne@69
|
1995 // Send to filename a list of region info suitable for celagram .
|
jpayne@69
|
1996 // Use tag to print the headings. Regions refer to seq .
|
jpayne@69
|
1997
|
jpayne@69
|
1998 {
|
jpayne@69
|
1999 FILE * fp;
|
jpayne@69
|
2000 Cover_t * p;
|
jpayne@69
|
2001 int ct = 0;
|
jpayne@69
|
2002 long int i, prev_hi = 0, n_ct, len, seq_len;
|
jpayne@69
|
2003 long int cov_ns = 0, gap_ns = 0, total_ns;
|
jpayne@69
|
2004 double cov_total = 0.0, gap_total = 0.0;
|
jpayne@69
|
2005
|
jpayne@69
|
2006 fp = Local_File_Open (filename, "w");
|
jpayne@69
|
2007 fprintf (fp, "%-9s %9s %9s %8s %8s %6s\n",
|
jpayne@69
|
2008 "Region", "Start", "End", "Len", "N's", "%N's");
|
jpayne@69
|
2009
|
jpayne@69
|
2010 for (p = list; p != NULL; p = p -> next)
|
jpayne@69
|
2011 {
|
jpayne@69
|
2012 n_ct = 0;
|
jpayne@69
|
2013 for (i = prev_hi + 1; i < p -> lo; i ++)
|
jpayne@69
|
2014 if (strchr ("acgt", seq [i]) == NULL)
|
jpayne@69
|
2015 n_ct ++;
|
jpayne@69
|
2016 len = p -> lo - prev_hi - 1;
|
jpayne@69
|
2017 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
|
jpayne@69
|
2018 ct, prev_hi + 1, p -> lo - 1, len, n_ct,
|
jpayne@69
|
2019 len == 0 ? 0.0 : 100.0 * n_ct / len);
|
jpayne@69
|
2020 gap_total += len;
|
jpayne@69
|
2021 gap_ns += n_ct;
|
jpayne@69
|
2022
|
jpayne@69
|
2023 ct ++;
|
jpayne@69
|
2024
|
jpayne@69
|
2025 n_ct = 0;
|
jpayne@69
|
2026 for (i = p -> lo; i <= p -> hi; i ++)
|
jpayne@69
|
2027 if (strchr ("acgt", seq [i]) == NULL)
|
jpayne@69
|
2028 n_ct ++;
|
jpayne@69
|
2029 len = 1 + p -> hi - p -> lo;
|
jpayne@69
|
2030 fprintf (fp, "cov%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
|
jpayne@69
|
2031 ct, p -> lo, p -> hi, len, n_ct,
|
jpayne@69
|
2032 len == 0 ? 0.0 : 100.0 * n_ct / len);
|
jpayne@69
|
2033 cov_total += len;
|
jpayne@69
|
2034 cov_ns += n_ct;
|
jpayne@69
|
2035
|
jpayne@69
|
2036 prev_hi = p -> hi;
|
jpayne@69
|
2037 }
|
jpayne@69
|
2038
|
jpayne@69
|
2039 n_ct = 0;
|
jpayne@69
|
2040 seq_len = strlen (seq + 1);
|
jpayne@69
|
2041 for (i = prev_hi + 1; i <= seq_len; i ++)
|
jpayne@69
|
2042 if (strchr ("acgt", seq [i]) == NULL)
|
jpayne@69
|
2043 n_ct ++;
|
jpayne@69
|
2044 len = seq_len - prev_hi;
|
jpayne@69
|
2045 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
|
jpayne@69
|
2046 ct, prev_hi + 1, seq_len, len, n_ct,
|
jpayne@69
|
2047 len == 0 ? 0.0 : 100.0 * n_ct / len);
|
jpayne@69
|
2048 gap_total += len;
|
jpayne@69
|
2049 gap_ns += n_ct;
|
jpayne@69
|
2050
|
jpayne@69
|
2051 total_ns = cov_ns + gap_ns;
|
jpayne@69
|
2052
|
jpayne@69
|
2053 fclose (fp);
|
jpayne@69
|
2054
|
jpayne@69
|
2055 fprintf (stderr, "\n%s Sequence Coverage:\n", tag);
|
jpayne@69
|
2056 fprintf (stderr, " Sequence length = %ld\n", seq_len);
|
jpayne@69
|
2057 fprintf (stderr, " acgt's = %ld\n", seq_len - total_ns);
|
jpayne@69
|
2058 fprintf (stderr, " Non acgt's = %ld\n", total_ns);
|
jpayne@69
|
2059 fprintf (stderr, " Number of regions = %d\n", ct);
|
jpayne@69
|
2060 fprintf (stderr, " Matched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n",
|
jpayne@69
|
2061 cov_total,
|
jpayne@69
|
2062 seq_len == 0.0 ? 0.0 : 100.0 * cov_total / seq_len,
|
jpayne@69
|
2063 seq_len - total_ns == 0.0 ? 0.0 :
|
jpayne@69
|
2064 100.0 * (cov_total - cov_ns) / (seq_len - total_ns));
|
jpayne@69
|
2065 fprintf (stderr, " Avg match len = %.0f\n",
|
jpayne@69
|
2066 ct == 0 ? 0.0 : cov_total / ct);
|
jpayne@69
|
2067 fprintf (stderr, " N's in matches = %ld (%.1f%%)\n",
|
jpayne@69
|
2068 cov_ns,
|
jpayne@69
|
2069 cov_total == 0.0 ? 0.0 : 100.0 * cov_ns / cov_total);
|
jpayne@69
|
2070 fprintf (stderr, " Unmatched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n",
|
jpayne@69
|
2071 gap_total,
|
jpayne@69
|
2072 seq_len == 0.0 ? 0.0 : 100.0 * gap_total / seq_len,
|
jpayne@69
|
2073 seq_len - total_ns == 0.0 ? 0.0 :
|
jpayne@69
|
2074 100.0 * (gap_total - gap_ns) / (seq_len - total_ns));
|
jpayne@69
|
2075 fprintf (stderr, " Avg gap len = %.0f\n",
|
jpayne@69
|
2076 gap_total / (1.0 + ct));
|
jpayne@69
|
2077 fprintf (stderr, " N's in gaps = %ld (%.1f%%)\n",
|
jpayne@69
|
2078 gap_ns,
|
jpayne@69
|
2079 gap_total == 0.0 ? 0.0 : 100.0 * gap_ns / gap_total);
|
jpayne@69
|
2080
|
jpayne@69
|
2081 return;
|
jpayne@69
|
2082 }
|
jpayne@69
|
2083
|
jpayne@69
|
2084
|
jpayne@69
|
2085
|
jpayne@69
|
2086 void Show_Diffs
|
jpayne@69
|
2087 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
|
jpayne@69
|
2088 long int a_ref, long int b_ref)
|
jpayne@69
|
2089
|
jpayne@69
|
2090 // Show (to stdout ) the differences
|
jpayne@69
|
2091 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
|
jpayne@69
|
2092 // encoded in delta [0 .. (delta_ct - 1)] . a_ref is the offset to
|
jpayne@69
|
2093 // the start of string a . b_ref is the offset to the start of
|
jpayne@69
|
2094 // string b .
|
jpayne@69
|
2095
|
jpayne@69
|
2096 {
|
jpayne@69
|
2097 int i, j, k, m;
|
jpayne@69
|
2098
|
jpayne@69
|
2099 i = j = 0;
|
jpayne@69
|
2100 for (k = 0; k < delta_ct; k ++)
|
jpayne@69
|
2101 {
|
jpayne@69
|
2102 for (m = 1; m < abs (delta [k]); m ++)
|
jpayne@69
|
2103 {
|
jpayne@69
|
2104 if (a [i] != b [j])
|
jpayne@69
|
2105 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]);
|
jpayne@69
|
2106 i ++;
|
jpayne@69
|
2107 j ++;
|
jpayne@69
|
2108 }
|
jpayne@69
|
2109 if (delta [k] < 0)
|
jpayne@69
|
2110 {
|
jpayne@69
|
2111 printf ("%8ld %8ld: ins %c\n", a_ref + i - 1, b_ref + j, b [j]);
|
jpayne@69
|
2112 j ++;
|
jpayne@69
|
2113 }
|
jpayne@69
|
2114 else
|
jpayne@69
|
2115 {
|
jpayne@69
|
2116 printf ("%8ld %8ld: del %c\n", a_ref + i, b_ref + j - 1, a [i]);
|
jpayne@69
|
2117 i ++;
|
jpayne@69
|
2118 }
|
jpayne@69
|
2119 }
|
jpayne@69
|
2120 while (i < a_len && j < b_len)
|
jpayne@69
|
2121 {
|
jpayne@69
|
2122 if (a [i] != b [j])
|
jpayne@69
|
2123 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]);
|
jpayne@69
|
2124 i ++;
|
jpayne@69
|
2125 j ++;
|
jpayne@69
|
2126 }
|
jpayne@69
|
2127
|
jpayne@69
|
2128 return;
|
jpayne@69
|
2129 }
|
jpayne@69
|
2130
|
jpayne@69
|
2131
|
jpayne@69
|
2132
|
jpayne@69
|
2133 int Sign
|
jpayne@69
|
2134 (int a)
|
jpayne@69
|
2135
|
jpayne@69
|
2136 // Return the algebraic sign of a .
|
jpayne@69
|
2137
|
jpayne@69
|
2138 {
|
jpayne@69
|
2139 if (a > 0)
|
jpayne@69
|
2140 return 1;
|
jpayne@69
|
2141 else if (a < 0)
|
jpayne@69
|
2142 return -1;
|
jpayne@69
|
2143
|
jpayne@69
|
2144 return 0;
|
jpayne@69
|
2145 }
|
jpayne@69
|
2146
|
jpayne@69
|
2147
|
jpayne@69
|
2148
|
jpayne@69
|
2149 void Usage
|
jpayne@69
|
2150 (char * command)
|
jpayne@69
|
2151
|
jpayne@69
|
2152 // Print to stderr description of options and command line for
|
jpayne@69
|
2153 // this program. command is the command that was used to
|
jpayne@69
|
2154 // invoke it.
|
jpayne@69
|
2155
|
jpayne@69
|
2156 {
|
jpayne@69
|
2157 fprintf (stderr,
|
jpayne@69
|
2158 "USAGE: %s <RefSequence> <MatchSequences> <GapsFile>\n"
|
jpayne@69
|
2159 "\n"
|
jpayne@69
|
2160 "Combines MUMs in <GapsFile> by extending matches off\n"
|
jpayne@69
|
2161 "ends and between MUMs. <RefSequence> is a fasta file\n"
|
jpayne@69
|
2162 "of the reference sequence. <MatchSequences> is a\n"
|
jpayne@69
|
2163 "multi-fasta file of the sequences matched against the\n"
|
jpayne@69
|
2164 "reference\n"
|
jpayne@69
|
2165 "\n"
|
jpayne@69
|
2166 "Options:\n"
|
jpayne@69
|
2167 "-D Only output to stdout the difference positions\n"
|
jpayne@69
|
2168 " and characters\n"
|
jpayne@69
|
2169 "-n Allow matches only between nucleotides, i.e., ACGTs\n"
|
jpayne@69
|
2170 "-N num Break matches at <num> or more consecutive non-ACGTs \n"
|
jpayne@69
|
2171 "-q tag Used to label query match\n"
|
jpayne@69
|
2172 "-r tag Used to label reference match\n"
|
jpayne@69
|
2173 "-S Output all differences in strings\n"
|
jpayne@69
|
2174 "-t Label query matches with query fasta header\n"
|
jpayne@69
|
2175 "-v num Set verbose level for extra output\n"
|
jpayne@69
|
2176 "-W file Reset the default output filename witherrors.gaps\n"
|
jpayne@69
|
2177 "-x Don't output .cover files\n"
|
jpayne@69
|
2178 "-e Set error-rate cutoff to e (e.g. 0.02 is two percent)\n",
|
jpayne@69
|
2179
|
jpayne@69
|
2180 command);
|
jpayne@69
|
2181
|
jpayne@69
|
2182 return;
|
jpayne@69
|
2183 }
|