annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/combineMUMs.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 /*************************************************
jpayne@69 2 * Module: CombineMUMs.cc
jpayne@69 3 * Author: Art Delcher
jpayne@69 4 * Description:
jpayne@69 5 * Take clusters of MUMs (from mgaps program) and do alignments
jpayne@69 6 * to combine them into matches.
jpayne@69 7 *
jpayne@69 8 */
jpayne@69 9
jpayne@69 10
jpayne@69 11 #include "tigrinc.hh"
jpayne@69 12 #include <string>
jpayne@69 13 using namespace std;
jpayne@69 14
jpayne@69 15
jpayne@69 16 // Constants
jpayne@69 17
jpayne@69 18 #define BRANCH_PT_MATCH_VALUE 0.272
jpayne@69 19 // Value to add for a match in finding branch points
jpayne@69 20 // 1.20 was the calculated value for 6% vs 35% error discrimination
jpayne@69 21 // Converting to integers didn't make it faster
jpayne@69 22 #define BRANCH_PT_ERROR_VALUE -0.728
jpayne@69 23 // Value to add for a mismatch in finding branch points
jpayne@69 24 // -2.19 was the calculated value for 6% vs 35% error discrimination
jpayne@69 25 // Converting to integers didn't make it faster
jpayne@69 26 #define DEFAULT_ERROR_FILE_NAME "witherrors.gaps"
jpayne@69 27 // Name of file produced that matches the input gaps file
jpayne@69 28 // but with an extra column showing the number of errors in
jpayne@69 29 // each gap
jpayne@69 30 #define DEFAULT_PAD 10
jpayne@69 31 // Default number of matching bases to show at beginning
jpayne@69 32 // and end of alignments
jpayne@69 33 #define EDIT_DIST_PROB_BOUND 1e-4
jpayne@69 34 // Probability limit to "band" edit-distance calculation
jpayne@69 35 // Determines NORMAL_DISTRIB_THOLD
jpayne@69 36 #define ERRORS_FOR_FREE 1
jpayne@69 37 // The number of errors that are ignored in setting probability
jpayne@69 38 // bound for terminating alignment extensions in edit distance
jpayne@69 39 // calculations
jpayne@69 40 #define EXPANSION_FACTOR 1.4
jpayne@69 41 // Factor by which to grow memory when realloc'ing
jpayne@69 42 #define GIVE_UP_LEN 200
jpayne@69 43 // Stop alignment when go this many bases past max-score value
jpayne@69 44 #define INITIAL_BUFFER_SIZE 10000
jpayne@69 45 // Initial number of bytes to use for sequence strings
jpayne@69 46 #define MATCH_SCORE 1.0
jpayne@69 47 // Score to add for match in finding length of alignment
jpayne@69 48 #define MAX_ERROR_RATE 0.06
jpayne@69 49 // The largest error allowed in overlaps
jpayne@69 50 #define MAX_FRAG_LEN 1024
jpayne@69 51 // The longest fragment allowed
jpayne@69 52 #define MAX_ERRORS 500
jpayne@69 53 // Most errors in any edit distance computation
jpayne@69 54 #define MAX_EXTENSION 10000
jpayne@69 55 // Maximum extension match out from a match
jpayne@69 56 #define MAX_HDR_LEN 10000
jpayne@69 57 // Longest allowable fasta header line
jpayne@69 58 #define MAX_MEMORY_STORE 50000
jpayne@69 59 // The most fragments allowed in memory store
jpayne@69 60 #define MIN_BRANCH_END_DIST 20
jpayne@69 61 // Branch points must be at least this many bases from the
jpayne@69 62 // end of the fragment to be reported
jpayne@69 63 #define MIN_BRANCH_TAIL_SLOPE 0.20
jpayne@69 64 // Branch point tails must fall off from the max by at least
jpayne@69 65 // this rate
jpayne@69 66 #define MIN_MATCH_LEN 40
jpayne@69 67 // Number of bases in match region in order to count it
jpayne@69 68 #define MISMATCH_SCORE -3.0
jpayne@69 69 // Score to add for non-match in finding length of alignment
jpayne@69 70 #define NORMAL_DISTRIB_THOLD 3.62
jpayne@69 71 // Determined by EDIT_DIST_PROB_BOUND
jpayne@69 72 #define REALLY_VERBOSE 0
jpayne@69 73 // If 1 prints tons of stuff
jpayne@69 74 #define VERBOSE 0
jpayne@69 75 // If 1 prints lots of stuff
jpayne@69 76
jpayne@69 77
jpayne@69 78
jpayne@69 79 // Type definitions
jpayne@69 80
jpayne@69 81 typedef struct s_Cover_t
jpayne@69 82 {
jpayne@69 83 long int lo, hi;
jpayne@69 84 struct s_Cover_t * next;
jpayne@69 85 } Cover_t;
jpayne@69 86
jpayne@69 87
jpayne@69 88
jpayne@69 89 // Globals
jpayne@69 90
jpayne@69 91 float Branch_Pt_Match_Value = BRANCH_PT_MATCH_VALUE;
jpayne@69 92 // Score used for matches to locate branch points, i.e.,
jpayne@69 93 // where alignment score peaks
jpayne@69 94 float Branch_Pt_Error_Value = BRANCH_PT_ERROR_VALUE;
jpayne@69 95 // Score used for mismatches to locate branch points, i.e.,
jpayne@69 96 int Consec_Non_ACGT = 0;
jpayne@69 97 // Stop alignments when encounter at least this many non-acgt characters.
jpayne@69 98 int * Edit_Array [MAX_ERRORS];
jpayne@69 99 // Use for alignment calculation. Points into Edit_Space .
jpayne@69 100 int Edit_Match_Limit [MAX_ERRORS] = {0};
jpayne@69 101 // This array [e] is the minimum value of Edit_Array [e] [d]
jpayne@69 102 // to be worth pursuing in edit-distance computations between guides
jpayne@69 103 int Edit_Space [(MAX_ERRORS + 4) * MAX_ERRORS];
jpayne@69 104 // Memory used by alignment calculation
jpayne@69 105 int Error_Bound [MAX_FRAG_LEN + 1];
jpayne@69 106 // This array [i] is the maximum number of errors allowed
jpayne@69 107 // in a match between sequences of length i , which is
jpayne@69 108 // i * MAXERROR_RATE .
jpayne@69 109 char * Error_File_Name = DEFAULT_ERROR_FILE_NAME;
jpayne@69 110 // Name of file to write gaps listing with # errors in each gap
jpayne@69 111 int Fill_Ct = 0;
jpayne@69 112 // Number of non-acgt bases in ref sequence
jpayne@69 113 char * Gaps_File_Path = NULL;
jpayne@69 114 // Name of file produced by mgaps program
jpayne@69 115 char * Match_File_Path = NULL;
jpayne@69 116 // Name of multifasta file of sequences to compare against the reference
jpayne@69 117 // sequence
jpayne@69 118 int UserScoring = FALSE;
jpayne@69 119 // If TRUE, then user specified a percent ID cutoff and scoring
jpayne@69 120 // is adjusted to enforce this in extensions
jpayne@69 121 int Nucleotides_Only = FALSE;
jpayne@69 122 // If TRUE , then only acgt's can match
jpayne@69 123 bool Only_Difference_Positions = false;
jpayne@69 124 // If true , then output just positions of difference instead of
jpayne@69 125 // alignments between exact matches
jpayne@69 126 int Output_Cover_Files = TRUE;
jpayne@69 127 // If TRUE , output files showing coverage of each genome.
jpayne@69 128 double Percent_ID;
jpayne@69 129 // Desired identity rate of matches to be found
jpayne@69 130 // Expressed as a fraction, not really a percent
jpayne@69 131 char * Query = NULL;
jpayne@69 132 // The query sequence
jpayne@69 133 long int Query_Len;
jpayne@69 134 // The length of the query sequence
jpayne@69 135 char * Query_Suffix = "Query";
jpayne@69 136 // Suffix for query tag
jpayne@69 137 char * Ref = NULL;
jpayne@69 138 // The reference sequence
jpayne@69 139 char * Ref_File_Path = NULL;
jpayne@69 140 // Name of (single) fasta file of reference sequence
jpayne@69 141 long int Ref_Len;
jpayne@69 142 // The length of the reference sequence
jpayne@69 143 long int Ref_Size;
jpayne@69 144 // The size of the reference sequence buffer
jpayne@69 145 char * Ref_Suffix = "Ref";
jpayne@69 146 // Suffix for reference tag
jpayne@69 147 int Show_Differences = FALSE;
jpayne@69 148 // If TRUE then show differences in all alignments
jpayne@69 149 int Tag_From_Fasta_Line = FALSE;
jpayne@69 150 // If TRUE then use fasta tag from ref & query sequences as
jpayne@69 151 // 1st & 2nd column, resp., on match line to identify matches
jpayne@69 152 int Verbose = 0;
jpayne@69 153 // Controls printing of extra debuggin information
jpayne@69 154
jpayne@69 155
jpayne@69 156 bool Global_Debug_Flag = false;
jpayne@69 157
jpayne@69 158
jpayne@69 159
jpayne@69 160 // Functions
jpayne@69 161
jpayne@69 162 void Add_Coverage
jpayne@69 163 (Cover_t * * list, long int lo, long int hi);
jpayne@69 164 int Binomial_Bound
jpayne@69 165 (int, double, int, double);
jpayne@69 166 void Display_Alignment
jpayne@69 167 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
jpayne@69 168 void Display_Alignment_With_Pad
jpayne@69 169 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
jpayne@69 170 char * left_pad [3], char * right_pad [3], int pad_len);
jpayne@69 171 void Display_Difference_Positions
jpayne@69 172 (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
jpayne@69 173 int b_incr, int delta [], int delta_ct, const char * a_tag,
jpayne@69 174 const char * b_tag);
jpayne@69 175 int Extend_Backward
jpayne@69 176 (long int * ref_lo, long int * query_lo);
jpayne@69 177 int Extend_Forward
jpayne@69 178 (long int * ref_hi, long int * query_hi);
jpayne@69 179 void Initialize_Globals
jpayne@69 180 (void);
jpayne@69 181 FILE * Local_File_Open
jpayne@69 182 (const char * filename, const char * mode);
jpayne@69 183 int Max_int
jpayne@69 184 (int a, int b);
jpayne@69 185 int Min_int
jpayne@69 186 (int a, int b);
jpayne@69 187 void Parse_Command_Line
jpayne@69 188 (int argc, char * argv []);
jpayne@69 189 int Prefix_Edit_Dist
jpayne@69 190 (char A [], int m, char T [], int n, int Error_Limit,
jpayne@69 191 int * A_End, int * T_End, int * Match_To_End,
jpayne@69 192 int Delta [MAX_ERRORS], int * Delta_Len, int extending);
jpayne@69 193 int Read_String
jpayne@69 194 (FILE * fp, char * * T, long int * Size, char header []);
jpayne@69 195 void Rev_Complement
jpayne@69 196 (char * s);
jpayne@69 197 void Rev_Display_Alignment
jpayne@69 198 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
jpayne@69 199 int Rev_Prefix_Edit_Dist
jpayne@69 200 (char A [], int m, char T [], int n, int Error_Limit,
jpayne@69 201 int * A_End, int * T_End, int * Match_To_End,
jpayne@69 202 int Delta [MAX_ERRORS], int * Delta_Len, int extending);
jpayne@69 203 void Rev_Show_Diffs
jpayne@69 204 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
jpayne@69 205 long int a_ref, long int b_ref);
jpayne@69 206 void Set_Deltas
jpayne@69 207 (int delta [], int * delta_len, int row, int d, int e);
jpayne@69 208 static void Set_Left_Pad
jpayne@69 209 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len);
jpayne@69 210 static void Set_Right_Pad
jpayne@69 211 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len);
jpayne@69 212 void Show_Coverage
jpayne@69 213 (Cover_t * list, char * filename, char * tag, char * seq);
jpayne@69 214 void Show_Diffs
jpayne@69 215 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
jpayne@69 216 long int a_ref, long int b_ref);
jpayne@69 217 int Sign
jpayne@69 218 (int a);
jpayne@69 219 void Usage
jpayne@69 220 (char * command);
jpayne@69 221
jpayne@69 222
jpayne@69 223
jpayne@69 224 int main
jpayne@69 225 (int argc, char * argv [])
jpayne@69 226
jpayne@69 227 {
jpayne@69 228 FILE * fp, * match_fp, * error_fp;
jpayne@69 229 char ref_header [MAX_HDR_LEN];
jpayne@69 230 char header [MAX_HDR_LEN], ref_tag [MAX_HDR_LEN], query_tag [MAX_HDR_LEN];
jpayne@69 231 char line [MAX_LINE], filename [MAX_LINE];
jpayne@69 232 char * left_pad [3], * right_pad [3];
jpayne@69 233 long int ref_pos, ref_lo, ref_hi;
jpayne@69 234 long int query_pos, query_lo, query_hi, query_size;
jpayne@69 235 long int match_len, prior_match_len = 0, max_len;
jpayne@69 236 double error = 0.0;
jpayne@69 237 long int total_errors = 0;
jpayne@69 238 int match_ct = 0;
jpayne@69 239 double match_total = 0.0;
jpayne@69 240 int is_forward = FALSE;
jpayne@69 241 int first_match = TRUE;
jpayne@69 242 Cover_t * ref_cover_list = NULL;
jpayne@69 243 Cover_t * query_cover_list = NULL;
jpayne@69 244 char * p;
jpayne@69 245 int i;
jpayne@69 246
jpayne@69 247 Parse_Command_Line (argc, argv);
jpayne@69 248
jpayne@69 249 if(UserScoring){
jpayne@69 250 // percentID = mismatch penalty / (match bonus + mismatch penalty)
jpayne@69 251 // or, more compactly, p = mm / (m+mm)
jpayne@69 252 // Since we require that m+mm = 1, m=1-mm
jpayne@69 253 // and so we get the trivial mm = p
jpayne@69 254 //
jpayne@69 255 // This means that Branch_Pt_Error_Value = -p
jpayne@69 256 // and then Branch_Pt_Match_Value = 1-p
jpayne@69 257 //
jpayne@69 258 // Make it so:
jpayne@69 259 Branch_Pt_Error_Value = - Percent_ID;
jpayne@69 260 Branch_Pt_Match_Value = 1.0 - Percent_ID;
jpayne@69 261 }
jpayne@69 262
jpayne@69 263 Initialize_Globals ();
jpayne@69 264
jpayne@69 265 p = strrchr (Ref_File_Path, '/');
jpayne@69 266 if (p == NULL)
jpayne@69 267 strcpy (ref_tag, Ref_File_Path);
jpayne@69 268 else
jpayne@69 269 strcpy (ref_tag, p + 1);
jpayne@69 270 p = strrchr (ref_tag, '.');
jpayne@69 271 if (p != NULL)
jpayne@69 272 (* p) = '\0';
jpayne@69 273 strcat (ref_tag, Ref_Suffix);
jpayne@69 274
jpayne@69 275 p = strrchr (Match_File_Path, '/');
jpayne@69 276 if (p == NULL)
jpayne@69 277 strcpy (query_tag, Match_File_Path);
jpayne@69 278 else
jpayne@69 279 strcpy (query_tag, p + 1);
jpayne@69 280 p = strrchr (query_tag, '.');
jpayne@69 281 if (p != NULL)
jpayne@69 282 (* p) = '\0';
jpayne@69 283 strcat (query_tag, Query_Suffix);
jpayne@69 284
jpayne@69 285 fp = Local_File_Open (Ref_File_Path, "r");
jpayne@69 286 Ref_Size = 100000;
jpayne@69 287 Ref = (char *) Safe_malloc (Ref_Size);
jpayne@69 288
jpayne@69 289 assert (Read_String (fp, & Ref, & Ref_Size, ref_header));
jpayne@69 290 if (Tag_From_Fasta_Line)
jpayne@69 291 strcpy (ref_tag, strtok (ref_header, " \t\n>"));
jpayne@69 292 fclose (fp);
jpayne@69 293
jpayne@69 294 Ref_Len = strlen (Ref + 1);
jpayne@69 295 Ref = (char *) Safe_realloc (Ref, 1 + Ref_Len);
jpayne@69 296 for (i = 1; i <= Ref_Len; i ++)
jpayne@69 297 switch (Ref [i])
jpayne@69 298 {
jpayne@69 299 case 'a' :
jpayne@69 300 case 'c' :
jpayne@69 301 case 'g' :
jpayne@69 302 case 't' :
jpayne@69 303 break;
jpayne@69 304 default :
jpayne@69 305 if (Nucleotides_Only)
jpayne@69 306 Ref [i] = '2';
jpayne@69 307 Fill_Ct ++;
jpayne@69 308 }
jpayne@69 309
jpayne@69 310 for (i = 0; i < 3; i ++)
jpayne@69 311 {
jpayne@69 312 left_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
jpayne@69 313 right_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
jpayne@69 314 }
jpayne@69 315
jpayne@69 316 fp = Local_File_Open (Gaps_File_Path, "r");
jpayne@69 317 match_fp = Local_File_Open (Match_File_Path, "r");
jpayne@69 318 query_size = 100000;
jpayne@69 319 Query = (char *) Safe_malloc (query_size);
jpayne@69 320
jpayne@69 321 error_fp = Local_File_Open (Error_File_Name, "w");
jpayne@69 322
jpayne@69 323 while (fgets (line, MAX_LINE, fp) != NULL)
jpayne@69 324 {
jpayne@69 325 int line_len = strlen (line);
jpayne@69 326
jpayne@69 327 if (line [0] == '>')
jpayne@69 328 {
jpayne@69 329 if (! first_match)
jpayne@69 330 {
jpayne@69 331 total_errors += Extend_Forward (& ref_hi, & query_hi);
jpayne@69 332 max_len = 1 + ref_hi - ref_lo;
jpayne@69 333 if (1 + abs (query_hi - query_lo) > max_len)
jpayne@69 334 max_len = 1 + abs (query_hi - query_lo);
jpayne@69 335 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
jpayne@69 336 if (! Only_Difference_Positions)
jpayne@69 337 printf
jpayne@69 338 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
jpayne@69 339 ref_lo, ref_hi,
jpayne@69 340 is_forward ? query_lo : 1 + Query_Len - query_lo,
jpayne@69 341 is_forward ? query_hi : 1 + Query_Len - query_hi,
jpayne@69 342 total_errors, max_len, 100.0 * error);
jpayne@69 343
jpayne@69 344 match_ct ++;
jpayne@69 345 match_total += 1 + ref_hi - ref_lo;
jpayne@69 346 total_errors = 0;
jpayne@69 347 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
jpayne@69 348 if (is_forward)
jpayne@69 349 Add_Coverage (& query_cover_list, query_lo, query_hi);
jpayne@69 350 else
jpayne@69 351 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
jpayne@69 352 1 + Query_Len - query_lo);
jpayne@69 353 }
jpayne@69 354 if (Tag_From_Fasta_Line)
jpayne@69 355 {
jpayne@69 356 char buff [MAX_LINE];
jpayne@69 357 char * p;
jpayne@69 358
jpayne@69 359 strcpy (buff, line + 1);
jpayne@69 360 p = strtok (buff, " >\t\n");
jpayne@69 361 if (strlen (p) > 0)
jpayne@69 362 strcpy (query_tag, p);
jpayne@69 363 else
jpayne@69 364 {
jpayne@69 365 fprintf (stderr, "No tag on line %s", line);
jpayne@69 366 strcpy (query_tag, "???");
jpayne@69 367 }
jpayne@69 368 }
jpayne@69 369 is_forward = ! is_forward;
jpayne@69 370 if (is_forward)
jpayne@69 371 {
jpayne@69 372 assert (Read_String (match_fp, & Query, & query_size, header));
jpayne@69 373 Query_Len = strlen (Query + 1);
jpayne@69 374 if (Nucleotides_Only)
jpayne@69 375 {
jpayne@69 376 for (i = 1; i <= Query_Len; i ++)
jpayne@69 377 switch (Query [i])
jpayne@69 378 {
jpayne@69 379 case 'a' :
jpayne@69 380 case 'c' :
jpayne@69 381 case 'g' :
jpayne@69 382 case 't' :
jpayne@69 383 break;
jpayne@69 384 default :
jpayne@69 385 Query [i] = '1';
jpayne@69 386 }
jpayne@69 387 }
jpayne@69 388 }
jpayne@69 389 else
jpayne@69 390 Rev_Complement (Query + 1);
jpayne@69 391 first_match = TRUE;
jpayne@69 392 printf ("%s", line);
jpayne@69 393 fprintf (error_fp, "%s", line);
jpayne@69 394 }
jpayne@69 395 else if (line [0] == '#')
jpayne@69 396 {
jpayne@69 397 if (! first_match)
jpayne@69 398 {
jpayne@69 399 total_errors += Extend_Forward (& ref_hi, & query_hi);
jpayne@69 400 max_len = 1 + ref_hi - ref_lo;
jpayne@69 401 if (1 + abs (query_hi - query_lo) > max_len)
jpayne@69 402 max_len = 1 + abs (query_hi - query_lo);
jpayne@69 403 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
jpayne@69 404 if (! Only_Difference_Positions)
jpayne@69 405 printf
jpayne@69 406 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
jpayne@69 407 ref_lo, ref_hi,
jpayne@69 408 is_forward ? query_lo : 1 + Query_Len - query_lo,
jpayne@69 409 is_forward ? query_hi : 1 + Query_Len - query_hi,
jpayne@69 410 total_errors, max_len, 100.0 * error);
jpayne@69 411
jpayne@69 412 match_ct ++;
jpayne@69 413 match_total += 1 + ref_hi - ref_lo;
jpayne@69 414 total_errors = 0;
jpayne@69 415 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
jpayne@69 416 if (is_forward)
jpayne@69 417 Add_Coverage (& query_cover_list, query_lo, query_hi);
jpayne@69 418 else
jpayne@69 419 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
jpayne@69 420 1 + Query_Len - query_lo);
jpayne@69 421 }
jpayne@69 422 first_match = TRUE;
jpayne@69 423 printf ("%s", line);
jpayne@69 424 fprintf (error_fp, "%s", line);
jpayne@69 425 }
jpayne@69 426 else
jpayne@69 427 {
jpayne@69 428 assert (sscanf (line, "%ld %ld %ld",
jpayne@69 429 & ref_pos, & query_pos, & match_len) == 3);
jpayne@69 430 if (first_match)
jpayne@69 431 {
jpayne@69 432 ref_lo = ref_pos;
jpayne@69 433 query_lo = query_pos;
jpayne@69 434 total_errors += Extend_Backward (& ref_lo, & query_lo);
jpayne@69 435 if (! Only_Difference_Positions)
jpayne@69 436 printf ("%s", line);
jpayne@69 437 if (line_len > 0)
jpayne@69 438 line [line_len - 1] = '\0';
jpayne@69 439 fprintf (error_fp, "%s %7s\n", line, "-");
jpayne@69 440 }
jpayne@69 441 else
jpayne@69 442 {
jpayne@69 443 int errors;
jpayne@69 444 int a_len, b_len;
jpayne@69 445 int a_end, b_end, match_to_end;
jpayne@69 446 int delta [MAX_ERRORS], delta_len;
jpayne@69 447
jpayne@69 448 a_len = ref_pos - ref_hi - 1;
jpayne@69 449 b_len = query_pos - query_hi - 1;
jpayne@69 450
jpayne@69 451 errors = Prefix_Edit_Dist
jpayne@69 452 (Ref + ref_hi + 1, a_len,
jpayne@69 453 Query + query_hi + 1, b_len,
jpayne@69 454 MAX_ERRORS - 1, & a_end, & b_end,
jpayne@69 455 & match_to_end, delta, & delta_len, FALSE);
jpayne@69 456 if (Show_Differences)
jpayne@69 457 Show_Diffs (Ref + ref_hi + 1, a_end,
jpayne@69 458 Query + query_hi + 1, b_end,
jpayne@69 459 delta, delta_len, ref_hi + 1, query_hi + 1);
jpayne@69 460
jpayne@69 461 if (! Only_Difference_Positions)
jpayne@69 462 printf ("%s", line);
jpayne@69 463
jpayne@69 464 Set_Left_Pad (left_pad, ref_hi, query_hi, prior_match_len,
jpayne@69 465 DEFAULT_PAD);
jpayne@69 466 Set_Right_Pad (right_pad, ref_pos, query_pos, match_len,
jpayne@69 467 DEFAULT_PAD);
jpayne@69 468
jpayne@69 469 if (a_end == 0 && b_end == 0)
jpayne@69 470 {
jpayne@69 471 if (! Only_Difference_Positions)
jpayne@69 472 printf (" No extension\n");
jpayne@69 473
jpayne@69 474 if (line_len > 0)
jpayne@69 475 line [line_len - 1] = '\0';
jpayne@69 476 fprintf (error_fp, "%s %7s\n", line, "-");
jpayne@69 477 }
jpayne@69 478 else if (Only_Difference_Positions)
jpayne@69 479 {
jpayne@69 480 int q_start, b_incr;
jpayne@69 481
jpayne@69 482 if (is_forward)
jpayne@69 483 {
jpayne@69 484 q_start = query_hi + 1;
jpayne@69 485 b_incr = 1;
jpayne@69 486 }
jpayne@69 487 else
jpayne@69 488 {
jpayne@69 489 q_start = Query_Len - query_hi;
jpayne@69 490 b_incr = -1;
jpayne@69 491 }
jpayne@69 492 Display_Difference_Positions
jpayne@69 493 (Ref + ref_hi + 1, ref_hi + 1, a_end,
jpayne@69 494 Query + query_hi + 1, q_start, b_end,
jpayne@69 495 b_incr, delta, delta_len, ref_tag,
jpayne@69 496 query_tag);
jpayne@69 497 }
jpayne@69 498 else
jpayne@69 499 {
jpayne@69 500 printf (" Errors = %d\n", errors);
jpayne@69 501 if (! Only_Difference_Positions)
jpayne@69 502 Display_Alignment_With_Pad
jpayne@69 503 (Ref + ref_hi + 1, a_end,
jpayne@69 504 Query + query_hi + 1, b_end,
jpayne@69 505 delta, delta_len, left_pad, right_pad,
jpayne@69 506 DEFAULT_PAD);
jpayne@69 507 if (line_len > 0)
jpayne@69 508 line [line_len - 1] = '\0';
jpayne@69 509 fprintf (error_fp, "%s %7d\n", line, errors);
jpayne@69 510 }
jpayne@69 511
jpayne@69 512 total_errors += errors;
jpayne@69 513
jpayne@69 514 if (! match_to_end)
jpayne@69 515 {
jpayne@69 516 ref_hi += a_end;
jpayne@69 517 query_hi += b_end;
jpayne@69 518 max_len = 1 + ref_hi - ref_lo;
jpayne@69 519 if (1 + abs (query_hi - query_lo) > max_len)
jpayne@69 520 max_len = 1 + abs (query_hi - query_lo);
jpayne@69 521 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
jpayne@69 522 if (! Only_Difference_Positions)
jpayne@69 523 printf (
jpayne@69 524 "Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
jpayne@69 525 ref_lo, ref_hi,
jpayne@69 526 is_forward ? query_lo : 1 + Query_Len - query_lo,
jpayne@69 527 is_forward ? query_hi : 1 + Query_Len - query_hi,
jpayne@69 528 total_errors, max_len, 100.0 * error);
jpayne@69 529
jpayne@69 530 match_ct ++;
jpayne@69 531 match_total += 1 + ref_hi - ref_lo;
jpayne@69 532 total_errors = 0;
jpayne@69 533 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
jpayne@69 534 if (is_forward)
jpayne@69 535 Add_Coverage (& query_cover_list, query_lo, query_hi);
jpayne@69 536 else
jpayne@69 537 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
jpayne@69 538 1 + Query_Len - query_lo);
jpayne@69 539 ref_lo = ref_pos;
jpayne@69 540 query_lo = query_pos;
jpayne@69 541 total_errors += Extend_Backward (& ref_lo, & query_lo);
jpayne@69 542 }
jpayne@69 543 }
jpayne@69 544
jpayne@69 545 ref_hi = ref_pos + match_len - 1;
jpayne@69 546 query_hi = query_pos + match_len - 1;
jpayne@69 547 prior_match_len = match_len;
jpayne@69 548 first_match = FALSE;
jpayne@69 549 }
jpayne@69 550 }
jpayne@69 551
jpayne@69 552 if (! first_match)
jpayne@69 553 {
jpayne@69 554 total_errors += Extend_Forward (& ref_hi, & query_hi);
jpayne@69 555 max_len = 1 + ref_hi - ref_lo;
jpayne@69 556 if (1 + abs (query_hi - query_lo) > max_len)
jpayne@69 557 max_len = 1 + abs (query_hi - query_lo);
jpayne@69 558 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
jpayne@69 559 if (! Only_Difference_Positions)
jpayne@69 560 printf
jpayne@69 561 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
jpayne@69 562 ref_lo, ref_hi,
jpayne@69 563 is_forward ? query_lo : 1 + Query_Len - query_lo,
jpayne@69 564 is_forward ? query_hi : 1 + Query_Len - query_hi,
jpayne@69 565 total_errors, max_len, 100.0 * error);
jpayne@69 566
jpayne@69 567 match_ct ++;
jpayne@69 568 match_total += 1 + ref_hi - ref_lo;
jpayne@69 569 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
jpayne@69 570 if (is_forward)
jpayne@69 571 Add_Coverage (& query_cover_list, query_lo, query_hi);
jpayne@69 572 else
jpayne@69 573 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
jpayne@69 574 1 + Query_Len - query_lo);
jpayne@69 575 }
jpayne@69 576
jpayne@69 577 fprintf (stderr, " Ref len = %ld\n", Ref_Len);
jpayne@69 578 fprintf (stderr, " acgt's = %ld\n", Ref_Len - Fill_Ct);
jpayne@69 579 fprintf (stderr, " Non acgt's = %d\n", Fill_Ct);
jpayne@69 580 fprintf (stderr, " Number of matches = %d\n", match_ct);
jpayne@69 581 fprintf (stderr, "Sum of match bases = %.0f\n", match_total);
jpayne@69 582 fprintf (stderr, " Avg match bases = %.0f\n",
jpayne@69 583 match_ct == 0 ? 0.0 : match_total / match_ct);
jpayne@69 584
jpayne@69 585 fclose (error_fp);
jpayne@69 586
jpayne@69 587 if (Output_Cover_Files)
jpayne@69 588 {
jpayne@69 589 strcpy (filename, ref_tag);
jpayne@69 590 strcat (filename, ".cover");
jpayne@69 591 Show_Coverage (ref_cover_list, filename, ref_tag, Ref);
jpayne@69 592
jpayne@69 593 strcpy (filename, query_tag);
jpayne@69 594 strcat (filename, ".cover");
jpayne@69 595 Rev_Complement (Query + 1);
jpayne@69 596 Show_Coverage (query_cover_list, filename, query_tag, Query);
jpayne@69 597 }
jpayne@69 598
jpayne@69 599 return 0;
jpayne@69 600 }
jpayne@69 601
jpayne@69 602
jpayne@69 603
jpayne@69 604 void Add_Coverage
jpayne@69 605 (Cover_t * * list, long int lo, long int hi)
jpayne@69 606
jpayne@69 607 // Add lo .. hi to list of regions covered in (* list) .
jpayne@69 608 // Combine nodes when appropriate.
jpayne@69 609
jpayne@69 610 {
jpayne@69 611 Cover_t * new_node, * p, * prev = NULL;
jpayne@69 612
jpayne@69 613 if ((* list) == NULL || hi + 1 < (* list) -> lo)
jpayne@69 614 {
jpayne@69 615 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
jpayne@69 616 new_node -> lo = lo;
jpayne@69 617 new_node -> hi = hi;
jpayne@69 618 new_node -> next = (* list);
jpayne@69 619 (* list) = new_node;
jpayne@69 620 return;
jpayne@69 621 }
jpayne@69 622
jpayne@69 623 for (p = (* list); p != NULL && lo - 1 > p -> hi; p = p -> next)
jpayne@69 624 prev = p;
jpayne@69 625
jpayne@69 626 if (p == NULL || hi + 1 < p -> lo)
jpayne@69 627 { // insert between or on end
jpayne@69 628 assert (prev != NULL);
jpayne@69 629 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
jpayne@69 630 new_node -> lo = lo;
jpayne@69 631 new_node -> hi = hi;
jpayne@69 632 new_node -> next = p;
jpayne@69 633 prev -> next = new_node;
jpayne@69 634 return;
jpayne@69 635 }
jpayne@69 636
jpayne@69 637 if (lo < p -> lo)
jpayne@69 638 p -> lo = lo;
jpayne@69 639 while (p -> next != NULL && hi + 1 >= p -> next -> lo)
jpayne@69 640 {
jpayne@69 641 Cover_t * save;
jpayne@69 642
jpayne@69 643 p -> hi = p -> next -> hi;
jpayne@69 644 save = p -> next;
jpayne@69 645 p -> next = save -> next;
jpayne@69 646 free (save);
jpayne@69 647 }
jpayne@69 648 if (hi > p -> hi)
jpayne@69 649 p -> hi = hi;
jpayne@69 650
jpayne@69 651 return;
jpayne@69 652 }
jpayne@69 653
jpayne@69 654
jpayne@69 655
jpayne@69 656 int Binomial_Bound
jpayne@69 657 (int e, double p, int Start, double Limit)
jpayne@69 658
jpayne@69 659 // Return the smallest n >= Start s.t.
jpayne@69 660 // prob [>= e errors in n binomial trials (p = error prob)]
jpayne@69 661 // > Limit
jpayne@69 662
jpayne@69 663 {
jpayne@69 664 double Normal_Z, Mu_Power, Factorial, Poisson_Coeff;
jpayne@69 665 double q, Sum, P_Power, Q_Power, X;
jpayne@69 666 int k, n, Bin_Coeff, Ct;
jpayne@69 667
jpayne@69 668 q = 1.0 - p;
jpayne@69 669 if (Start < e)
jpayne@69 670 Start = e;
jpayne@69 671
jpayne@69 672 for (n = Start; n < MAX_FRAG_LEN; n ++)
jpayne@69 673 {
jpayne@69 674 if (n <= 35)
jpayne@69 675 {
jpayne@69 676 Sum = 0.0;
jpayne@69 677 Bin_Coeff = 1;
jpayne@69 678 Ct = 0;
jpayne@69 679 P_Power = 1.0;
jpayne@69 680 Q_Power = pow (q, (double) n);
jpayne@69 681
jpayne@69 682 for (k = 0; k < e && 1.0 - Sum > Limit; k ++)
jpayne@69 683 {
jpayne@69 684 X = Bin_Coeff * P_Power * Q_Power;
jpayne@69 685 Sum += X;
jpayne@69 686 Bin_Coeff *= n - Ct;
jpayne@69 687 Bin_Coeff /= ++ Ct;
jpayne@69 688 P_Power *= p;
jpayne@69 689 Q_Power /= q;
jpayne@69 690 }
jpayne@69 691 if (1.0 - Sum > Limit)
jpayne@69 692 return n;
jpayne@69 693 }
jpayne@69 694 else
jpayne@69 695 {
jpayne@69 696 Normal_Z = (e - 0.5 - n * p) / sqrt ((double)(n * p * q));
jpayne@69 697 if (Normal_Z <= NORMAL_DISTRIB_THOLD)
jpayne@69 698 return n;
jpayne@69 699 Sum = 0.0;
jpayne@69 700 Mu_Power = 1.0;
jpayne@69 701 Factorial = 1.0;
jpayne@69 702 Poisson_Coeff = exp (- (double) n * p);
jpayne@69 703 for (k = 0; k < e; k ++)
jpayne@69 704 {
jpayne@69 705 Sum += Mu_Power * Poisson_Coeff / Factorial;
jpayne@69 706 Mu_Power *= n * p;
jpayne@69 707 Factorial *= k + 1;
jpayne@69 708 }
jpayne@69 709 if (1.0 - Sum > Limit)
jpayne@69 710 return n;
jpayne@69 711 }
jpayne@69 712 }
jpayne@69 713
jpayne@69 714 return MAX_FRAG_LEN;
jpayne@69 715 }
jpayne@69 716
jpayne@69 717
jpayne@69 718
jpayne@69 719 #define DISPLAY_WIDTH 60
jpayne@69 720
jpayne@69 721 void Display_Alignment
jpayne@69 722 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
jpayne@69 723
jpayne@69 724 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
jpayne@69 725 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
jpayne@69 726
jpayne@69 727 {
jpayne@69 728 int i, j, k, m, top_len, bottom_len;
jpayne@69 729 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
jpayne@69 730 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
jpayne@69 731
jpayne@69 732 i = j = top_len = bottom_len = 0;
jpayne@69 733 for (k = 0; k < delta_ct; k ++)
jpayne@69 734 {
jpayne@69 735 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 736 {
jpayne@69 737 top [top_len ++] = a [i ++];
jpayne@69 738 j ++;
jpayne@69 739 }
jpayne@69 740 if (delta [k] < 0)
jpayne@69 741 {
jpayne@69 742 top [top_len ++] = '-';
jpayne@69 743 j ++;
jpayne@69 744 }
jpayne@69 745 else
jpayne@69 746 {
jpayne@69 747 top [top_len ++] = a [i ++];
jpayne@69 748 }
jpayne@69 749 }
jpayne@69 750 while (i < a_len && j < b_len)
jpayne@69 751 {
jpayne@69 752 top [top_len ++] = a [i ++];
jpayne@69 753 j ++;
jpayne@69 754 }
jpayne@69 755 top [top_len] = '\0';
jpayne@69 756
jpayne@69 757
jpayne@69 758 i = j = 0;
jpayne@69 759 for (k = 0; k < delta_ct; k ++)
jpayne@69 760 {
jpayne@69 761 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 762 {
jpayne@69 763 bottom [bottom_len ++] = b [j ++];
jpayne@69 764 i ++;
jpayne@69 765 }
jpayne@69 766 if (delta [k] > 0)
jpayne@69 767 {
jpayne@69 768 bottom [bottom_len ++] = '-';
jpayne@69 769 i ++;
jpayne@69 770 }
jpayne@69 771 else
jpayne@69 772 {
jpayne@69 773 bottom [bottom_len ++] = b [j ++];
jpayne@69 774 }
jpayne@69 775 }
jpayne@69 776 while (j < b_len && i < a_len)
jpayne@69 777 {
jpayne@69 778 bottom [bottom_len ++] = b [j ++];
jpayne@69 779 i ++;
jpayne@69 780 }
jpayne@69 781 bottom [bottom_len] = '\0';
jpayne@69 782
jpayne@69 783
jpayne@69 784 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
jpayne@69 785 {
jpayne@69 786 printf ("A: ");
jpayne@69 787 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
jpayne@69 788 putchar (top [i + j]);
jpayne@69 789 putchar ('\n');
jpayne@69 790 printf ("B: ");
jpayne@69 791 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
jpayne@69 792 putchar (bottom [i + j]);
jpayne@69 793 putchar ('\n');
jpayne@69 794 printf (" ");
jpayne@69 795 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
jpayne@69 796 j ++)
jpayne@69 797 if (top [i + j] != ' ' && bottom [i + j] != ' '
jpayne@69 798 && top [i + j] != bottom [i + j])
jpayne@69 799 putchar ('^');
jpayne@69 800 else
jpayne@69 801 putchar (' ');
jpayne@69 802 putchar ('\n');
jpayne@69 803 }
jpayne@69 804
jpayne@69 805 return;
jpayne@69 806 }
jpayne@69 807
jpayne@69 808
jpayne@69 809
jpayne@69 810 void Display_Alignment_With_Pad
jpayne@69 811 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
jpayne@69 812 char * left_pad [3], char * right_pad [3], int pad_len)
jpayne@69 813
jpayne@69 814 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
jpayne@69 815 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
jpayne@69 816 // Attach pad strings in left_pad to the beginning of the alignment
jpayne@69 817 // and the strings in right_pad to the end of the alignment.
jpayne@69 818 // pad_len is the width of the pad strings
jpayne@69 819
jpayne@69 820 {
jpayne@69 821 int i, j, k, m, top_len, bottom_len;
jpayne@69 822 #if 0
jpayne@69 823 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
jpayne@69 824 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
jpayne@69 825 #else
jpayne@69 826 string top, bottom;
jpayne@69 827 #endif
jpayne@69 828
jpayne@69 829 for (k = 0; k < pad_len; k ++)
jpayne@69 830 {
jpayne@69 831 top . push_back (left_pad [0] [k]);
jpayne@69 832 bottom . push_back (left_pad [1] [k]);
jpayne@69 833 }
jpayne@69 834
jpayne@69 835 i = j = 0;
jpayne@69 836 for (k = 0; k < delta_ct; k ++)
jpayne@69 837 {
jpayne@69 838 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 839 {
jpayne@69 840 top . push_back (a [i ++]);
jpayne@69 841 j ++;
jpayne@69 842 }
jpayne@69 843 if (delta [k] < 0)
jpayne@69 844 {
jpayne@69 845 top . push_back ('-');
jpayne@69 846 j ++;
jpayne@69 847 }
jpayne@69 848 else
jpayne@69 849 {
jpayne@69 850 top . push_back (a [i ++]);
jpayne@69 851 }
jpayne@69 852 }
jpayne@69 853 while (i < a_len && j < b_len)
jpayne@69 854 {
jpayne@69 855 top . push_back (a [i ++]);
jpayne@69 856 j ++;
jpayne@69 857 }
jpayne@69 858
jpayne@69 859 i = j = 0;
jpayne@69 860 for (k = 0; k < delta_ct; k ++)
jpayne@69 861 {
jpayne@69 862 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 863 {
jpayne@69 864 bottom . push_back (b [j ++]);
jpayne@69 865 i ++;
jpayne@69 866 }
jpayne@69 867 if (delta [k] > 0)
jpayne@69 868 {
jpayne@69 869 bottom . push_back ('-');
jpayne@69 870 i ++;
jpayne@69 871 }
jpayne@69 872 else
jpayne@69 873 {
jpayne@69 874 bottom . push_back (b [j ++]);
jpayne@69 875 }
jpayne@69 876 }
jpayne@69 877 while (j < b_len && i < a_len)
jpayne@69 878 {
jpayne@69 879 bottom . push_back (b [j ++]);
jpayne@69 880 i ++;
jpayne@69 881 }
jpayne@69 882
jpayne@69 883 for (k = 0; k < pad_len; k ++)
jpayne@69 884 {
jpayne@69 885 top . push_back (right_pad [0] [k]);
jpayne@69 886 bottom . push_back (right_pad [1] [k]);
jpayne@69 887 }
jpayne@69 888
jpayne@69 889 top_len = top . length ();
jpayne@69 890 bottom_len = bottom . length ();
jpayne@69 891
jpayne@69 892 if (top_len != bottom_len)
jpayne@69 893 {
jpayne@69 894 fprintf (stderr, "ERROR: Alignment length mismatch top = %d bottom = %d\n",
jpayne@69 895 top_len, bottom_len);
jpayne@69 896 exit (EXIT_FAILURE);
jpayne@69 897 }
jpayne@69 898
jpayne@69 899 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
jpayne@69 900 {
jpayne@69 901 printf ("A: ");
jpayne@69 902 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
jpayne@69 903 putchar (top [i + j]);
jpayne@69 904 putchar ('\n');
jpayne@69 905 printf ("B: ");
jpayne@69 906 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
jpayne@69 907 putchar (bottom [i + j]);
jpayne@69 908 putchar ('\n');
jpayne@69 909 printf (" ");
jpayne@69 910 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
jpayne@69 911 j ++)
jpayne@69 912 if (i + j < pad_len)
jpayne@69 913 putchar (left_pad [2] [i + j]);
jpayne@69 914 else if (top_len - i - j <= pad_len)
jpayne@69 915 putchar (right_pad [2] [i + j - (top_len - pad_len)]);
jpayne@69 916 else if (top [i + j] != ' ' && bottom [i + j] != ' '
jpayne@69 917 && top [i + j] != bottom [i + j])
jpayne@69 918 putchar ('^');
jpayne@69 919 else
jpayne@69 920 putchar (' ');
jpayne@69 921 putchar ('\n');
jpayne@69 922 }
jpayne@69 923
jpayne@69 924 return;
jpayne@69 925 }
jpayne@69 926
jpayne@69 927
jpayne@69 928
jpayne@69 929 void Display_Difference_Positions
jpayne@69 930 (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
jpayne@69 931 int b_incr, int delta [], int delta_ct, const char * a_tag,
jpayne@69 932 const char * b_tag)
jpayne@69 933
jpayne@69 934 // Show (to stdout ) the positions indicated in delta [0 .. (delta_ct - 1)]
jpayne@69 935 // between strings a [0 .. (a_len - 1)] and
jpayne@69 936 // b [0 .. (b_len - 1))] . The positions of the starts of the strings
jpayne@69 937 // are at a_start and b_start , respectively, and b_incr indicates
jpayne@69 938 // if the b postions increase or decrease. b_incr must be 1 or -1.
jpayne@69 939 // Use a_tag and b_tag to label the positions.
jpayne@69 940
jpayne@69 941 {
jpayne@69 942 int i, j, k, m;
jpayne@69 943
jpayne@69 944 assert (b_incr == 1 || b_incr == -1);
jpayne@69 945
jpayne@69 946 i = j = 0;
jpayne@69 947 for (k = 0; k < delta_ct; k ++)
jpayne@69 948 {
jpayne@69 949 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 950 {
jpayne@69 951 if (a [i] != b [j])
jpayne@69 952 printf ("%-15s %-15s %8d %8d %c %c\n",
jpayne@69 953 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
jpayne@69 954 i ++;
jpayne@69 955 j ++;
jpayne@69 956 }
jpayne@69 957 if (delta [k] < 0)
jpayne@69 958 {
jpayne@69 959 printf ("%-15s %-15s %8d %8d %c %c\n",
jpayne@69 960 a_tag, b_tag, a_start + i, b_start + j * b_incr, '-', b [j]);
jpayne@69 961 j ++;
jpayne@69 962 }
jpayne@69 963 else
jpayne@69 964 {
jpayne@69 965 if (b_incr > 0)
jpayne@69 966 printf ("%-15s %-15s %8d %8d %c %c\n",
jpayne@69 967 a_tag, b_tag, a_start + i, b_start + j, a [i], '-');
jpayne@69 968 else
jpayne@69 969 printf ("%-15s %-15s %8d %8d %c %c\n",
jpayne@69 970 a_tag, b_tag, a_start + i, b_start - j + 1, a [i], '-');
jpayne@69 971 i ++;
jpayne@69 972 }
jpayne@69 973 }
jpayne@69 974 while (i < a_len && j < b_len)
jpayne@69 975 {
jpayne@69 976 if (a [i] != b [j])
jpayne@69 977 printf ("%-15s %-15s %8d %8d %c %c\n",
jpayne@69 978 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
jpayne@69 979 i ++;
jpayne@69 980 j ++;
jpayne@69 981 }
jpayne@69 982
jpayne@69 983 return;
jpayne@69 984 }
jpayne@69 985
jpayne@69 986
jpayne@69 987
jpayne@69 988 int Extend_Backward
jpayne@69 989 (long int * ref_lo, long int * query_lo)
jpayne@69 990
jpayne@69 991 // Do edit distance off end of match beginning at (* ref_lo) and
jpayne@69 992 // (* query_lo) in the reference and query sequences, resp.,
jpayne@69 993 // in the reverse direction.
jpayne@69 994 // Return the number of errors in the extension and set (* ref_hi)
jpayne@69 995 // and (* query_hi) to the end of the extension.
jpayne@69 996
jpayne@69 997 {
jpayne@69 998 int ct = 0, errors, sum = 0;
jpayne@69 999 int a_len, b_len;
jpayne@69 1000 int a_end, b_end, match_to_end;
jpayne@69 1001 int delta [MAX_ERRORS], delta_len;
jpayne@69 1002
jpayne@69 1003 do
jpayne@69 1004 {
jpayne@69 1005 a_len = (* ref_lo) - 1;
jpayne@69 1006 if (a_len > MAX_EXTENSION)
jpayne@69 1007 a_len = MAX_EXTENSION;
jpayne@69 1008 b_len = (* query_lo) - 1;
jpayne@69 1009 if (b_len > MAX_EXTENSION)
jpayne@69 1010 b_len = MAX_EXTENSION;
jpayne@69 1011
jpayne@69 1012 errors = Rev_Prefix_Edit_Dist
jpayne@69 1013 (Ref + (* ref_lo) - 1, a_len,
jpayne@69 1014 Query + (* query_lo) - 1, b_len,
jpayne@69 1015 MAX_ERRORS - 1, & a_end, & b_end,
jpayne@69 1016 & match_to_end, delta, & delta_len, TRUE);
jpayne@69 1017 if (Show_Differences)
jpayne@69 1018 Rev_Show_Diffs
jpayne@69 1019 (Ref + (* ref_lo - 1), a_end,
jpayne@69 1020 Query + (* query_lo - 1), b_end,
jpayne@69 1021 delta, delta_len, (* ref_lo) - 1, (* query_lo) - 1);
jpayne@69 1022 if (Verbose > 0)
jpayne@69 1023 {
jpayne@69 1024 printf ("revextend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n",
jpayne@69 1025 ++ ct, (* ref_lo) - 1, (* query_lo) - 1, a_len, b_len,
jpayne@69 1026 errors, match_to_end ? 'T' : 'F', a_end, b_end);
jpayne@69 1027 Rev_Display_Alignment
jpayne@69 1028 (Ref + (* ref_lo) - 1, a_end, Query + (* query_lo) - 1, b_end,
jpayne@69 1029 delta, delta_len);
jpayne@69 1030 }
jpayne@69 1031
jpayne@69 1032 (* ref_lo) -= a_end;
jpayne@69 1033 (* query_lo) -= b_end;
jpayne@69 1034 sum += errors;
jpayne@69 1035 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
jpayne@69 1036
jpayne@69 1037 return sum;
jpayne@69 1038 }
jpayne@69 1039
jpayne@69 1040
jpayne@69 1041
jpayne@69 1042 int Extend_Forward
jpayne@69 1043 (long int * ref_hi, long int * query_hi)
jpayne@69 1044
jpayne@69 1045 // Do edit distance off end of match ending at (* ref_hi) and
jpayne@69 1046 // (* query_hi) in the reference and query sequences, resp.
jpayne@69 1047 // Return the number of errors in the extension and set (* ref_hi)
jpayne@69 1048 // and (* query_hi) to the end of the extension.
jpayne@69 1049
jpayne@69 1050 {
jpayne@69 1051 int ct = 0, errors, sum = 0;
jpayne@69 1052 int a_end, b_end, match_to_end;
jpayne@69 1053 int a_len, b_len;
jpayne@69 1054 int delta [MAX_ERRORS], delta_len;
jpayne@69 1055
jpayne@69 1056 do
jpayne@69 1057 {
jpayne@69 1058 a_len = Ref_Len - (* ref_hi);
jpayne@69 1059 if (a_len > MAX_EXTENSION)
jpayne@69 1060 a_len = MAX_EXTENSION;
jpayne@69 1061 b_len = Query_Len - (* query_hi);
jpayne@69 1062 if (b_len > MAX_EXTENSION)
jpayne@69 1063 b_len = MAX_EXTENSION;
jpayne@69 1064
jpayne@69 1065 errors = Prefix_Edit_Dist
jpayne@69 1066 (Ref + (* ref_hi) + 1, a_len,
jpayne@69 1067 Query + (* query_hi) + 1, b_len,
jpayne@69 1068 MAX_ERRORS - 1, & a_end, & b_end,
jpayne@69 1069 & match_to_end, delta, & delta_len, TRUE);
jpayne@69 1070 if (Show_Differences)
jpayne@69 1071 Show_Diffs (Ref + (* ref_hi) + 1, a_end,
jpayne@69 1072 Query + (* query_hi) + 1, b_end,
jpayne@69 1073 delta, delta_len, (* ref_hi) + 1, (* query_hi) + 1);
jpayne@69 1074 if (Verbose > 0)
jpayne@69 1075 {
jpayne@69 1076 printf ("extend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n",
jpayne@69 1077 ++ ct, (* ref_hi) + 1, (* query_hi) + 1, a_len, b_len,
jpayne@69 1078 errors, match_to_end ? 'T' : 'F', a_end, b_end);
jpayne@69 1079 Display_Alignment (Ref + (* ref_hi) + 1, a_end,
jpayne@69 1080 Query + (* query_hi) + 1, b_end,
jpayne@69 1081 delta, delta_len);
jpayne@69 1082 }
jpayne@69 1083
jpayne@69 1084 (* ref_hi) += a_end;
jpayne@69 1085 (* query_hi) += b_end;
jpayne@69 1086 sum += errors;
jpayne@69 1087 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
jpayne@69 1088
jpayne@69 1089 return sum;
jpayne@69 1090 }
jpayne@69 1091
jpayne@69 1092
jpayne@69 1093 void Initialize_Globals
jpayne@69 1094 (void)
jpayne@69 1095
jpayne@69 1096 // Initialize global variables used in this program
jpayne@69 1097
jpayne@69 1098 {
jpayne@69 1099 int i, offset, del;
jpayne@69 1100 int e, start;
jpayne@69 1101
jpayne@69 1102 offset = 2;
jpayne@69 1103 del = 6;
jpayne@69 1104 for (i = 0; i < MAX_ERRORS; i ++)
jpayne@69 1105 {
jpayne@69 1106 Edit_Array [i] = Edit_Space + offset;
jpayne@69 1107 offset += del;
jpayne@69 1108 del += 2;
jpayne@69 1109 }
jpayne@69 1110
jpayne@69 1111
jpayne@69 1112 for (i = 0; i <= ERRORS_FOR_FREE; i ++)
jpayne@69 1113 Edit_Match_Limit [i] = 0;
jpayne@69 1114
jpayne@69 1115 start = 1;
jpayne@69 1116 for (e = ERRORS_FOR_FREE + 1; e < MAX_ERRORS; e ++)
jpayne@69 1117 {
jpayne@69 1118 start = Binomial_Bound (e - ERRORS_FOR_FREE, MAX_ERROR_RATE,
jpayne@69 1119 start, EDIT_DIST_PROB_BOUND);
jpayne@69 1120 Edit_Match_Limit [e] = start - 1;
jpayne@69 1121 assert (Edit_Match_Limit [e] >= Edit_Match_Limit [e - 1]);
jpayne@69 1122 }
jpayne@69 1123
jpayne@69 1124 for (i = 0; i <= MAX_FRAG_LEN; i ++)
jpayne@69 1125 Error_Bound [i] = Max_int (10, 1 + (int) (2 * i * MAX_ERROR_RATE));
jpayne@69 1126
jpayne@69 1127 return;
jpayne@69 1128 }
jpayne@69 1129
jpayne@69 1130
jpayne@69 1131
jpayne@69 1132 FILE * Local_File_Open
jpayne@69 1133 (const char * filename, const char * mode)
jpayne@69 1134
jpayne@69 1135 /* Open Filename in Mode and return a pointer to its control
jpayne@69 1136 * block. If fail, print a message and exit. */
jpayne@69 1137
jpayne@69 1138 {
jpayne@69 1139 FILE * fp;
jpayne@69 1140
jpayne@69 1141 fp = fopen (filename, mode);
jpayne@69 1142 if (fp == NULL)
jpayne@69 1143 {
jpayne@69 1144 fprintf (stderr, "ERROR %d: Could not open file %s \n",
jpayne@69 1145 errno, filename);
jpayne@69 1146 perror (strerror (errno));
jpayne@69 1147 exit (EXIT_FAILURE);
jpayne@69 1148 }
jpayne@69 1149
jpayne@69 1150 return fp;
jpayne@69 1151 }
jpayne@69 1152
jpayne@69 1153
jpayne@69 1154
jpayne@69 1155 int Max_int
jpayne@69 1156 (int a, int b)
jpayne@69 1157
jpayne@69 1158 // Return the larger of a and b .
jpayne@69 1159
jpayne@69 1160 {
jpayne@69 1161 if (a < b)
jpayne@69 1162 return b;
jpayne@69 1163
jpayne@69 1164 return a;
jpayne@69 1165 }
jpayne@69 1166
jpayne@69 1167
jpayne@69 1168
jpayne@69 1169 int Min_int
jpayne@69 1170 (int a, int b)
jpayne@69 1171
jpayne@69 1172 // Return the smaller of a and b .
jpayne@69 1173
jpayne@69 1174 {
jpayne@69 1175 if (a < b)
jpayne@69 1176 return a;
jpayne@69 1177
jpayne@69 1178 return b;
jpayne@69 1179 }
jpayne@69 1180
jpayne@69 1181
jpayne@69 1182
jpayne@69 1183 void Parse_Command_Line
jpayne@69 1184 (int argc, char * argv [])
jpayne@69 1185
jpayne@69 1186 // Get options and parameters from command line with argc
jpayne@69 1187 // arguments in argv [0 .. (argc - 1)] .
jpayne@69 1188
jpayne@69 1189 {
jpayne@69 1190 int ch, errflg = FALSE;
jpayne@69 1191
jpayne@69 1192 optarg = NULL;
jpayne@69 1193
jpayne@69 1194 while (! errflg
jpayne@69 1195 && ((ch = getopt (argc, argv, "De:nN:q:r:Stv:xW:")) != EOF))
jpayne@69 1196 switch (ch)
jpayne@69 1197 {
jpayne@69 1198 case 'D' :
jpayne@69 1199 Only_Difference_Positions = true;
jpayne@69 1200 break;
jpayne@69 1201
jpayne@69 1202 case 'e' :
jpayne@69 1203 Percent_ID = 1.0 - atof (optarg);
jpayne@69 1204 UserScoring = TRUE;
jpayne@69 1205 break;
jpayne@69 1206
jpayne@69 1207 case 'n' :
jpayne@69 1208 Nucleotides_Only = TRUE;
jpayne@69 1209 break;
jpayne@69 1210
jpayne@69 1211 case 'N' :
jpayne@69 1212 Consec_Non_ACGT = strtol (optarg, NULL, 10);
jpayne@69 1213 break;
jpayne@69 1214
jpayne@69 1215 case 'q' :
jpayne@69 1216 Query_Suffix = optarg;
jpayne@69 1217 break;
jpayne@69 1218
jpayne@69 1219 case 'r' :
jpayne@69 1220 Ref_Suffix = optarg;
jpayne@69 1221 break;
jpayne@69 1222
jpayne@69 1223 case 'S' :
jpayne@69 1224 Show_Differences = TRUE;
jpayne@69 1225 break;
jpayne@69 1226
jpayne@69 1227 case 't' :
jpayne@69 1228 Tag_From_Fasta_Line = TRUE;
jpayne@69 1229 break;
jpayne@69 1230
jpayne@69 1231 case 'v' :
jpayne@69 1232 Verbose = strtol (optarg, NULL, 10);
jpayne@69 1233 break;
jpayne@69 1234
jpayne@69 1235 case 'x' :
jpayne@69 1236 Output_Cover_Files = FALSE;
jpayne@69 1237 break;
jpayne@69 1238
jpayne@69 1239 case 'W' :
jpayne@69 1240 Error_File_Name = optarg;
jpayne@69 1241 break;
jpayne@69 1242
jpayne@69 1243 case '?' :
jpayne@69 1244 fprintf (stderr, "Unrecognized option -%c\n", optopt);
jpayne@69 1245
jpayne@69 1246 default :
jpayne@69 1247 errflg = TRUE;
jpayne@69 1248 }
jpayne@69 1249
jpayne@69 1250 if (errflg || optind != argc - 3)
jpayne@69 1251 {
jpayne@69 1252 Usage (argv [0]);
jpayne@69 1253 exit (EXIT_FAILURE);
jpayne@69 1254 }
jpayne@69 1255
jpayne@69 1256 Ref_File_Path = argv [optind ++];
jpayne@69 1257
jpayne@69 1258 Match_File_Path = argv [optind ++];
jpayne@69 1259
jpayne@69 1260 Gaps_File_Path = argv [optind ++];
jpayne@69 1261
jpayne@69 1262 return;
jpayne@69 1263 }
jpayne@69 1264
jpayne@69 1265
jpayne@69 1266
jpayne@69 1267 int Prefix_Edit_Dist
jpayne@69 1268 (char A [], int m, char T [], int n, int Error_Limit,
jpayne@69 1269 int * A_End, int * T_End, int * Match_To_End,
jpayne@69 1270 int Delta [MAX_ERRORS], int * Delta_Len, int extending)
jpayne@69 1271
jpayne@69 1272 // Return the minimum number of changes (inserts, deletes, replacements)
jpayne@69 1273 // needed to match string A [0 .. (m-1)] with a prefix of string
jpayne@69 1274 // T [0 .. (n-1)] if it's not more than Error_Limit .
jpayne@69 1275 // Put delta description of alignment in Delta and set
jpayne@69 1276 // (* Delta_Len) to the number of entries there if it's a complete
jpayne@69 1277 // match.
jpayne@69 1278 // Set A_End and T_End to the rightmost positions where the
jpayne@69 1279 // alignment ended in A and T , respectively.
jpayne@69 1280 // Set Match_To_End true if the match extended to the end
jpayne@69 1281 // of at least one string; otherwise, set it false to indicate
jpayne@69 1282 // a branch point.
jpayne@69 1283 // extending indicates whether the match is for a fixed region (FALSE)
jpayne@69 1284 // or is extending off the end of a match as far as possible (TRUE)
jpayne@69 1285
jpayne@69 1286 {
jpayne@69 1287 double Score, Max_Score;
jpayne@69 1288 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
jpayne@69 1289 int Best_d, Best_e, Longest, Row;
jpayne@69 1290 int Left, Right;
jpayne@69 1291 int d, e, i, j, shorter;
jpayne@69 1292
jpayne@69 1293 // assert (m <= n);
jpayne@69 1294 Best_d = Best_e = Longest = 0;
jpayne@69 1295 (* Delta_Len) = 0;
jpayne@69 1296
jpayne@69 1297 if (Consec_Non_ACGT > 0)
jpayne@69 1298 {
jpayne@69 1299 int ct;
jpayne@69 1300
jpayne@69 1301 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++)
jpayne@69 1302 {
jpayne@69 1303 switch (A [i])
jpayne@69 1304 {
jpayne@69 1305 case 'a' :
jpayne@69 1306 case 'c' :
jpayne@69 1307 case 'g' :
jpayne@69 1308 case 't' :
jpayne@69 1309 ct = 0;
jpayne@69 1310 break;
jpayne@69 1311 default :
jpayne@69 1312 ct ++;
jpayne@69 1313 }
jpayne@69 1314 }
jpayne@69 1315 if (ct >= Consec_Non_ACGT)
jpayne@69 1316 {
jpayne@69 1317 m = i - ct;
jpayne@69 1318 extending = TRUE;
jpayne@69 1319 }
jpayne@69 1320
jpayne@69 1321 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++)
jpayne@69 1322 {
jpayne@69 1323 switch (T [i])
jpayne@69 1324 {
jpayne@69 1325 case 'a' :
jpayne@69 1326 case 'c' :
jpayne@69 1327 case 'g' :
jpayne@69 1328 case 't' :
jpayne@69 1329 ct = 0;
jpayne@69 1330 break;
jpayne@69 1331 default :
jpayne@69 1332 ct ++;
jpayne@69 1333 }
jpayne@69 1334 }
jpayne@69 1335 if (ct >= Consec_Non_ACGT)
jpayne@69 1336 {
jpayne@69 1337 n = i - ct;
jpayne@69 1338 extending = TRUE;
jpayne@69 1339 }
jpayne@69 1340 }
jpayne@69 1341
jpayne@69 1342 shorter = Min_int (m, n);
jpayne@69 1343 for (Row = 0; Row < shorter && A [Row] == T [Row]; Row ++)
jpayne@69 1344 ;
jpayne@69 1345
jpayne@69 1346 Edit_Array [0] [0] = Row;
jpayne@69 1347
jpayne@69 1348 if (Row == m && Row == n) // Exact match
jpayne@69 1349 {
jpayne@69 1350 (* Delta_Len) = 0;
jpayne@69 1351 (* A_End) = (* T_End) = Row;
jpayne@69 1352 (* Match_To_End) = ! extending;
jpayne@69 1353 return 0;
jpayne@69 1354 }
jpayne@69 1355
jpayne@69 1356 Left = Right = 0;
jpayne@69 1357 Max_Score = Row * Branch_Pt_Match_Value;
jpayne@69 1358 Max_Score_Len = Row;
jpayne@69 1359 Max_Score_Best_d = 0;
jpayne@69 1360 Max_Score_Best_e = 0;
jpayne@69 1361
jpayne@69 1362 for (e = 1; e <= Error_Limit; e ++)
jpayne@69 1363 {
jpayne@69 1364 int cutoff;
jpayne@69 1365
jpayne@69 1366 Left = Max_int (Left - 1, -e);
jpayne@69 1367 Right = Min_int (Right + 1, e);
jpayne@69 1368 Edit_Array [e - 1] [Left] = -2;
jpayne@69 1369 Edit_Array [e - 1] [Left - 1] = -2;
jpayne@69 1370 Edit_Array [e - 1] [Right] = -2;
jpayne@69 1371 Edit_Array [e - 1] [Right + 1] = -2;
jpayne@69 1372
jpayne@69 1373 for (d = Left; d <= Right; d ++)
jpayne@69 1374 {
jpayne@69 1375 Row = 1 + Edit_Array [e - 1] [d];
jpayne@69 1376 if ((j = Edit_Array [e - 1] [d - 1]) > Row)
jpayne@69 1377 Row = j;
jpayne@69 1378 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
jpayne@69 1379 Row = j;
jpayne@69 1380 while (Row < m && Row + d < n
jpayne@69 1381 && A [Row] == T [Row + d])
jpayne@69 1382 Row ++;
jpayne@69 1383
jpayne@69 1384 Edit_Array [e] [d] = Row;
jpayne@69 1385
jpayne@69 1386 if (Row == m && Row + d == n)
jpayne@69 1387 {
jpayne@69 1388 if (extending)
jpayne@69 1389 {
jpayne@69 1390 for (j = Left; j <= d; j ++)
jpayne@69 1391 if (Edit_Array [e] [j] > Longest)
jpayne@69 1392 {
jpayne@69 1393 Best_d = j;
jpayne@69 1394 Longest = Edit_Array [e] [j];
jpayne@69 1395 }
jpayne@69 1396 Score = Longest * Branch_Pt_Match_Value - e;
jpayne@69 1397 if (Score < Max_Score)
jpayne@69 1398 {
jpayne@69 1399 (* A_End) = Max_Score_Len;
jpayne@69 1400 (* T_End) = Max_Score_Len + Max_Score_Best_d;
jpayne@69 1401 Set_Deltas (Delta, Delta_Len, Max_Score_Len,
jpayne@69 1402 Max_Score_Best_d, Max_Score_Best_e);
jpayne@69 1403 (* Match_To_End) = FALSE;
jpayne@69 1404 return Max_Score_Best_e;
jpayne@69 1405 }
jpayne@69 1406 }
jpayne@69 1407
jpayne@69 1408 (* A_End) = Row; // One past last align position
jpayne@69 1409 (* T_End) = Row + d;
jpayne@69 1410 Set_Deltas (Delta, Delta_Len, Row, d, e);
jpayne@69 1411 (* Match_To_End) = ! extending;
jpayne@69 1412 return e;
jpayne@69 1413 }
jpayne@69 1414 }
jpayne@69 1415
jpayne@69 1416 cutoff = Longest - GIVE_UP_LEN;
jpayne@69 1417 while (Left <= Right && Edit_Array [e] [Left] < cutoff)
jpayne@69 1418 Left ++;
jpayne@69 1419 if (Left > Right)
jpayne@69 1420 break;
jpayne@69 1421 while (Right > Left && Edit_Array [e] [Right] < cutoff)
jpayne@69 1422 Right --;
jpayne@69 1423
jpayne@69 1424 assert (Left <= Right);
jpayne@69 1425
jpayne@69 1426 for (d = Left; d <= Right; d ++)
jpayne@69 1427 if (Edit_Array [e] [d] > Longest)
jpayne@69 1428 {
jpayne@69 1429 Best_d = d;
jpayne@69 1430 Best_e = e;
jpayne@69 1431 Longest = Edit_Array [e] [d];
jpayne@69 1432 }
jpayne@69 1433
jpayne@69 1434 Score = Longest * Branch_Pt_Match_Value - e;
jpayne@69 1435 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
jpayne@69 1436 if (Score > Max_Score)
jpayne@69 1437 {
jpayne@69 1438 Max_Score = Score;
jpayne@69 1439 Max_Score_Len = Longest;
jpayne@69 1440 Max_Score_Best_d = Best_d;
jpayne@69 1441 Max_Score_Best_e = Best_e;
jpayne@69 1442 }
jpayne@69 1443
jpayne@69 1444 if (Longest - Max_Score_Len >= GIVE_UP_LEN)
jpayne@69 1445 break;
jpayne@69 1446 }
jpayne@69 1447
jpayne@69 1448 (* A_End) = Max_Score_Len;
jpayne@69 1449 (* T_End) = Max_Score_Len + Max_Score_Best_d;
jpayne@69 1450 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
jpayne@69 1451 (* Match_To_End) = FALSE;
jpayne@69 1452
jpayne@69 1453 return Max_Score_Best_e;
jpayne@69 1454 }
jpayne@69 1455
jpayne@69 1456
jpayne@69 1457
jpayne@69 1458 int Read_String
jpayne@69 1459 (FILE * fp, char * * T, long int * Size, char header [])
jpayne@69 1460
jpayne@69 1461 /* Read next string from fp (assuming FASTA format) into (* T) [1 ..]
jpayne@69 1462 * which has Size characters. Allocate extra memory if needed
jpayne@69 1463 * and adjust Size accordingly. Return TRUE if successful, FALSE
jpayne@69 1464 * otherwise (e.g., EOF). Set header to the contents of the FASTA
jpayne@69 1465 * header line. */
jpayne@69 1466
jpayne@69 1467 {
jpayne@69 1468 char Line [MAX_LINE];
jpayne@69 1469 long int Len;
jpayne@69 1470 int Ch, Ct;
jpayne@69 1471
jpayne@69 1472 if (feof (fp))
jpayne@69 1473 return FALSE;
jpayne@69 1474
jpayne@69 1475 while ((Ch = fgetc (fp)) != EOF && Ch != '>')
jpayne@69 1476 ;
jpayne@69 1477
jpayne@69 1478 if (Ch != '>')
jpayne@69 1479 return FALSE;
jpayne@69 1480
jpayne@69 1481 fgets (Line, MAX_LINE, fp);
jpayne@69 1482 Len = strlen (Line);
jpayne@69 1483 assert (Len > 0 && Line [Len - 1] == '\n');
jpayne@69 1484 Line [Len - 1] = '\0';
jpayne@69 1485 strcpy (header, Line);
jpayne@69 1486
jpayne@69 1487
jpayne@69 1488 Ct = 0;
jpayne@69 1489 Len = 0;
jpayne@69 1490 while ((Ch = fgetc (fp)) != EOF && Ch != '>')
jpayne@69 1491 {
jpayne@69 1492 if (isspace (Ch))
jpayne@69 1493 continue;
jpayne@69 1494
jpayne@69 1495 Ct ++;
jpayne@69 1496
jpayne@69 1497 if (Ct >= (* Size) - 1)
jpayne@69 1498 {
jpayne@69 1499 (* Size) = (long int) ((* Size) * EXPANSION_FACTOR);
jpayne@69 1500 (* T) = (char *) Safe_realloc ((* T), (* Size));
jpayne@69 1501 }
jpayne@69 1502 Ch = tolower (Ch);
jpayne@69 1503 if (! isalpha (Ch))
jpayne@69 1504 {
jpayne@69 1505 fprintf (stderr, "Unexpected character `%c\' in string %s\n",
jpayne@69 1506 Ch, header);
jpayne@69 1507 Ch = 'x';
jpayne@69 1508 }
jpayne@69 1509 (* T) [++ Len] = Ch;
jpayne@69 1510 }
jpayne@69 1511
jpayne@69 1512 (* T) [Len + 1] = '\0';
jpayne@69 1513 if (Ch == '>')
jpayne@69 1514 ungetc (Ch, fp);
jpayne@69 1515
jpayne@69 1516 return TRUE;
jpayne@69 1517 }
jpayne@69 1518
jpayne@69 1519
jpayne@69 1520
jpayne@69 1521 void Rev_Complement
jpayne@69 1522 (char * s)
jpayne@69 1523
jpayne@69 1524 /* Set string s to its DNA reverse complement. */
jpayne@69 1525
jpayne@69 1526 {
jpayne@69 1527 char ch;
jpayne@69 1528 int i, j, len;
jpayne@69 1529
jpayne@69 1530 len = strlen (s);
jpayne@69 1531
jpayne@69 1532 for (i = 0, j = len - 1; i < j; i ++, j --)
jpayne@69 1533 {
jpayne@69 1534 ch = Complement (s [i]);
jpayne@69 1535 s [i] = Complement (s [j]);
jpayne@69 1536 s [j] = ch;
jpayne@69 1537 }
jpayne@69 1538
jpayne@69 1539 if (i == j)
jpayne@69 1540 s [i] = Complement (s [i]);
jpayne@69 1541
jpayne@69 1542 return;
jpayne@69 1543 }
jpayne@69 1544
jpayne@69 1545
jpayne@69 1546
jpayne@69 1547 void Rev_Display_Alignment
jpayne@69 1548 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
jpayne@69 1549
jpayne@69 1550 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
jpayne@69 1551 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
jpayne@69 1552 // in the reverse direction.
jpayne@69 1553
jpayne@69 1554 {
jpayne@69 1555 int i, j, k, m, top_len, bottom_len;
jpayne@69 1556 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
jpayne@69 1557 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
jpayne@69 1558
jpayne@69 1559 i = j = top_len = bottom_len = 0;
jpayne@69 1560 for (k = 0; k < delta_ct; k ++)
jpayne@69 1561 {
jpayne@69 1562 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 1563 {
jpayne@69 1564 top [top_len ++] = a [- i ++];
jpayne@69 1565 j ++;
jpayne@69 1566 }
jpayne@69 1567 if (delta [k] < 0)
jpayne@69 1568 {
jpayne@69 1569 top [top_len ++] = '-';
jpayne@69 1570 j ++;
jpayne@69 1571 }
jpayne@69 1572 else
jpayne@69 1573 {
jpayne@69 1574 top [top_len ++] = a [- i ++];
jpayne@69 1575 }
jpayne@69 1576 }
jpayne@69 1577 while (i < a_len && j < b_len)
jpayne@69 1578 {
jpayne@69 1579 top [top_len ++] = a [- i ++];
jpayne@69 1580 j ++;
jpayne@69 1581 }
jpayne@69 1582 top [top_len] = '\0';
jpayne@69 1583
jpayne@69 1584
jpayne@69 1585 i = j = 0;
jpayne@69 1586 for (k = 0; k < delta_ct; k ++)
jpayne@69 1587 {
jpayne@69 1588 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 1589 {
jpayne@69 1590 bottom [bottom_len ++] = b [- j ++];
jpayne@69 1591 i ++;
jpayne@69 1592 }
jpayne@69 1593 if (delta [k] > 0)
jpayne@69 1594 {
jpayne@69 1595 bottom [bottom_len ++] = '-';
jpayne@69 1596 i ++;
jpayne@69 1597 }
jpayne@69 1598 else
jpayne@69 1599 {
jpayne@69 1600 bottom [bottom_len ++] = b [- j ++];
jpayne@69 1601 }
jpayne@69 1602 }
jpayne@69 1603 while (j < b_len && i < a_len)
jpayne@69 1604 {
jpayne@69 1605 bottom [bottom_len ++] = b [- j ++];
jpayne@69 1606 i ++;
jpayne@69 1607 }
jpayne@69 1608 bottom [bottom_len] = '\0';
jpayne@69 1609
jpayne@69 1610
jpayne@69 1611 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
jpayne@69 1612 {
jpayne@69 1613 putchar ('\n');
jpayne@69 1614 printf ("A: ");
jpayne@69 1615 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
jpayne@69 1616 putchar (top [i + j]);
jpayne@69 1617 putchar ('\n');
jpayne@69 1618 printf ("B: ");
jpayne@69 1619 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
jpayne@69 1620 putchar (bottom [i + j]);
jpayne@69 1621 putchar ('\n');
jpayne@69 1622 printf (" ");
jpayne@69 1623 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
jpayne@69 1624 j ++)
jpayne@69 1625 if (top [i + j] != ' ' && bottom [i + j] != ' '
jpayne@69 1626 && top [i + j] != bottom [i + j])
jpayne@69 1627 putchar ('^');
jpayne@69 1628 else
jpayne@69 1629 putchar (' ');
jpayne@69 1630 putchar ('\n');
jpayne@69 1631 }
jpayne@69 1632
jpayne@69 1633 return;
jpayne@69 1634 }
jpayne@69 1635
jpayne@69 1636
jpayne@69 1637
jpayne@69 1638 int Rev_Prefix_Edit_Dist
jpayne@69 1639 (char A [], int m, char T [], int n, int Error_Limit,
jpayne@69 1640 int * A_End, int * T_End, int * Match_To_End,
jpayne@69 1641 int Delta [MAX_ERRORS], int * Delta_Len, int extending)
jpayne@69 1642
jpayne@69 1643 // Return the minimum number of changes (inserts, deletes, replacements)
jpayne@69 1644 // needed to match string A [0 .. -(m-1)] with a prefix of string
jpayne@69 1645 // T [0 .. -(n-1)] if it's not more than Error_Limit .
jpayne@69 1646 // Note: Match is reverse direction, right to left.
jpayne@69 1647 // Put delta description of alignment in Delta and set
jpayne@69 1648 // (* Delta_Len) to the number of entries there if it's a complete
jpayne@69 1649 // match.
jpayne@69 1650 // Set A_End and T_End to the rightmost positions where the
jpayne@69 1651 // alignment ended in A and T , respectively.
jpayne@69 1652 // Set Match_To_End true if the match extended to the end
jpayne@69 1653 // of at least one string; otherwise, set it false to indicate
jpayne@69 1654 // a branch point.
jpayne@69 1655 // extending indicates whether the match is for a fixed region (FALSE)
jpayne@69 1656 // or is extending off the end of a match as far as possible (TRUE)
jpayne@69 1657
jpayne@69 1658 {
jpayne@69 1659 double Score, Max_Score;
jpayne@69 1660 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
jpayne@69 1661 int Best_d, Best_e, Longest, Row;
jpayne@69 1662 int Left, Right;
jpayne@69 1663 int d, e, i, j, shorter;
jpayne@69 1664
jpayne@69 1665 // assert (m <= n);
jpayne@69 1666 Best_d = Best_e = Longest = 0;
jpayne@69 1667 (* Delta_Len) = 0;
jpayne@69 1668
jpayne@69 1669 if (Consec_Non_ACGT > 0)
jpayne@69 1670 {
jpayne@69 1671 int ct;
jpayne@69 1672
jpayne@69 1673 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++)
jpayne@69 1674 {
jpayne@69 1675 switch (A [- i])
jpayne@69 1676 {
jpayne@69 1677 case 'a' :
jpayne@69 1678 case 'c' :
jpayne@69 1679 case 'g' :
jpayne@69 1680 case 't' :
jpayne@69 1681 ct = 0;
jpayne@69 1682 break;
jpayne@69 1683 default :
jpayne@69 1684 ct ++;
jpayne@69 1685 }
jpayne@69 1686 }
jpayne@69 1687 if (ct >= Consec_Non_ACGT)
jpayne@69 1688 {
jpayne@69 1689 m = i - ct;
jpayne@69 1690 extending = TRUE;
jpayne@69 1691 }
jpayne@69 1692
jpayne@69 1693 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++)
jpayne@69 1694 {
jpayne@69 1695 switch (T [- i])
jpayne@69 1696 {
jpayne@69 1697 case 'a' :
jpayne@69 1698 case 'c' :
jpayne@69 1699 case 'g' :
jpayne@69 1700 case 't' :
jpayne@69 1701 ct = 0;
jpayne@69 1702 break;
jpayne@69 1703 default :
jpayne@69 1704 ct ++;
jpayne@69 1705 }
jpayne@69 1706 }
jpayne@69 1707 if (ct >= Consec_Non_ACGT)
jpayne@69 1708 {
jpayne@69 1709 n = i - ct;
jpayne@69 1710 extending = TRUE;
jpayne@69 1711 }
jpayne@69 1712 }
jpayne@69 1713
jpayne@69 1714 shorter = Min_int (m, n);
jpayne@69 1715 for (Row = 0; Row < shorter && A [- Row] == T [- Row]; Row ++)
jpayne@69 1716 ;
jpayne@69 1717
jpayne@69 1718 Edit_Array [0] [0] = Row;
jpayne@69 1719
jpayne@69 1720 if (Row == m && Row == n) // Exact match
jpayne@69 1721 {
jpayne@69 1722 (* Delta_Len) = 0;
jpayne@69 1723 (* A_End) = (* T_End) = Row;
jpayne@69 1724 (* Match_To_End) = ! extending;
jpayne@69 1725 return 0;
jpayne@69 1726 }
jpayne@69 1727
jpayne@69 1728 Left = Right = 0;
jpayne@69 1729 Max_Score = Row * Branch_Pt_Match_Value;
jpayne@69 1730 Max_Score_Len = Row;
jpayne@69 1731 Max_Score_Best_d = 0;
jpayne@69 1732 Max_Score_Best_e = 0;
jpayne@69 1733
jpayne@69 1734 for (e = 1; e <= Error_Limit; e ++)
jpayne@69 1735 {
jpayne@69 1736 int cutoff;
jpayne@69 1737
jpayne@69 1738 Left = Max_int (Left - 1, -e);
jpayne@69 1739 Right = Min_int (Right + 1, e);
jpayne@69 1740 Edit_Array [e - 1] [Left] = -2;
jpayne@69 1741 Edit_Array [e - 1] [Left - 1] = -2;
jpayne@69 1742 Edit_Array [e - 1] [Right] = -2;
jpayne@69 1743 Edit_Array [e - 1] [Right + 1] = -2;
jpayne@69 1744
jpayne@69 1745 for (d = Left; d <= Right; d ++)
jpayne@69 1746 {
jpayne@69 1747 Row = 1 + Edit_Array [e - 1] [d];
jpayne@69 1748 if ((j = Edit_Array [e - 1] [d - 1]) > Row)
jpayne@69 1749 Row = j;
jpayne@69 1750 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
jpayne@69 1751 Row = j;
jpayne@69 1752 while (Row < m && Row + d < n
jpayne@69 1753 && A [- Row] == T [- Row - d])
jpayne@69 1754 Row ++;
jpayne@69 1755
jpayne@69 1756 Edit_Array [e] [d] = Row;
jpayne@69 1757
jpayne@69 1758 if (Row == m && Row + d == n)
jpayne@69 1759 {
jpayne@69 1760 if (extending)
jpayne@69 1761 {
jpayne@69 1762 for (j = Left; j <= d; j ++)
jpayne@69 1763 if (Edit_Array [e] [j] > Longest)
jpayne@69 1764 {
jpayne@69 1765 Best_d = j;
jpayne@69 1766 Longest = Edit_Array [e] [j];
jpayne@69 1767 }
jpayne@69 1768 Score = Longest * Branch_Pt_Match_Value - e;
jpayne@69 1769 if (Score < Max_Score)
jpayne@69 1770 {
jpayne@69 1771 (* A_End) = Max_Score_Len;
jpayne@69 1772 (* T_End) = Max_Score_Len + Max_Score_Best_d;
jpayne@69 1773 Set_Deltas (Delta, Delta_Len, Max_Score_Len,
jpayne@69 1774 Max_Score_Best_d, Max_Score_Best_e);
jpayne@69 1775 (* Match_To_End) = FALSE;
jpayne@69 1776 return Max_Score_Best_e;
jpayne@69 1777 }
jpayne@69 1778 }
jpayne@69 1779
jpayne@69 1780 (* A_End) = Row; // One past last align position
jpayne@69 1781 (* T_End) = Row + d;
jpayne@69 1782 Set_Deltas (Delta, Delta_Len, Row, d, e);
jpayne@69 1783 (* Match_To_End) = ! extending;
jpayne@69 1784 return e;
jpayne@69 1785 }
jpayne@69 1786 }
jpayne@69 1787
jpayne@69 1788 cutoff = Longest - GIVE_UP_LEN;
jpayne@69 1789 while (Left <= Right && Edit_Array [e] [Left] < cutoff)
jpayne@69 1790 Left ++;
jpayne@69 1791 if (Left > Right)
jpayne@69 1792 break;
jpayne@69 1793 while (Right > Left && Edit_Array [e] [Right] < cutoff)
jpayne@69 1794 Right --;
jpayne@69 1795 assert (Left <= Right);
jpayne@69 1796
jpayne@69 1797 for (d = Left; d <= Right; d ++)
jpayne@69 1798 if (Edit_Array [e] [d] > Longest)
jpayne@69 1799 {
jpayne@69 1800 Best_d = d;
jpayne@69 1801 Best_e = e;
jpayne@69 1802 Longest = Edit_Array [e] [d];
jpayne@69 1803 }
jpayne@69 1804
jpayne@69 1805 Score = Longest * Branch_Pt_Match_Value - e;
jpayne@69 1806 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
jpayne@69 1807 if (Score > Max_Score)
jpayne@69 1808 {
jpayne@69 1809 Max_Score = Score;
jpayne@69 1810 Max_Score_Len = Longest;
jpayne@69 1811 Max_Score_Best_d = Best_d;
jpayne@69 1812 Max_Score_Best_e = Best_e;
jpayne@69 1813 }
jpayne@69 1814
jpayne@69 1815
jpayne@69 1816 if (Longest - Max_Score_Len >= GIVE_UP_LEN)
jpayne@69 1817 break;
jpayne@69 1818 }
jpayne@69 1819
jpayne@69 1820 (* A_End) = Max_Score_Len;
jpayne@69 1821 (* T_End) = Max_Score_Len + Max_Score_Best_d;
jpayne@69 1822 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
jpayne@69 1823 (* Match_To_End) = FALSE;
jpayne@69 1824
jpayne@69 1825 return Max_Score_Best_e;
jpayne@69 1826 }
jpayne@69 1827
jpayne@69 1828
jpayne@69 1829
jpayne@69 1830 void Rev_Show_Diffs
jpayne@69 1831 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
jpayne@69 1832 long int a_ref, long int b_ref)
jpayne@69 1833
jpayne@69 1834 // Show (to stdout ) the differences
jpayne@69 1835 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
jpayne@69 1836 // encoded in delta [0 .. (delta_ct - 1)]
jpayne@69 1837 // in the reverse direction. a_ref is the position of the beginning
jpayne@69 1838 // of string a . b_ref is the offset to the start of
jpayne@69 1839 // string b .
jpayne@69 1840
jpayne@69 1841 {
jpayne@69 1842 int i, j, k, m;
jpayne@69 1843
jpayne@69 1844 i = j = 0;
jpayne@69 1845 for (k = 0; k < delta_ct; k ++)
jpayne@69 1846 {
jpayne@69 1847 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 1848 {
jpayne@69 1849 if (a [- i] != b [- j])
jpayne@69 1850 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
jpayne@69 1851 i ++;
jpayne@69 1852 j ++;
jpayne@69 1853 }
jpayne@69 1854 if (delta [k] < 0)
jpayne@69 1855 {
jpayne@69 1856 printf ("%8ld %8ld: ins %c\n", a_ref - i, b_ref - j, b [- j]);
jpayne@69 1857 j ++;
jpayne@69 1858 }
jpayne@69 1859 else
jpayne@69 1860 {
jpayne@69 1861 printf ("%8ld %8ld: del %c\n", a_ref - i, b_ref - j, a [- i]);
jpayne@69 1862 i ++;
jpayne@69 1863 }
jpayne@69 1864 }
jpayne@69 1865 while (i < a_len && j < b_len)
jpayne@69 1866 {
jpayne@69 1867 if (a [- i] != b [- j])
jpayne@69 1868 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
jpayne@69 1869 i ++;
jpayne@69 1870 j ++;
jpayne@69 1871 }
jpayne@69 1872
jpayne@69 1873 return;
jpayne@69 1874 }
jpayne@69 1875
jpayne@69 1876
jpayne@69 1877
jpayne@69 1878 void Set_Deltas
jpayne@69 1879 (int delta [], int * delta_len, int row, int d, int e)
jpayne@69 1880
jpayne@69 1881 // Set delta to values that show the alignment recorded in
jpayne@69 1882 // global Edit_Array . Set (* delta_len) to the number of
jpayne@69 1883 // entries set. row is the length of the match in the first
jpayne@69 1884 // string. d is the offset at the end of the match between
jpayne@69 1885 // the first and second string. e is the number of errors.
jpayne@69 1886
jpayne@69 1887 {
jpayne@69 1888 int delta_stack [MAX_ERRORS];
jpayne@69 1889 int from, last, max;
jpayne@69 1890 int i, j, k;
jpayne@69 1891
jpayne@69 1892 last = row;
jpayne@69 1893 (* delta_len) = 0;
jpayne@69 1894
jpayne@69 1895 for (k = e; k > 0; k --)
jpayne@69 1896 {
jpayne@69 1897 from = d;
jpayne@69 1898 max = 1 + Edit_Array [k - 1] [d];
jpayne@69 1899 if ((j = Edit_Array [k - 1] [d - 1]) > max)
jpayne@69 1900 {
jpayne@69 1901 from = d - 1;
jpayne@69 1902 max = j;
jpayne@69 1903 }
jpayne@69 1904 if ((j = 1 + Edit_Array [k - 1] [d + 1]) > max)
jpayne@69 1905 {
jpayne@69 1906 from = d + 1;
jpayne@69 1907 max = j;
jpayne@69 1908 }
jpayne@69 1909 if (from == d - 1)
jpayne@69 1910 {
jpayne@69 1911 delta_stack [(* delta_len) ++] = max - last - 1;
jpayne@69 1912 d --;
jpayne@69 1913 last = Edit_Array [k - 1] [from];
jpayne@69 1914 }
jpayne@69 1915 else if (from == d + 1)
jpayne@69 1916 {
jpayne@69 1917 delta_stack [(* delta_len) ++] = last - (max - 1);
jpayne@69 1918 d ++;
jpayne@69 1919 last = Edit_Array [k - 1] [from];
jpayne@69 1920 }
jpayne@69 1921 }
jpayne@69 1922 delta_stack [(* delta_len) ++] = last + 1;
jpayne@69 1923
jpayne@69 1924 k = 0;
jpayne@69 1925 for (i = (* delta_len) - 1; i > 0; i --)
jpayne@69 1926 delta [k ++]
jpayne@69 1927 = abs (delta_stack [i]) * Sign (delta_stack [i - 1]);
jpayne@69 1928 (* delta_len) --;
jpayne@69 1929
jpayne@69 1930 return;
jpayne@69 1931 }
jpayne@69 1932
jpayne@69 1933
jpayne@69 1934
jpayne@69 1935 static void Set_Left_Pad
jpayne@69 1936 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len)
jpayne@69 1937
jpayne@69 1938 // Set pad to a triple of strings, each of length pad_len ,
jpayne@69 1939 // representing a MUM of length mum_len ending at r_hi in Ref
jpayne@69 1940 // and q_hi in Query . Each string in pad must have been allocated
jpayne@69 1941 // at least pad_len + 1 bytes of memory. The purpose is to serve as
jpayne@69 1942 // the left-end padding for an alignment with this MUM as its left
jpayne@69 1943 // boundary.
jpayne@69 1944
jpayne@69 1945 {
jpayne@69 1946 int i;
jpayne@69 1947
jpayne@69 1948 for (i = 0; i < 3; i ++)
jpayne@69 1949 pad [i] [pad_len] = '\0';
jpayne@69 1950
jpayne@69 1951 for (i = 0; i < pad_len; i ++)
jpayne@69 1952 {
jpayne@69 1953 pad [0] [pad_len - i - 1] = r_hi - i > 0 ? Ref [r_hi - i] : '.';
jpayne@69 1954 pad [1] [pad_len - i - 1] = q_hi - i > 0 ? Query [q_hi - i] : '.';
jpayne@69 1955 pad [2] [pad_len - i - 1] = i < mum_len ? '=' : ' ';
jpayne@69 1956 }
jpayne@69 1957
jpayne@69 1958 return;
jpayne@69 1959 }
jpayne@69 1960
jpayne@69 1961
jpayne@69 1962
jpayne@69 1963 static void Set_Right_Pad
jpayne@69 1964 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len)
jpayne@69 1965
jpayne@69 1966 // Set pad to a triple of strings, each of length pad_len ,
jpayne@69 1967 // representing a MUM of length mum_len beginning at r_lo in Ref
jpayne@69 1968 // and q_lo in Query . Each string in pad must have been allocated
jpayne@69 1969 // at least pad_len + 1 bytes of memory. The purpose is to serve as
jpayne@69 1970 // the right-end padding for an alignment with this MUM as its right
jpayne@69 1971 // boundary.
jpayne@69 1972
jpayne@69 1973 {
jpayne@69 1974 int i;
jpayne@69 1975
jpayne@69 1976 for (i = 0; i < 3; i ++)
jpayne@69 1977 pad [i] [pad_len] = '\0';
jpayne@69 1978
jpayne@69 1979 for (i = 0; i < pad_len; i ++)
jpayne@69 1980 {
jpayne@69 1981 pad [0] [i] = r_lo + i <= Ref_Len ? Ref [r_lo + i] : '.';
jpayne@69 1982 pad [1] [i] = q_lo + i <= Query_Len ? Query [q_lo + i] : '.';
jpayne@69 1983 pad [2] [i] = i < mum_len ? '=' : ' ';
jpayne@69 1984 }
jpayne@69 1985
jpayne@69 1986 return;
jpayne@69 1987 }
jpayne@69 1988
jpayne@69 1989
jpayne@69 1990
jpayne@69 1991 void Show_Coverage
jpayne@69 1992 (Cover_t * list, char * filename, char * tag, char * seq)
jpayne@69 1993
jpayne@69 1994 // Print to stderr summary stats of regions in list .
jpayne@69 1995 // Send to filename a list of region info suitable for celagram .
jpayne@69 1996 // Use tag to print the headings. Regions refer to seq .
jpayne@69 1997
jpayne@69 1998 {
jpayne@69 1999 FILE * fp;
jpayne@69 2000 Cover_t * p;
jpayne@69 2001 int ct = 0;
jpayne@69 2002 long int i, prev_hi = 0, n_ct, len, seq_len;
jpayne@69 2003 long int cov_ns = 0, gap_ns = 0, total_ns;
jpayne@69 2004 double cov_total = 0.0, gap_total = 0.0;
jpayne@69 2005
jpayne@69 2006 fp = Local_File_Open (filename, "w");
jpayne@69 2007 fprintf (fp, "%-9s %9s %9s %8s %8s %6s\n",
jpayne@69 2008 "Region", "Start", "End", "Len", "N's", "%N's");
jpayne@69 2009
jpayne@69 2010 for (p = list; p != NULL; p = p -> next)
jpayne@69 2011 {
jpayne@69 2012 n_ct = 0;
jpayne@69 2013 for (i = prev_hi + 1; i < p -> lo; i ++)
jpayne@69 2014 if (strchr ("acgt", seq [i]) == NULL)
jpayne@69 2015 n_ct ++;
jpayne@69 2016 len = p -> lo - prev_hi - 1;
jpayne@69 2017 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
jpayne@69 2018 ct, prev_hi + 1, p -> lo - 1, len, n_ct,
jpayne@69 2019 len == 0 ? 0.0 : 100.0 * n_ct / len);
jpayne@69 2020 gap_total += len;
jpayne@69 2021 gap_ns += n_ct;
jpayne@69 2022
jpayne@69 2023 ct ++;
jpayne@69 2024
jpayne@69 2025 n_ct = 0;
jpayne@69 2026 for (i = p -> lo; i <= p -> hi; i ++)
jpayne@69 2027 if (strchr ("acgt", seq [i]) == NULL)
jpayne@69 2028 n_ct ++;
jpayne@69 2029 len = 1 + p -> hi - p -> lo;
jpayne@69 2030 fprintf (fp, "cov%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
jpayne@69 2031 ct, p -> lo, p -> hi, len, n_ct,
jpayne@69 2032 len == 0 ? 0.0 : 100.0 * n_ct / len);
jpayne@69 2033 cov_total += len;
jpayne@69 2034 cov_ns += n_ct;
jpayne@69 2035
jpayne@69 2036 prev_hi = p -> hi;
jpayne@69 2037 }
jpayne@69 2038
jpayne@69 2039 n_ct = 0;
jpayne@69 2040 seq_len = strlen (seq + 1);
jpayne@69 2041 for (i = prev_hi + 1; i <= seq_len; i ++)
jpayne@69 2042 if (strchr ("acgt", seq [i]) == NULL)
jpayne@69 2043 n_ct ++;
jpayne@69 2044 len = seq_len - prev_hi;
jpayne@69 2045 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
jpayne@69 2046 ct, prev_hi + 1, seq_len, len, n_ct,
jpayne@69 2047 len == 0 ? 0.0 : 100.0 * n_ct / len);
jpayne@69 2048 gap_total += len;
jpayne@69 2049 gap_ns += n_ct;
jpayne@69 2050
jpayne@69 2051 total_ns = cov_ns + gap_ns;
jpayne@69 2052
jpayne@69 2053 fclose (fp);
jpayne@69 2054
jpayne@69 2055 fprintf (stderr, "\n%s Sequence Coverage:\n", tag);
jpayne@69 2056 fprintf (stderr, " Sequence length = %ld\n", seq_len);
jpayne@69 2057 fprintf (stderr, " acgt's = %ld\n", seq_len - total_ns);
jpayne@69 2058 fprintf (stderr, " Non acgt's = %ld\n", total_ns);
jpayne@69 2059 fprintf (stderr, " Number of regions = %d\n", ct);
jpayne@69 2060 fprintf (stderr, " Matched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n",
jpayne@69 2061 cov_total,
jpayne@69 2062 seq_len == 0.0 ? 0.0 : 100.0 * cov_total / seq_len,
jpayne@69 2063 seq_len - total_ns == 0.0 ? 0.0 :
jpayne@69 2064 100.0 * (cov_total - cov_ns) / (seq_len - total_ns));
jpayne@69 2065 fprintf (stderr, " Avg match len = %.0f\n",
jpayne@69 2066 ct == 0 ? 0.0 : cov_total / ct);
jpayne@69 2067 fprintf (stderr, " N's in matches = %ld (%.1f%%)\n",
jpayne@69 2068 cov_ns,
jpayne@69 2069 cov_total == 0.0 ? 0.0 : 100.0 * cov_ns / cov_total);
jpayne@69 2070 fprintf (stderr, " Unmatched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n",
jpayne@69 2071 gap_total,
jpayne@69 2072 seq_len == 0.0 ? 0.0 : 100.0 * gap_total / seq_len,
jpayne@69 2073 seq_len - total_ns == 0.0 ? 0.0 :
jpayne@69 2074 100.0 * (gap_total - gap_ns) / (seq_len - total_ns));
jpayne@69 2075 fprintf (stderr, " Avg gap len = %.0f\n",
jpayne@69 2076 gap_total / (1.0 + ct));
jpayne@69 2077 fprintf (stderr, " N's in gaps = %ld (%.1f%%)\n",
jpayne@69 2078 gap_ns,
jpayne@69 2079 gap_total == 0.0 ? 0.0 : 100.0 * gap_ns / gap_total);
jpayne@69 2080
jpayne@69 2081 return;
jpayne@69 2082 }
jpayne@69 2083
jpayne@69 2084
jpayne@69 2085
jpayne@69 2086 void Show_Diffs
jpayne@69 2087 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
jpayne@69 2088 long int a_ref, long int b_ref)
jpayne@69 2089
jpayne@69 2090 // Show (to stdout ) the differences
jpayne@69 2091 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
jpayne@69 2092 // encoded in delta [0 .. (delta_ct - 1)] . a_ref is the offset to
jpayne@69 2093 // the start of string a . b_ref is the offset to the start of
jpayne@69 2094 // string b .
jpayne@69 2095
jpayne@69 2096 {
jpayne@69 2097 int i, j, k, m;
jpayne@69 2098
jpayne@69 2099 i = j = 0;
jpayne@69 2100 for (k = 0; k < delta_ct; k ++)
jpayne@69 2101 {
jpayne@69 2102 for (m = 1; m < abs (delta [k]); m ++)
jpayne@69 2103 {
jpayne@69 2104 if (a [i] != b [j])
jpayne@69 2105 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]);
jpayne@69 2106 i ++;
jpayne@69 2107 j ++;
jpayne@69 2108 }
jpayne@69 2109 if (delta [k] < 0)
jpayne@69 2110 {
jpayne@69 2111 printf ("%8ld %8ld: ins %c\n", a_ref + i - 1, b_ref + j, b [j]);
jpayne@69 2112 j ++;
jpayne@69 2113 }
jpayne@69 2114 else
jpayne@69 2115 {
jpayne@69 2116 printf ("%8ld %8ld: del %c\n", a_ref + i, b_ref + j - 1, a [i]);
jpayne@69 2117 i ++;
jpayne@69 2118 }
jpayne@69 2119 }
jpayne@69 2120 while (i < a_len && j < b_len)
jpayne@69 2121 {
jpayne@69 2122 if (a [i] != b [j])
jpayne@69 2123 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]);
jpayne@69 2124 i ++;
jpayne@69 2125 j ++;
jpayne@69 2126 }
jpayne@69 2127
jpayne@69 2128 return;
jpayne@69 2129 }
jpayne@69 2130
jpayne@69 2131
jpayne@69 2132
jpayne@69 2133 int Sign
jpayne@69 2134 (int a)
jpayne@69 2135
jpayne@69 2136 // Return the algebraic sign of a .
jpayne@69 2137
jpayne@69 2138 {
jpayne@69 2139 if (a > 0)
jpayne@69 2140 return 1;
jpayne@69 2141 else if (a < 0)
jpayne@69 2142 return -1;
jpayne@69 2143
jpayne@69 2144 return 0;
jpayne@69 2145 }
jpayne@69 2146
jpayne@69 2147
jpayne@69 2148
jpayne@69 2149 void Usage
jpayne@69 2150 (char * command)
jpayne@69 2151
jpayne@69 2152 // Print to stderr description of options and command line for
jpayne@69 2153 // this program. command is the command that was used to
jpayne@69 2154 // invoke it.
jpayne@69 2155
jpayne@69 2156 {
jpayne@69 2157 fprintf (stderr,
jpayne@69 2158 "USAGE: %s <RefSequence> <MatchSequences> <GapsFile>\n"
jpayne@69 2159 "\n"
jpayne@69 2160 "Combines MUMs in <GapsFile> by extending matches off\n"
jpayne@69 2161 "ends and between MUMs. <RefSequence> is a fasta file\n"
jpayne@69 2162 "of the reference sequence. <MatchSequences> is a\n"
jpayne@69 2163 "multi-fasta file of the sequences matched against the\n"
jpayne@69 2164 "reference\n"
jpayne@69 2165 "\n"
jpayne@69 2166 "Options:\n"
jpayne@69 2167 "-D Only output to stdout the difference positions\n"
jpayne@69 2168 " and characters\n"
jpayne@69 2169 "-n Allow matches only between nucleotides, i.e., ACGTs\n"
jpayne@69 2170 "-N num Break matches at <num> or more consecutive non-ACGTs \n"
jpayne@69 2171 "-q tag Used to label query match\n"
jpayne@69 2172 "-r tag Used to label reference match\n"
jpayne@69 2173 "-S Output all differences in strings\n"
jpayne@69 2174 "-t Label query matches with query fasta header\n"
jpayne@69 2175 "-v num Set verbose level for extra output\n"
jpayne@69 2176 "-W file Reset the default output filename witherrors.gaps\n"
jpayne@69 2177 "-x Don't output .cover files\n"
jpayne@69 2178 "-e Set error-rate cutoff to e (e.g. 0.02 is two percent)\n",
jpayne@69 2179
jpayne@69 2180 command);
jpayne@69 2181
jpayne@69 2182 return;
jpayne@69 2183 }