Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/combineMUMs.cc @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/combineMUMs.cc Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,2183 @@ +/************************************************* +* Module: CombineMUMs.cc +* Author: Art Delcher +* Description: +* Take clusters of MUMs (from mgaps program) and do alignments +* to combine them into matches. +* +*/ + + +#include "tigrinc.hh" +#include <string> +using namespace std; + + +// Constants + +#define BRANCH_PT_MATCH_VALUE 0.272 + // Value to add for a match in finding branch points + // 1.20 was the calculated value for 6% vs 35% error discrimination + // Converting to integers didn't make it faster +#define BRANCH_PT_ERROR_VALUE -0.728 + // Value to add for a mismatch in finding branch points + // -2.19 was the calculated value for 6% vs 35% error discrimination + // Converting to integers didn't make it faster +#define DEFAULT_ERROR_FILE_NAME "witherrors.gaps" + // Name of file produced that matches the input gaps file + // but with an extra column showing the number of errors in + // each gap +#define DEFAULT_PAD 10 + // Default number of matching bases to show at beginning + // and end of alignments +#define EDIT_DIST_PROB_BOUND 1e-4 + // Probability limit to "band" edit-distance calculation + // Determines NORMAL_DISTRIB_THOLD +#define ERRORS_FOR_FREE 1 + // The number of errors that are ignored in setting probability + // bound for terminating alignment extensions in edit distance + // calculations +#define EXPANSION_FACTOR 1.4 + // Factor by which to grow memory when realloc'ing +#define GIVE_UP_LEN 200 + // Stop alignment when go this many bases past max-score value +#define INITIAL_BUFFER_SIZE 10000 + // Initial number of bytes to use for sequence strings +#define MATCH_SCORE 1.0 + // Score to add for match in finding length of alignment +#define MAX_ERROR_RATE 0.06 + // The largest error allowed in overlaps +#define MAX_FRAG_LEN 1024 + // The longest fragment allowed +#define MAX_ERRORS 500 + // Most errors in any edit distance computation +#define MAX_EXTENSION 10000 + // Maximum extension match out from a match +#define MAX_HDR_LEN 10000 + // Longest allowable fasta header line +#define MAX_MEMORY_STORE 50000 + // The most fragments allowed in memory store +#define MIN_BRANCH_END_DIST 20 + // Branch points must be at least this many bases from the + // end of the fragment to be reported +#define MIN_BRANCH_TAIL_SLOPE 0.20 + // Branch point tails must fall off from the max by at least + // this rate +#define MIN_MATCH_LEN 40 + // Number of bases in match region in order to count it +#define MISMATCH_SCORE -3.0 + // Score to add for non-match in finding length of alignment +#define NORMAL_DISTRIB_THOLD 3.62 + // Determined by EDIT_DIST_PROB_BOUND +#define REALLY_VERBOSE 0 + // If 1 prints tons of stuff +#define VERBOSE 0 + // If 1 prints lots of stuff + + + +// Type definitions + +typedef struct s_Cover_t + { + long int lo, hi; + struct s_Cover_t * next; + } Cover_t; + + + +// Globals + +float Branch_Pt_Match_Value = BRANCH_PT_MATCH_VALUE; + // Score used for matches to locate branch points, i.e., + // where alignment score peaks +float Branch_Pt_Error_Value = BRANCH_PT_ERROR_VALUE; + // Score used for mismatches to locate branch points, i.e., +int Consec_Non_ACGT = 0; + // Stop alignments when encounter at least this many non-acgt characters. +int * Edit_Array [MAX_ERRORS]; + // Use for alignment calculation. Points into Edit_Space . +int Edit_Match_Limit [MAX_ERRORS] = {0}; + // This array [e] is the minimum value of Edit_Array [e] [d] + // to be worth pursuing in edit-distance computations between guides +int Edit_Space [(MAX_ERRORS + 4) * MAX_ERRORS]; + // Memory used by alignment calculation +int Error_Bound [MAX_FRAG_LEN + 1]; + // This array [i] is the maximum number of errors allowed + // in a match between sequences of length i , which is + // i * MAXERROR_RATE . +char * Error_File_Name = DEFAULT_ERROR_FILE_NAME; + // Name of file to write gaps listing with # errors in each gap +int Fill_Ct = 0; + // Number of non-acgt bases in ref sequence +char * Gaps_File_Path = NULL; + // Name of file produced by mgaps program +char * Match_File_Path = NULL; + // Name of multifasta file of sequences to compare against the reference + // sequence +int UserScoring = FALSE; + // If TRUE, then user specified a percent ID cutoff and scoring + // is adjusted to enforce this in extensions +int Nucleotides_Only = FALSE; + // If TRUE , then only acgt's can match +bool Only_Difference_Positions = false; + // If true , then output just positions of difference instead of + // alignments between exact matches +int Output_Cover_Files = TRUE; + // If TRUE , output files showing coverage of each genome. +double Percent_ID; + // Desired identity rate of matches to be found + // Expressed as a fraction, not really a percent +char * Query = NULL; + // The query sequence +long int Query_Len; + // The length of the query sequence +char * Query_Suffix = "Query"; + // Suffix for query tag +char * Ref = NULL; + // The reference sequence +char * Ref_File_Path = NULL; + // Name of (single) fasta file of reference sequence +long int Ref_Len; + // The length of the reference sequence +long int Ref_Size; + // The size of the reference sequence buffer +char * Ref_Suffix = "Ref"; + // Suffix for reference tag +int Show_Differences = FALSE; + // If TRUE then show differences in all alignments +int Tag_From_Fasta_Line = FALSE; + // If TRUE then use fasta tag from ref & query sequences as + // 1st & 2nd column, resp., on match line to identify matches +int Verbose = 0; + // Controls printing of extra debuggin information + + +bool Global_Debug_Flag = false; + + + +// Functions + +void Add_Coverage + (Cover_t * * list, long int lo, long int hi); +int Binomial_Bound + (int, double, int, double); +void Display_Alignment + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct); +void Display_Alignment_With_Pad + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, + char * left_pad [3], char * right_pad [3], int pad_len); +void Display_Difference_Positions + (char * a, int a_start, int a_len, char * b, int b_start, int b_len, + int b_incr, int delta [], int delta_ct, const char * a_tag, + const char * b_tag); +int Extend_Backward + (long int * ref_lo, long int * query_lo); +int Extend_Forward + (long int * ref_hi, long int * query_hi); +void Initialize_Globals + (void); +FILE * Local_File_Open + (const char * filename, const char * mode); +int Max_int + (int a, int b); +int Min_int + (int a, int b); +void Parse_Command_Line + (int argc, char * argv []); +int Prefix_Edit_Dist + (char A [], int m, char T [], int n, int Error_Limit, + int * A_End, int * T_End, int * Match_To_End, + int Delta [MAX_ERRORS], int * Delta_Len, int extending); +int Read_String + (FILE * fp, char * * T, long int * Size, char header []); +void Rev_Complement + (char * s); +void Rev_Display_Alignment + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct); +int Rev_Prefix_Edit_Dist + (char A [], int m, char T [], int n, int Error_Limit, + int * A_End, int * T_End, int * Match_To_End, + int Delta [MAX_ERRORS], int * Delta_Len, int extending); +void Rev_Show_Diffs + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, + long int a_ref, long int b_ref); +void Set_Deltas + (int delta [], int * delta_len, int row, int d, int e); +static void Set_Left_Pad + (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len); +static void Set_Right_Pad + (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len); +void Show_Coverage + (Cover_t * list, char * filename, char * tag, char * seq); +void Show_Diffs + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, + long int a_ref, long int b_ref); +int Sign + (int a); +void Usage + (char * command); + + + +int main + (int argc, char * argv []) + + { + FILE * fp, * match_fp, * error_fp; + char ref_header [MAX_HDR_LEN]; + char header [MAX_HDR_LEN], ref_tag [MAX_HDR_LEN], query_tag [MAX_HDR_LEN]; + char line [MAX_LINE], filename [MAX_LINE]; + char * left_pad [3], * right_pad [3]; + long int ref_pos, ref_lo, ref_hi; + long int query_pos, query_lo, query_hi, query_size; + long int match_len, prior_match_len = 0, max_len; + double error = 0.0; + long int total_errors = 0; + int match_ct = 0; + double match_total = 0.0; + int is_forward = FALSE; + int first_match = TRUE; + Cover_t * ref_cover_list = NULL; + Cover_t * query_cover_list = NULL; + char * p; + int i; + + Parse_Command_Line (argc, argv); + + if(UserScoring){ + // percentID = mismatch penalty / (match bonus + mismatch penalty) + // or, more compactly, p = mm / (m+mm) + // Since we require that m+mm = 1, m=1-mm + // and so we get the trivial mm = p + // + // This means that Branch_Pt_Error_Value = -p + // and then Branch_Pt_Match_Value = 1-p + // + // Make it so: + Branch_Pt_Error_Value = - Percent_ID; + Branch_Pt_Match_Value = 1.0 - Percent_ID; + } + + Initialize_Globals (); + + p = strrchr (Ref_File_Path, '/'); + if (p == NULL) + strcpy (ref_tag, Ref_File_Path); + else + strcpy (ref_tag, p + 1); + p = strrchr (ref_tag, '.'); + if (p != NULL) + (* p) = '\0'; + strcat (ref_tag, Ref_Suffix); + + p = strrchr (Match_File_Path, '/'); + if (p == NULL) + strcpy (query_tag, Match_File_Path); + else + strcpy (query_tag, p + 1); + p = strrchr (query_tag, '.'); + if (p != NULL) + (* p) = '\0'; + strcat (query_tag, Query_Suffix); + + fp = Local_File_Open (Ref_File_Path, "r"); + Ref_Size = 100000; + Ref = (char *) Safe_malloc (Ref_Size); + + assert (Read_String (fp, & Ref, & Ref_Size, ref_header)); + if (Tag_From_Fasta_Line) + strcpy (ref_tag, strtok (ref_header, " \t\n>")); + fclose (fp); + + Ref_Len = strlen (Ref + 1); + Ref = (char *) Safe_realloc (Ref, 1 + Ref_Len); + for (i = 1; i <= Ref_Len; i ++) + switch (Ref [i]) + { + case 'a' : + case 'c' : + case 'g' : + case 't' : + break; + default : + if (Nucleotides_Only) + Ref [i] = '2'; + Fill_Ct ++; + } + + for (i = 0; i < 3; i ++) + { + left_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1); + right_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1); + } + + fp = Local_File_Open (Gaps_File_Path, "r"); + match_fp = Local_File_Open (Match_File_Path, "r"); + query_size = 100000; + Query = (char *) Safe_malloc (query_size); + + error_fp = Local_File_Open (Error_File_Name, "w"); + + while (fgets (line, MAX_LINE, fp) != NULL) + { + int line_len = strlen (line); + + if (line [0] == '>') + { + if (! first_match) + { + total_errors += Extend_Forward (& ref_hi, & query_hi); + max_len = 1 + ref_hi - ref_lo; + if (1 + abs (query_hi - query_lo) > max_len) + max_len = 1 + abs (query_hi - query_lo); + error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); + if (! Only_Difference_Positions) + printf + ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", + ref_lo, ref_hi, + is_forward ? query_lo : 1 + Query_Len - query_lo, + is_forward ? query_hi : 1 + Query_Len - query_hi, + total_errors, max_len, 100.0 * error); + + match_ct ++; + match_total += 1 + ref_hi - ref_lo; + total_errors = 0; + Add_Coverage (& ref_cover_list, ref_lo, ref_hi); + if (is_forward) + Add_Coverage (& query_cover_list, query_lo, query_hi); + else + Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, + 1 + Query_Len - query_lo); + } + if (Tag_From_Fasta_Line) + { + char buff [MAX_LINE]; + char * p; + + strcpy (buff, line + 1); + p = strtok (buff, " >\t\n"); + if (strlen (p) > 0) + strcpy (query_tag, p); + else + { + fprintf (stderr, "No tag on line %s", line); + strcpy (query_tag, "???"); + } + } + is_forward = ! is_forward; + if (is_forward) + { + assert (Read_String (match_fp, & Query, & query_size, header)); + Query_Len = strlen (Query + 1); + if (Nucleotides_Only) + { + for (i = 1; i <= Query_Len; i ++) + switch (Query [i]) + { + case 'a' : + case 'c' : + case 'g' : + case 't' : + break; + default : + Query [i] = '1'; + } + } + } + else + Rev_Complement (Query + 1); + first_match = TRUE; + printf ("%s", line); + fprintf (error_fp, "%s", line); + } + else if (line [0] == '#') + { + if (! first_match) + { + total_errors += Extend_Forward (& ref_hi, & query_hi); + max_len = 1 + ref_hi - ref_lo; + if (1 + abs (query_hi - query_lo) > max_len) + max_len = 1 + abs (query_hi - query_lo); + error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); + if (! Only_Difference_Positions) + printf + ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", + ref_lo, ref_hi, + is_forward ? query_lo : 1 + Query_Len - query_lo, + is_forward ? query_hi : 1 + Query_Len - query_hi, + total_errors, max_len, 100.0 * error); + + match_ct ++; + match_total += 1 + ref_hi - ref_lo; + total_errors = 0; + Add_Coverage (& ref_cover_list, ref_lo, ref_hi); + if (is_forward) + Add_Coverage (& query_cover_list, query_lo, query_hi); + else + Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, + 1 + Query_Len - query_lo); + } + first_match = TRUE; + printf ("%s", line); + fprintf (error_fp, "%s", line); + } + else + { + assert (sscanf (line, "%ld %ld %ld", + & ref_pos, & query_pos, & match_len) == 3); + if (first_match) + { + ref_lo = ref_pos; + query_lo = query_pos; + total_errors += Extend_Backward (& ref_lo, & query_lo); + if (! Only_Difference_Positions) + printf ("%s", line); + if (line_len > 0) + line [line_len - 1] = '\0'; + fprintf (error_fp, "%s %7s\n", line, "-"); + } + else + { + int errors; + int a_len, b_len; + int a_end, b_end, match_to_end; + int delta [MAX_ERRORS], delta_len; + + a_len = ref_pos - ref_hi - 1; + b_len = query_pos - query_hi - 1; + + errors = Prefix_Edit_Dist + (Ref + ref_hi + 1, a_len, + Query + query_hi + 1, b_len, + MAX_ERRORS - 1, & a_end, & b_end, + & match_to_end, delta, & delta_len, FALSE); + if (Show_Differences) + Show_Diffs (Ref + ref_hi + 1, a_end, + Query + query_hi + 1, b_end, + delta, delta_len, ref_hi + 1, query_hi + 1); + + if (! Only_Difference_Positions) + printf ("%s", line); + + Set_Left_Pad (left_pad, ref_hi, query_hi, prior_match_len, + DEFAULT_PAD); + Set_Right_Pad (right_pad, ref_pos, query_pos, match_len, + DEFAULT_PAD); + + if (a_end == 0 && b_end == 0) + { + if (! Only_Difference_Positions) + printf (" No extension\n"); + + if (line_len > 0) + line [line_len - 1] = '\0'; + fprintf (error_fp, "%s %7s\n", line, "-"); + } + else if (Only_Difference_Positions) + { + int q_start, b_incr; + + if (is_forward) + { + q_start = query_hi + 1; + b_incr = 1; + } + else + { + q_start = Query_Len - query_hi; + b_incr = -1; + } + Display_Difference_Positions + (Ref + ref_hi + 1, ref_hi + 1, a_end, + Query + query_hi + 1, q_start, b_end, + b_incr, delta, delta_len, ref_tag, + query_tag); + } + else + { + printf (" Errors = %d\n", errors); + if (! Only_Difference_Positions) + Display_Alignment_With_Pad + (Ref + ref_hi + 1, a_end, + Query + query_hi + 1, b_end, + delta, delta_len, left_pad, right_pad, + DEFAULT_PAD); + if (line_len > 0) + line [line_len - 1] = '\0'; + fprintf (error_fp, "%s %7d\n", line, errors); + } + + total_errors += errors; + + if (! match_to_end) + { + ref_hi += a_end; + query_hi += b_end; + max_len = 1 + ref_hi - ref_lo; + if (1 + abs (query_hi - query_lo) > max_len) + max_len = 1 + abs (query_hi - query_lo); + error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); + if (! Only_Difference_Positions) + printf ( + "Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", + ref_lo, ref_hi, + is_forward ? query_lo : 1 + Query_Len - query_lo, + is_forward ? query_hi : 1 + Query_Len - query_hi, + total_errors, max_len, 100.0 * error); + + match_ct ++; + match_total += 1 + ref_hi - ref_lo; + total_errors = 0; + Add_Coverage (& ref_cover_list, ref_lo, ref_hi); + if (is_forward) + Add_Coverage (& query_cover_list, query_lo, query_hi); + else + Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, + 1 + Query_Len - query_lo); + ref_lo = ref_pos; + query_lo = query_pos; + total_errors += Extend_Backward (& ref_lo, & query_lo); + } + } + + ref_hi = ref_pos + match_len - 1; + query_hi = query_pos + match_len - 1; + prior_match_len = match_len; + first_match = FALSE; + } + } + + if (! first_match) + { + total_errors += Extend_Forward (& ref_hi, & query_hi); + max_len = 1 + ref_hi - ref_lo; + if (1 + abs (query_hi - query_lo) > max_len) + max_len = 1 + abs (query_hi - query_lo); + error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); + if (! Only_Difference_Positions) + printf + ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", + ref_lo, ref_hi, + is_forward ? query_lo : 1 + Query_Len - query_lo, + is_forward ? query_hi : 1 + Query_Len - query_hi, + total_errors, max_len, 100.0 * error); + + match_ct ++; + match_total += 1 + ref_hi - ref_lo; + Add_Coverage (& ref_cover_list, ref_lo, ref_hi); + if (is_forward) + Add_Coverage (& query_cover_list, query_lo, query_hi); + else + Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, + 1 + Query_Len - query_lo); + } + + fprintf (stderr, " Ref len = %ld\n", Ref_Len); + fprintf (stderr, " acgt's = %ld\n", Ref_Len - Fill_Ct); + fprintf (stderr, " Non acgt's = %d\n", Fill_Ct); + fprintf (stderr, " Number of matches = %d\n", match_ct); + fprintf (stderr, "Sum of match bases = %.0f\n", match_total); + fprintf (stderr, " Avg match bases = %.0f\n", + match_ct == 0 ? 0.0 : match_total / match_ct); + + fclose (error_fp); + + if (Output_Cover_Files) + { + strcpy (filename, ref_tag); + strcat (filename, ".cover"); + Show_Coverage (ref_cover_list, filename, ref_tag, Ref); + + strcpy (filename, query_tag); + strcat (filename, ".cover"); + Rev_Complement (Query + 1); + Show_Coverage (query_cover_list, filename, query_tag, Query); + } + + return 0; + } + + + +void Add_Coverage + (Cover_t * * list, long int lo, long int hi) + +// Add lo .. hi to list of regions covered in (* list) . +// Combine nodes when appropriate. + + { + Cover_t * new_node, * p, * prev = NULL; + + if ((* list) == NULL || hi + 1 < (* list) -> lo) + { + new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t)); + new_node -> lo = lo; + new_node -> hi = hi; + new_node -> next = (* list); + (* list) = new_node; + return; + } + + for (p = (* list); p != NULL && lo - 1 > p -> hi; p = p -> next) + prev = p; + + if (p == NULL || hi + 1 < p -> lo) + { // insert between or on end + assert (prev != NULL); + new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t)); + new_node -> lo = lo; + new_node -> hi = hi; + new_node -> next = p; + prev -> next = new_node; + return; + } + + if (lo < p -> lo) + p -> lo = lo; + while (p -> next != NULL && hi + 1 >= p -> next -> lo) + { + Cover_t * save; + + p -> hi = p -> next -> hi; + save = p -> next; + p -> next = save -> next; + free (save); + } + if (hi > p -> hi) + p -> hi = hi; + + return; + } + + + +int Binomial_Bound + (int e, double p, int Start, double Limit) + +// Return the smallest n >= Start s.t. +// prob [>= e errors in n binomial trials (p = error prob)] +// > Limit + + { + double Normal_Z, Mu_Power, Factorial, Poisson_Coeff; + double q, Sum, P_Power, Q_Power, X; + int k, n, Bin_Coeff, Ct; + + q = 1.0 - p; + if (Start < e) + Start = e; + + for (n = Start; n < MAX_FRAG_LEN; n ++) + { + if (n <= 35) + { + Sum = 0.0; + Bin_Coeff = 1; + Ct = 0; + P_Power = 1.0; + Q_Power = pow (q, (double) n); + + for (k = 0; k < e && 1.0 - Sum > Limit; k ++) + { + X = Bin_Coeff * P_Power * Q_Power; + Sum += X; + Bin_Coeff *= n - Ct; + Bin_Coeff /= ++ Ct; + P_Power *= p; + Q_Power /= q; + } + if (1.0 - Sum > Limit) + return n; + } + else + { + Normal_Z = (e - 0.5 - n * p) / sqrt ((double)(n * p * q)); + if (Normal_Z <= NORMAL_DISTRIB_THOLD) + return n; + Sum = 0.0; + Mu_Power = 1.0; + Factorial = 1.0; + Poisson_Coeff = exp (- (double) n * p); + for (k = 0; k < e; k ++) + { + Sum += Mu_Power * Poisson_Coeff / Factorial; + Mu_Power *= n * p; + Factorial *= k + 1; + } + if (1.0 - Sum > Limit) + return n; + } + } + + return MAX_FRAG_LEN; + } + + + +#define DISPLAY_WIDTH 60 + +void Display_Alignment + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct) + +// Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] +// between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] . + + { + int i, j, k, m, top_len, bottom_len; + char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; + char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; + + i = j = top_len = bottom_len = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + top [top_len ++] = a [i ++]; + j ++; + } + if (delta [k] < 0) + { + top [top_len ++] = '-'; + j ++; + } + else + { + top [top_len ++] = a [i ++]; + } + } + while (i < a_len && j < b_len) + { + top [top_len ++] = a [i ++]; + j ++; + } + top [top_len] = '\0'; + + + i = j = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + bottom [bottom_len ++] = b [j ++]; + i ++; + } + if (delta [k] > 0) + { + bottom [bottom_len ++] = '-'; + i ++; + } + else + { + bottom [bottom_len ++] = b [j ++]; + } + } + while (j < b_len && i < a_len) + { + bottom [bottom_len ++] = b [j ++]; + i ++; + } + bottom [bottom_len] = '\0'; + + + for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) + { + printf ("A: "); + for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) + putchar (top [i + j]); + putchar ('\n'); + printf ("B: "); + for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) + putchar (bottom [i + j]); + putchar ('\n'); + printf (" "); + for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; + j ++) + if (top [i + j] != ' ' && bottom [i + j] != ' ' + && top [i + j] != bottom [i + j]) + putchar ('^'); + else + putchar (' '); + putchar ('\n'); + } + + return; + } + + + +void Display_Alignment_With_Pad + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, + char * left_pad [3], char * right_pad [3], int pad_len) + +// Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] +// between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] . +// Attach pad strings in left_pad to the beginning of the alignment +// and the strings in right_pad to the end of the alignment. +// pad_len is the width of the pad strings + + { + int i, j, k, m, top_len, bottom_len; +#if 0 + char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; + char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; +#else + string top, bottom; +#endif + + for (k = 0; k < pad_len; k ++) + { + top . push_back (left_pad [0] [k]); + bottom . push_back (left_pad [1] [k]); + } + + i = j = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + top . push_back (a [i ++]); + j ++; + } + if (delta [k] < 0) + { + top . push_back ('-'); + j ++; + } + else + { + top . push_back (a [i ++]); + } + } + while (i < a_len && j < b_len) + { + top . push_back (a [i ++]); + j ++; + } + + i = j = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + bottom . push_back (b [j ++]); + i ++; + } + if (delta [k] > 0) + { + bottom . push_back ('-'); + i ++; + } + else + { + bottom . push_back (b [j ++]); + } + } + while (j < b_len && i < a_len) + { + bottom . push_back (b [j ++]); + i ++; + } + + for (k = 0; k < pad_len; k ++) + { + top . push_back (right_pad [0] [k]); + bottom . push_back (right_pad [1] [k]); + } + + top_len = top . length (); + bottom_len = bottom . length (); + + if (top_len != bottom_len) + { + fprintf (stderr, "ERROR: Alignment length mismatch top = %d bottom = %d\n", + top_len, bottom_len); + exit (EXIT_FAILURE); + } + + for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) + { + printf ("A: "); + for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) + putchar (top [i + j]); + putchar ('\n'); + printf ("B: "); + for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) + putchar (bottom [i + j]); + putchar ('\n'); + printf (" "); + for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; + j ++) + if (i + j < pad_len) + putchar (left_pad [2] [i + j]); + else if (top_len - i - j <= pad_len) + putchar (right_pad [2] [i + j - (top_len - pad_len)]); + else if (top [i + j] != ' ' && bottom [i + j] != ' ' + && top [i + j] != bottom [i + j]) + putchar ('^'); + else + putchar (' '); + putchar ('\n'); + } + + return; + } + + + +void Display_Difference_Positions + (char * a, int a_start, int a_len, char * b, int b_start, int b_len, + int b_incr, int delta [], int delta_ct, const char * a_tag, + const char * b_tag) + +// Show (to stdout ) the positions indicated in delta [0 .. (delta_ct - 1)] +// between strings a [0 .. (a_len - 1)] and +// b [0 .. (b_len - 1))] . The positions of the starts of the strings +// are at a_start and b_start , respectively, and b_incr indicates +// if the b postions increase or decrease. b_incr must be 1 or -1. +// Use a_tag and b_tag to label the positions. + + { + int i, j, k, m; + + assert (b_incr == 1 || b_incr == -1); + + i = j = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + if (a [i] != b [j]) + printf ("%-15s %-15s %8d %8d %c %c\n", + a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]); + i ++; + j ++; + } + if (delta [k] < 0) + { + printf ("%-15s %-15s %8d %8d %c %c\n", + a_tag, b_tag, a_start + i, b_start + j * b_incr, '-', b [j]); + j ++; + } + else + { + if (b_incr > 0) + printf ("%-15s %-15s %8d %8d %c %c\n", + a_tag, b_tag, a_start + i, b_start + j, a [i], '-'); + else + printf ("%-15s %-15s %8d %8d %c %c\n", + a_tag, b_tag, a_start + i, b_start - j + 1, a [i], '-'); + i ++; + } + } + while (i < a_len && j < b_len) + { + if (a [i] != b [j]) + printf ("%-15s %-15s %8d %8d %c %c\n", + a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]); + i ++; + j ++; + } + + return; + } + + + +int Extend_Backward + (long int * ref_lo, long int * query_lo) + +// Do edit distance off end of match beginning at (* ref_lo) and +// (* query_lo) in the reference and query sequences, resp., +// in the reverse direction. +// Return the number of errors in the extension and set (* ref_hi) +// and (* query_hi) to the end of the extension. + + { + int ct = 0, errors, sum = 0; + int a_len, b_len; + int a_end, b_end, match_to_end; + int delta [MAX_ERRORS], delta_len; + + do + { + a_len = (* ref_lo) - 1; + if (a_len > MAX_EXTENSION) + a_len = MAX_EXTENSION; + b_len = (* query_lo) - 1; + if (b_len > MAX_EXTENSION) + b_len = MAX_EXTENSION; + + errors = Rev_Prefix_Edit_Dist + (Ref + (* ref_lo) - 1, a_len, + Query + (* query_lo) - 1, b_len, + MAX_ERRORS - 1, & a_end, & b_end, + & match_to_end, delta, & delta_len, TRUE); + if (Show_Differences) + Rev_Show_Diffs + (Ref + (* ref_lo - 1), a_end, + Query + (* query_lo - 1), b_end, + delta, delta_len, (* ref_lo) - 1, (* query_lo) - 1); + if (Verbose > 0) + { + printf ("revextend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n", + ++ ct, (* ref_lo) - 1, (* query_lo) - 1, a_len, b_len, + errors, match_to_end ? 'T' : 'F', a_end, b_end); + Rev_Display_Alignment + (Ref + (* ref_lo) - 1, a_end, Query + (* query_lo) - 1, b_end, + delta, delta_len); + } + + (* ref_lo) -= a_end; + (* query_lo) -= b_end; + sum += errors; + } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION); + + return sum; + } + + + +int Extend_Forward + (long int * ref_hi, long int * query_hi) + +// Do edit distance off end of match ending at (* ref_hi) and +// (* query_hi) in the reference and query sequences, resp. +// Return the number of errors in the extension and set (* ref_hi) +// and (* query_hi) to the end of the extension. + + { + int ct = 0, errors, sum = 0; + int a_end, b_end, match_to_end; + int a_len, b_len; + int delta [MAX_ERRORS], delta_len; + + do + { + a_len = Ref_Len - (* ref_hi); + if (a_len > MAX_EXTENSION) + a_len = MAX_EXTENSION; + b_len = Query_Len - (* query_hi); + if (b_len > MAX_EXTENSION) + b_len = MAX_EXTENSION; + + errors = Prefix_Edit_Dist + (Ref + (* ref_hi) + 1, a_len, + Query + (* query_hi) + 1, b_len, + MAX_ERRORS - 1, & a_end, & b_end, + & match_to_end, delta, & delta_len, TRUE); + if (Show_Differences) + Show_Diffs (Ref + (* ref_hi) + 1, a_end, + Query + (* query_hi) + 1, b_end, + delta, delta_len, (* ref_hi) + 1, (* query_hi) + 1); + if (Verbose > 0) + { + printf ("extend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n", + ++ ct, (* ref_hi) + 1, (* query_hi) + 1, a_len, b_len, + errors, match_to_end ? 'T' : 'F', a_end, b_end); + Display_Alignment (Ref + (* ref_hi) + 1, a_end, + Query + (* query_hi) + 1, b_end, + delta, delta_len); + } + + (* ref_hi) += a_end; + (* query_hi) += b_end; + sum += errors; + } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION); + + return sum; + } + + +void Initialize_Globals + (void) + +// Initialize global variables used in this program + + { + int i, offset, del; + int e, start; + + offset = 2; + del = 6; + for (i = 0; i < MAX_ERRORS; i ++) + { + Edit_Array [i] = Edit_Space + offset; + offset += del; + del += 2; + } + + + for (i = 0; i <= ERRORS_FOR_FREE; i ++) + Edit_Match_Limit [i] = 0; + + start = 1; + for (e = ERRORS_FOR_FREE + 1; e < MAX_ERRORS; e ++) + { + start = Binomial_Bound (e - ERRORS_FOR_FREE, MAX_ERROR_RATE, + start, EDIT_DIST_PROB_BOUND); + Edit_Match_Limit [e] = start - 1; + assert (Edit_Match_Limit [e] >= Edit_Match_Limit [e - 1]); + } + + for (i = 0; i <= MAX_FRAG_LEN; i ++) + Error_Bound [i] = Max_int (10, 1 + (int) (2 * i * MAX_ERROR_RATE)); + + return; + } + + + +FILE * Local_File_Open + (const char * filename, const char * mode) + +/* Open Filename in Mode and return a pointer to its control +* block. If fail, print a message and exit. */ + + { + FILE * fp; + + fp = fopen (filename, mode); + if (fp == NULL) + { + fprintf (stderr, "ERROR %d: Could not open file %s \n", + errno, filename); + perror (strerror (errno)); + exit (EXIT_FAILURE); + } + + return fp; + } + + + +int Max_int + (int a, int b) + +// Return the larger of a and b . + + { + if (a < b) + return b; + + return a; + } + + + +int Min_int + (int a, int b) + +// Return the smaller of a and b . + + { + if (a < b) + return a; + + return b; + } + + + +void Parse_Command_Line + (int argc, char * argv []) + +// Get options and parameters from command line with argc +// arguments in argv [0 .. (argc - 1)] . + + { + int ch, errflg = FALSE; + + optarg = NULL; + + while (! errflg + && ((ch = getopt (argc, argv, "De:nN:q:r:Stv:xW:")) != EOF)) + switch (ch) + { + case 'D' : + Only_Difference_Positions = true; + break; + + case 'e' : + Percent_ID = 1.0 - atof (optarg); + UserScoring = TRUE; + break; + + case 'n' : + Nucleotides_Only = TRUE; + break; + + case 'N' : + Consec_Non_ACGT = strtol (optarg, NULL, 10); + break; + + case 'q' : + Query_Suffix = optarg; + break; + + case 'r' : + Ref_Suffix = optarg; + break; + + case 'S' : + Show_Differences = TRUE; + break; + + case 't' : + Tag_From_Fasta_Line = TRUE; + break; + + case 'v' : + Verbose = strtol (optarg, NULL, 10); + break; + + case 'x' : + Output_Cover_Files = FALSE; + break; + + case 'W' : + Error_File_Name = optarg; + break; + + case '?' : + fprintf (stderr, "Unrecognized option -%c\n", optopt); + + default : + errflg = TRUE; + } + + if (errflg || optind != argc - 3) + { + Usage (argv [0]); + exit (EXIT_FAILURE); + } + + Ref_File_Path = argv [optind ++]; + + Match_File_Path = argv [optind ++]; + + Gaps_File_Path = argv [optind ++]; + + return; + } + + + +int Prefix_Edit_Dist + (char A [], int m, char T [], int n, int Error_Limit, + int * A_End, int * T_End, int * Match_To_End, + int Delta [MAX_ERRORS], int * Delta_Len, int extending) + +// Return the minimum number of changes (inserts, deletes, replacements) +// needed to match string A [0 .. (m-1)] with a prefix of string +// T [0 .. (n-1)] if it's not more than Error_Limit . +// Put delta description of alignment in Delta and set +// (* Delta_Len) to the number of entries there if it's a complete +// match. +// Set A_End and T_End to the rightmost positions where the +// alignment ended in A and T , respectively. +// Set Match_To_End true if the match extended to the end +// of at least one string; otherwise, set it false to indicate +// a branch point. +// extending indicates whether the match is for a fixed region (FALSE) +// or is extending off the end of a match as far as possible (TRUE) + + { + double Score, Max_Score; + int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0; + int Best_d, Best_e, Longest, Row; + int Left, Right; + int d, e, i, j, shorter; + +// assert (m <= n); + Best_d = Best_e = Longest = 0; + (* Delta_Len) = 0; + + if (Consec_Non_ACGT > 0) + { + int ct; + + for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++) + { + switch (A [i]) + { + case 'a' : + case 'c' : + case 'g' : + case 't' : + ct = 0; + break; + default : + ct ++; + } + } + if (ct >= Consec_Non_ACGT) + { + m = i - ct; + extending = TRUE; + } + + for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++) + { + switch (T [i]) + { + case 'a' : + case 'c' : + case 'g' : + case 't' : + ct = 0; + break; + default : + ct ++; + } + } + if (ct >= Consec_Non_ACGT) + { + n = i - ct; + extending = TRUE; + } + } + + shorter = Min_int (m, n); + for (Row = 0; Row < shorter && A [Row] == T [Row]; Row ++) + ; + + Edit_Array [0] [0] = Row; + + if (Row == m && Row == n) // Exact match + { + (* Delta_Len) = 0; + (* A_End) = (* T_End) = Row; + (* Match_To_End) = ! extending; + return 0; + } + + Left = Right = 0; + Max_Score = Row * Branch_Pt_Match_Value; + Max_Score_Len = Row; + Max_Score_Best_d = 0; + Max_Score_Best_e = 0; + + for (e = 1; e <= Error_Limit; e ++) + { + int cutoff; + + Left = Max_int (Left - 1, -e); + Right = Min_int (Right + 1, e); + Edit_Array [e - 1] [Left] = -2; + Edit_Array [e - 1] [Left - 1] = -2; + Edit_Array [e - 1] [Right] = -2; + Edit_Array [e - 1] [Right + 1] = -2; + + for (d = Left; d <= Right; d ++) + { + Row = 1 + Edit_Array [e - 1] [d]; + if ((j = Edit_Array [e - 1] [d - 1]) > Row) + Row = j; + if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row) + Row = j; + while (Row < m && Row + d < n + && A [Row] == T [Row + d]) + Row ++; + + Edit_Array [e] [d] = Row; + + if (Row == m && Row + d == n) + { + if (extending) + { + for (j = Left; j <= d; j ++) + if (Edit_Array [e] [j] > Longest) + { + Best_d = j; + Longest = Edit_Array [e] [j]; + } + Score = Longest * Branch_Pt_Match_Value - e; + if (Score < Max_Score) + { + (* A_End) = Max_Score_Len; + (* T_End) = Max_Score_Len + Max_Score_Best_d; + Set_Deltas (Delta, Delta_Len, Max_Score_Len, + Max_Score_Best_d, Max_Score_Best_e); + (* Match_To_End) = FALSE; + return Max_Score_Best_e; + } + } + + (* A_End) = Row; // One past last align position + (* T_End) = Row + d; + Set_Deltas (Delta, Delta_Len, Row, d, e); + (* Match_To_End) = ! extending; + return e; + } + } + + cutoff = Longest - GIVE_UP_LEN; + while (Left <= Right && Edit_Array [e] [Left] < cutoff) + Left ++; + if (Left > Right) + break; + while (Right > Left && Edit_Array [e] [Right] < cutoff) + Right --; + + assert (Left <= Right); + + for (d = Left; d <= Right; d ++) + if (Edit_Array [e] [d] > Longest) + { + Best_d = d; + Best_e = e; + Longest = Edit_Array [e] [d]; + } + + Score = Longest * Branch_Pt_Match_Value - e; + // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0 + if (Score > Max_Score) + { + Max_Score = Score; + Max_Score_Len = Longest; + Max_Score_Best_d = Best_d; + Max_Score_Best_e = Best_e; + } + + if (Longest - Max_Score_Len >= GIVE_UP_LEN) + break; + } + + (* A_End) = Max_Score_Len; + (* T_End) = Max_Score_Len + Max_Score_Best_d; + Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e); + (* Match_To_End) = FALSE; + + return Max_Score_Best_e; + } + + + +int Read_String + (FILE * fp, char * * T, long int * Size, char header []) + +/* Read next string from fp (assuming FASTA format) into (* T) [1 ..] +* which has Size characters. Allocate extra memory if needed +* and adjust Size accordingly. Return TRUE if successful, FALSE +* otherwise (e.g., EOF). Set header to the contents of the FASTA +* header line. */ + + { + char Line [MAX_LINE]; + long int Len; + int Ch, Ct; + + if (feof (fp)) + return FALSE; + + while ((Ch = fgetc (fp)) != EOF && Ch != '>') + ; + + if (Ch != '>') + return FALSE; + + fgets (Line, MAX_LINE, fp); + Len = strlen (Line); + assert (Len > 0 && Line [Len - 1] == '\n'); + Line [Len - 1] = '\0'; + strcpy (header, Line); + + + Ct = 0; + Len = 0; + while ((Ch = fgetc (fp)) != EOF && Ch != '>') + { + if (isspace (Ch)) + continue; + + Ct ++; + + if (Ct >= (* Size) - 1) + { + (* Size) = (long int) ((* Size) * EXPANSION_FACTOR); + (* T) = (char *) Safe_realloc ((* T), (* Size)); + } + Ch = tolower (Ch); + if (! isalpha (Ch)) + { + fprintf (stderr, "Unexpected character `%c\' in string %s\n", + Ch, header); + Ch = 'x'; + } + (* T) [++ Len] = Ch; + } + + (* T) [Len + 1] = '\0'; + if (Ch == '>') + ungetc (Ch, fp); + + return TRUE; + } + + + +void Rev_Complement + (char * s) + +/* Set string s to its DNA reverse complement. */ + + { + char ch; + int i, j, len; + + len = strlen (s); + + for (i = 0, j = len - 1; i < j; i ++, j --) + { + ch = Complement (s [i]); + s [i] = Complement (s [j]); + s [j] = ch; + } + + if (i == j) + s [i] = Complement (s [i]); + + return; + } + + + +void Rev_Display_Alignment + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct) + +// Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] +// between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] +// in the reverse direction. + + { + int i, j, k, m, top_len, bottom_len; + char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; + char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; + + i = j = top_len = bottom_len = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + top [top_len ++] = a [- i ++]; + j ++; + } + if (delta [k] < 0) + { + top [top_len ++] = '-'; + j ++; + } + else + { + top [top_len ++] = a [- i ++]; + } + } + while (i < a_len && j < b_len) + { + top [top_len ++] = a [- i ++]; + j ++; + } + top [top_len] = '\0'; + + + i = j = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + bottom [bottom_len ++] = b [- j ++]; + i ++; + } + if (delta [k] > 0) + { + bottom [bottom_len ++] = '-'; + i ++; + } + else + { + bottom [bottom_len ++] = b [- j ++]; + } + } + while (j < b_len && i < a_len) + { + bottom [bottom_len ++] = b [- j ++]; + i ++; + } + bottom [bottom_len] = '\0'; + + + for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) + { + putchar ('\n'); + printf ("A: "); + for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) + putchar (top [i + j]); + putchar ('\n'); + printf ("B: "); + for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) + putchar (bottom [i + j]); + putchar ('\n'); + printf (" "); + for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; + j ++) + if (top [i + j] != ' ' && bottom [i + j] != ' ' + && top [i + j] != bottom [i + j]) + putchar ('^'); + else + putchar (' '); + putchar ('\n'); + } + + return; + } + + + +int Rev_Prefix_Edit_Dist + (char A [], int m, char T [], int n, int Error_Limit, + int * A_End, int * T_End, int * Match_To_End, + int Delta [MAX_ERRORS], int * Delta_Len, int extending) + +// Return the minimum number of changes (inserts, deletes, replacements) +// needed to match string A [0 .. -(m-1)] with a prefix of string +// T [0 .. -(n-1)] if it's not more than Error_Limit . +// Note: Match is reverse direction, right to left. +// Put delta description of alignment in Delta and set +// (* Delta_Len) to the number of entries there if it's a complete +// match. +// Set A_End and T_End to the rightmost positions where the +// alignment ended in A and T , respectively. +// Set Match_To_End true if the match extended to the end +// of at least one string; otherwise, set it false to indicate +// a branch point. +// extending indicates whether the match is for a fixed region (FALSE) +// or is extending off the end of a match as far as possible (TRUE) + + { + double Score, Max_Score; + int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0; + int Best_d, Best_e, Longest, Row; + int Left, Right; + int d, e, i, j, shorter; + +// assert (m <= n); + Best_d = Best_e = Longest = 0; + (* Delta_Len) = 0; + + if (Consec_Non_ACGT > 0) + { + int ct; + + for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++) + { + switch (A [- i]) + { + case 'a' : + case 'c' : + case 'g' : + case 't' : + ct = 0; + break; + default : + ct ++; + } + } + if (ct >= Consec_Non_ACGT) + { + m = i - ct; + extending = TRUE; + } + + for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++) + { + switch (T [- i]) + { + case 'a' : + case 'c' : + case 'g' : + case 't' : + ct = 0; + break; + default : + ct ++; + } + } + if (ct >= Consec_Non_ACGT) + { + n = i - ct; + extending = TRUE; + } + } + + shorter = Min_int (m, n); + for (Row = 0; Row < shorter && A [- Row] == T [- Row]; Row ++) + ; + + Edit_Array [0] [0] = Row; + + if (Row == m && Row == n) // Exact match + { + (* Delta_Len) = 0; + (* A_End) = (* T_End) = Row; + (* Match_To_End) = ! extending; + return 0; + } + + Left = Right = 0; + Max_Score = Row * Branch_Pt_Match_Value; + Max_Score_Len = Row; + Max_Score_Best_d = 0; + Max_Score_Best_e = 0; + + for (e = 1; e <= Error_Limit; e ++) + { + int cutoff; + + Left = Max_int (Left - 1, -e); + Right = Min_int (Right + 1, e); + Edit_Array [e - 1] [Left] = -2; + Edit_Array [e - 1] [Left - 1] = -2; + Edit_Array [e - 1] [Right] = -2; + Edit_Array [e - 1] [Right + 1] = -2; + + for (d = Left; d <= Right; d ++) + { + Row = 1 + Edit_Array [e - 1] [d]; + if ((j = Edit_Array [e - 1] [d - 1]) > Row) + Row = j; + if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row) + Row = j; + while (Row < m && Row + d < n + && A [- Row] == T [- Row - d]) + Row ++; + + Edit_Array [e] [d] = Row; + + if (Row == m && Row + d == n) + { + if (extending) + { + for (j = Left; j <= d; j ++) + if (Edit_Array [e] [j] > Longest) + { + Best_d = j; + Longest = Edit_Array [e] [j]; + } + Score = Longest * Branch_Pt_Match_Value - e; + if (Score < Max_Score) + { + (* A_End) = Max_Score_Len; + (* T_End) = Max_Score_Len + Max_Score_Best_d; + Set_Deltas (Delta, Delta_Len, Max_Score_Len, + Max_Score_Best_d, Max_Score_Best_e); + (* Match_To_End) = FALSE; + return Max_Score_Best_e; + } + } + + (* A_End) = Row; // One past last align position + (* T_End) = Row + d; + Set_Deltas (Delta, Delta_Len, Row, d, e); + (* Match_To_End) = ! extending; + return e; + } + } + + cutoff = Longest - GIVE_UP_LEN; + while (Left <= Right && Edit_Array [e] [Left] < cutoff) + Left ++; + if (Left > Right) + break; + while (Right > Left && Edit_Array [e] [Right] < cutoff) + Right --; + assert (Left <= Right); + + for (d = Left; d <= Right; d ++) + if (Edit_Array [e] [d] > Longest) + { + Best_d = d; + Best_e = e; + Longest = Edit_Array [e] [d]; + } + + Score = Longest * Branch_Pt_Match_Value - e; + // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0 + if (Score > Max_Score) + { + Max_Score = Score; + Max_Score_Len = Longest; + Max_Score_Best_d = Best_d; + Max_Score_Best_e = Best_e; + } + + + if (Longest - Max_Score_Len >= GIVE_UP_LEN) + break; + } + + (* A_End) = Max_Score_Len; + (* T_End) = Max_Score_Len + Max_Score_Best_d; + Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e); + (* Match_To_End) = FALSE; + + return Max_Score_Best_e; + } + + + +void Rev_Show_Diffs + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, + long int a_ref, long int b_ref) + +// Show (to stdout ) the differences +// between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] +// encoded in delta [0 .. (delta_ct - 1)] +// in the reverse direction. a_ref is the position of the beginning +// of string a . b_ref is the offset to the start of +// string b . + + { + int i, j, k, m; + + i = j = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + if (a [- i] != b [- j]) + printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]); + i ++; + j ++; + } + if (delta [k] < 0) + { + printf ("%8ld %8ld: ins %c\n", a_ref - i, b_ref - j, b [- j]); + j ++; + } + else + { + printf ("%8ld %8ld: del %c\n", a_ref - i, b_ref - j, a [- i]); + i ++; + } + } + while (i < a_len && j < b_len) + { + if (a [- i] != b [- j]) + printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]); + i ++; + j ++; + } + + return; + } + + + +void Set_Deltas + (int delta [], int * delta_len, int row, int d, int e) + +// Set delta to values that show the alignment recorded in +// global Edit_Array . Set (* delta_len) to the number of +// entries set. row is the length of the match in the first +// string. d is the offset at the end of the match between +// the first and second string. e is the number of errors. + + { + int delta_stack [MAX_ERRORS]; + int from, last, max; + int i, j, k; + + last = row; + (* delta_len) = 0; + + for (k = e; k > 0; k --) + { + from = d; + max = 1 + Edit_Array [k - 1] [d]; + if ((j = Edit_Array [k - 1] [d - 1]) > max) + { + from = d - 1; + max = j; + } + if ((j = 1 + Edit_Array [k - 1] [d + 1]) > max) + { + from = d + 1; + max = j; + } + if (from == d - 1) + { + delta_stack [(* delta_len) ++] = max - last - 1; + d --; + last = Edit_Array [k - 1] [from]; + } + else if (from == d + 1) + { + delta_stack [(* delta_len) ++] = last - (max - 1); + d ++; + last = Edit_Array [k - 1] [from]; + } + } + delta_stack [(* delta_len) ++] = last + 1; + + k = 0; + for (i = (* delta_len) - 1; i > 0; i --) + delta [k ++] + = abs (delta_stack [i]) * Sign (delta_stack [i - 1]); + (* delta_len) --; + + return; + } + + + +static void Set_Left_Pad + (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len) + +// Set pad to a triple of strings, each of length pad_len , +// representing a MUM of length mum_len ending at r_hi in Ref +// and q_hi in Query . Each string in pad must have been allocated +// at least pad_len + 1 bytes of memory. The purpose is to serve as +// the left-end padding for an alignment with this MUM as its left +// boundary. + + { + int i; + + for (i = 0; i < 3; i ++) + pad [i] [pad_len] = '\0'; + + for (i = 0; i < pad_len; i ++) + { + pad [0] [pad_len - i - 1] = r_hi - i > 0 ? Ref [r_hi - i] : '.'; + pad [1] [pad_len - i - 1] = q_hi - i > 0 ? Query [q_hi - i] : '.'; + pad [2] [pad_len - i - 1] = i < mum_len ? '=' : ' '; + } + + return; + } + + + +static void Set_Right_Pad + (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len) + +// Set pad to a triple of strings, each of length pad_len , +// representing a MUM of length mum_len beginning at r_lo in Ref +// and q_lo in Query . Each string in pad must have been allocated +// at least pad_len + 1 bytes of memory. The purpose is to serve as +// the right-end padding for an alignment with this MUM as its right +// boundary. + + { + int i; + + for (i = 0; i < 3; i ++) + pad [i] [pad_len] = '\0'; + + for (i = 0; i < pad_len; i ++) + { + pad [0] [i] = r_lo + i <= Ref_Len ? Ref [r_lo + i] : '.'; + pad [1] [i] = q_lo + i <= Query_Len ? Query [q_lo + i] : '.'; + pad [2] [i] = i < mum_len ? '=' : ' '; + } + + return; + } + + + +void Show_Coverage + (Cover_t * list, char * filename, char * tag, char * seq) + +// Print to stderr summary stats of regions in list . +// Send to filename a list of region info suitable for celagram . +// Use tag to print the headings. Regions refer to seq . + + { + FILE * fp; + Cover_t * p; + int ct = 0; + long int i, prev_hi = 0, n_ct, len, seq_len; + long int cov_ns = 0, gap_ns = 0, total_ns; + double cov_total = 0.0, gap_total = 0.0; + + fp = Local_File_Open (filename, "w"); + fprintf (fp, "%-9s %9s %9s %8s %8s %6s\n", + "Region", "Start", "End", "Len", "N's", "%N's"); + + for (p = list; p != NULL; p = p -> next) + { + n_ct = 0; + for (i = prev_hi + 1; i < p -> lo; i ++) + if (strchr ("acgt", seq [i]) == NULL) + n_ct ++; + len = p -> lo - prev_hi - 1; + fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", + ct, prev_hi + 1, p -> lo - 1, len, n_ct, + len == 0 ? 0.0 : 100.0 * n_ct / len); + gap_total += len; + gap_ns += n_ct; + + ct ++; + + n_ct = 0; + for (i = p -> lo; i <= p -> hi; i ++) + if (strchr ("acgt", seq [i]) == NULL) + n_ct ++; + len = 1 + p -> hi - p -> lo; + fprintf (fp, "cov%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", + ct, p -> lo, p -> hi, len, n_ct, + len == 0 ? 0.0 : 100.0 * n_ct / len); + cov_total += len; + cov_ns += n_ct; + + prev_hi = p -> hi; + } + + n_ct = 0; + seq_len = strlen (seq + 1); + for (i = prev_hi + 1; i <= seq_len; i ++) + if (strchr ("acgt", seq [i]) == NULL) + n_ct ++; + len = seq_len - prev_hi; + fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", + ct, prev_hi + 1, seq_len, len, n_ct, + len == 0 ? 0.0 : 100.0 * n_ct / len); + gap_total += len; + gap_ns += n_ct; + + total_ns = cov_ns + gap_ns; + + fclose (fp); + + fprintf (stderr, "\n%s Sequence Coverage:\n", tag); + fprintf (stderr, " Sequence length = %ld\n", seq_len); + fprintf (stderr, " acgt's = %ld\n", seq_len - total_ns); + fprintf (stderr, " Non acgt's = %ld\n", total_ns); + fprintf (stderr, " Number of regions = %d\n", ct); + fprintf (stderr, " Matched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n", + cov_total, + seq_len == 0.0 ? 0.0 : 100.0 * cov_total / seq_len, + seq_len - total_ns == 0.0 ? 0.0 : + 100.0 * (cov_total - cov_ns) / (seq_len - total_ns)); + fprintf (stderr, " Avg match len = %.0f\n", + ct == 0 ? 0.0 : cov_total / ct); + fprintf (stderr, " N's in matches = %ld (%.1f%%)\n", + cov_ns, + cov_total == 0.0 ? 0.0 : 100.0 * cov_ns / cov_total); + fprintf (stderr, " Unmatched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n", + gap_total, + seq_len == 0.0 ? 0.0 : 100.0 * gap_total / seq_len, + seq_len - total_ns == 0.0 ? 0.0 : + 100.0 * (gap_total - gap_ns) / (seq_len - total_ns)); + fprintf (stderr, " Avg gap len = %.0f\n", + gap_total / (1.0 + ct)); + fprintf (stderr, " N's in gaps = %ld (%.1f%%)\n", + gap_ns, + gap_total == 0.0 ? 0.0 : 100.0 * gap_ns / gap_total); + + return; + } + + + +void Show_Diffs + (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, + long int a_ref, long int b_ref) + +// Show (to stdout ) the differences +// between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] +// encoded in delta [0 .. (delta_ct - 1)] . a_ref is the offset to +// the start of string a . b_ref is the offset to the start of +// string b . + + { + int i, j, k, m; + + i = j = 0; + for (k = 0; k < delta_ct; k ++) + { + for (m = 1; m < abs (delta [k]); m ++) + { + if (a [i] != b [j]) + printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]); + i ++; + j ++; + } + if (delta [k] < 0) + { + printf ("%8ld %8ld: ins %c\n", a_ref + i - 1, b_ref + j, b [j]); + j ++; + } + else + { + printf ("%8ld %8ld: del %c\n", a_ref + i, b_ref + j - 1, a [i]); + i ++; + } + } + while (i < a_len && j < b_len) + { + if (a [i] != b [j]) + printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]); + i ++; + j ++; + } + + return; + } + + + +int Sign + (int a) + +// Return the algebraic sign of a . + + { + if (a > 0) + return 1; + else if (a < 0) + return -1; + + return 0; + } + + + +void Usage + (char * command) + +// Print to stderr description of options and command line for +// this program. command is the command that was used to +// invoke it. + + { + fprintf (stderr, + "USAGE: %s <RefSequence> <MatchSequences> <GapsFile>\n" + "\n" + "Combines MUMs in <GapsFile> by extending matches off\n" + "ends and between MUMs. <RefSequence> is a fasta file\n" + "of the reference sequence. <MatchSequences> is a\n" + "multi-fasta file of the sequences matched against the\n" + "reference\n" + "\n" + "Options:\n" + "-D Only output to stdout the difference positions\n" + " and characters\n" + "-n Allow matches only between nucleotides, i.e., ACGTs\n" + "-N num Break matches at <num> or more consecutive non-ACGTs \n" + "-q tag Used to label query match\n" + "-r tag Used to label reference match\n" + "-S Output all differences in strings\n" + "-t Label query matches with query fasta header\n" + "-v num Set verbose level for extra output\n" + "-W file Reset the default output filename witherrors.gaps\n" + "-x Don't output .cover files\n" + "-e Set error-rate cutoff to e (e.g. 0.02 is two percent)\n", + + command); + + return; + }