jpayne@69: /************************************************* jpayne@69: * Module: CombineMUMs.cc jpayne@69: * Author: Art Delcher jpayne@69: * Description: jpayne@69: * Take clusters of MUMs (from mgaps program) and do alignments jpayne@69: * to combine them into matches. jpayne@69: * jpayne@69: */ jpayne@69: jpayne@69: jpayne@69: #include "tigrinc.hh" jpayne@69: #include jpayne@69: using namespace std; jpayne@69: jpayne@69: jpayne@69: // Constants jpayne@69: jpayne@69: #define BRANCH_PT_MATCH_VALUE 0.272 jpayne@69: // Value to add for a match in finding branch points jpayne@69: // 1.20 was the calculated value for 6% vs 35% error discrimination jpayne@69: // Converting to integers didn't make it faster jpayne@69: #define BRANCH_PT_ERROR_VALUE -0.728 jpayne@69: // Value to add for a mismatch in finding branch points jpayne@69: // -2.19 was the calculated value for 6% vs 35% error discrimination jpayne@69: // Converting to integers didn't make it faster jpayne@69: #define DEFAULT_ERROR_FILE_NAME "witherrors.gaps" jpayne@69: // Name of file produced that matches the input gaps file jpayne@69: // but with an extra column showing the number of errors in jpayne@69: // each gap jpayne@69: #define DEFAULT_PAD 10 jpayne@69: // Default number of matching bases to show at beginning jpayne@69: // and end of alignments jpayne@69: #define EDIT_DIST_PROB_BOUND 1e-4 jpayne@69: // Probability limit to "band" edit-distance calculation jpayne@69: // Determines NORMAL_DISTRIB_THOLD jpayne@69: #define ERRORS_FOR_FREE 1 jpayne@69: // The number of errors that are ignored in setting probability jpayne@69: // bound for terminating alignment extensions in edit distance jpayne@69: // calculations jpayne@69: #define EXPANSION_FACTOR 1.4 jpayne@69: // Factor by which to grow memory when realloc'ing jpayne@69: #define GIVE_UP_LEN 200 jpayne@69: // Stop alignment when go this many bases past max-score value jpayne@69: #define INITIAL_BUFFER_SIZE 10000 jpayne@69: // Initial number of bytes to use for sequence strings jpayne@69: #define MATCH_SCORE 1.0 jpayne@69: // Score to add for match in finding length of alignment jpayne@69: #define MAX_ERROR_RATE 0.06 jpayne@69: // The largest error allowed in overlaps jpayne@69: #define MAX_FRAG_LEN 1024 jpayne@69: // The longest fragment allowed jpayne@69: #define MAX_ERRORS 500 jpayne@69: // Most errors in any edit distance computation jpayne@69: #define MAX_EXTENSION 10000 jpayne@69: // Maximum extension match out from a match jpayne@69: #define MAX_HDR_LEN 10000 jpayne@69: // Longest allowable fasta header line jpayne@69: #define MAX_MEMORY_STORE 50000 jpayne@69: // The most fragments allowed in memory store jpayne@69: #define MIN_BRANCH_END_DIST 20 jpayne@69: // Branch points must be at least this many bases from the jpayne@69: // end of the fragment to be reported jpayne@69: #define MIN_BRANCH_TAIL_SLOPE 0.20 jpayne@69: // Branch point tails must fall off from the max by at least jpayne@69: // this rate jpayne@69: #define MIN_MATCH_LEN 40 jpayne@69: // Number of bases in match region in order to count it jpayne@69: #define MISMATCH_SCORE -3.0 jpayne@69: // Score to add for non-match in finding length of alignment jpayne@69: #define NORMAL_DISTRIB_THOLD 3.62 jpayne@69: // Determined by EDIT_DIST_PROB_BOUND jpayne@69: #define REALLY_VERBOSE 0 jpayne@69: // If 1 prints tons of stuff jpayne@69: #define VERBOSE 0 jpayne@69: // If 1 prints lots of stuff jpayne@69: jpayne@69: jpayne@69: jpayne@69: // Type definitions jpayne@69: jpayne@69: typedef struct s_Cover_t jpayne@69: { jpayne@69: long int lo, hi; jpayne@69: struct s_Cover_t * next; jpayne@69: } Cover_t; jpayne@69: jpayne@69: jpayne@69: jpayne@69: // Globals jpayne@69: jpayne@69: float Branch_Pt_Match_Value = BRANCH_PT_MATCH_VALUE; jpayne@69: // Score used for matches to locate branch points, i.e., jpayne@69: // where alignment score peaks jpayne@69: float Branch_Pt_Error_Value = BRANCH_PT_ERROR_VALUE; jpayne@69: // Score used for mismatches to locate branch points, i.e., jpayne@69: int Consec_Non_ACGT = 0; jpayne@69: // Stop alignments when encounter at least this many non-acgt characters. jpayne@69: int * Edit_Array [MAX_ERRORS]; jpayne@69: // Use for alignment calculation. Points into Edit_Space . jpayne@69: int Edit_Match_Limit [MAX_ERRORS] = {0}; jpayne@69: // This array [e] is the minimum value of Edit_Array [e] [d] jpayne@69: // to be worth pursuing in edit-distance computations between guides jpayne@69: int Edit_Space [(MAX_ERRORS + 4) * MAX_ERRORS]; jpayne@69: // Memory used by alignment calculation jpayne@69: int Error_Bound [MAX_FRAG_LEN + 1]; jpayne@69: // This array [i] is the maximum number of errors allowed jpayne@69: // in a match between sequences of length i , which is jpayne@69: // i * MAXERROR_RATE . jpayne@69: char * Error_File_Name = DEFAULT_ERROR_FILE_NAME; jpayne@69: // Name of file to write gaps listing with # errors in each gap jpayne@69: int Fill_Ct = 0; jpayne@69: // Number of non-acgt bases in ref sequence jpayne@69: char * Gaps_File_Path = NULL; jpayne@69: // Name of file produced by mgaps program jpayne@69: char * Match_File_Path = NULL; jpayne@69: // Name of multifasta file of sequences to compare against the reference jpayne@69: // sequence jpayne@69: int UserScoring = FALSE; jpayne@69: // If TRUE, then user specified a percent ID cutoff and scoring jpayne@69: // is adjusted to enforce this in extensions jpayne@69: int Nucleotides_Only = FALSE; jpayne@69: // If TRUE , then only acgt's can match jpayne@69: bool Only_Difference_Positions = false; jpayne@69: // If true , then output just positions of difference instead of jpayne@69: // alignments between exact matches jpayne@69: int Output_Cover_Files = TRUE; jpayne@69: // If TRUE , output files showing coverage of each genome. jpayne@69: double Percent_ID; jpayne@69: // Desired identity rate of matches to be found jpayne@69: // Expressed as a fraction, not really a percent jpayne@69: char * Query = NULL; jpayne@69: // The query sequence jpayne@69: long int Query_Len; jpayne@69: // The length of the query sequence jpayne@69: char * Query_Suffix = "Query"; jpayne@69: // Suffix for query tag jpayne@69: char * Ref = NULL; jpayne@69: // The reference sequence jpayne@69: char * Ref_File_Path = NULL; jpayne@69: // Name of (single) fasta file of reference sequence jpayne@69: long int Ref_Len; jpayne@69: // The length of the reference sequence jpayne@69: long int Ref_Size; jpayne@69: // The size of the reference sequence buffer jpayne@69: char * Ref_Suffix = "Ref"; jpayne@69: // Suffix for reference tag jpayne@69: int Show_Differences = FALSE; jpayne@69: // If TRUE then show differences in all alignments jpayne@69: int Tag_From_Fasta_Line = FALSE; jpayne@69: // If TRUE then use fasta tag from ref & query sequences as jpayne@69: // 1st & 2nd column, resp., on match line to identify matches jpayne@69: int Verbose = 0; jpayne@69: // Controls printing of extra debuggin information jpayne@69: jpayne@69: jpayne@69: bool Global_Debug_Flag = false; jpayne@69: jpayne@69: jpayne@69: jpayne@69: // Functions jpayne@69: jpayne@69: void Add_Coverage jpayne@69: (Cover_t * * list, long int lo, long int hi); jpayne@69: int Binomial_Bound jpayne@69: (int, double, int, double); jpayne@69: void Display_Alignment jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct); jpayne@69: void Display_Alignment_With_Pad jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, jpayne@69: char * left_pad [3], char * right_pad [3], int pad_len); jpayne@69: void Display_Difference_Positions jpayne@69: (char * a, int a_start, int a_len, char * b, int b_start, int b_len, jpayne@69: int b_incr, int delta [], int delta_ct, const char * a_tag, jpayne@69: const char * b_tag); jpayne@69: int Extend_Backward jpayne@69: (long int * ref_lo, long int * query_lo); jpayne@69: int Extend_Forward jpayne@69: (long int * ref_hi, long int * query_hi); jpayne@69: void Initialize_Globals jpayne@69: (void); jpayne@69: FILE * Local_File_Open jpayne@69: (const char * filename, const char * mode); jpayne@69: int Max_int jpayne@69: (int a, int b); jpayne@69: int Min_int jpayne@69: (int a, int b); jpayne@69: void Parse_Command_Line jpayne@69: (int argc, char * argv []); jpayne@69: int Prefix_Edit_Dist jpayne@69: (char A [], int m, char T [], int n, int Error_Limit, jpayne@69: int * A_End, int * T_End, int * Match_To_End, jpayne@69: int Delta [MAX_ERRORS], int * Delta_Len, int extending); jpayne@69: int Read_String jpayne@69: (FILE * fp, char * * T, long int * Size, char header []); jpayne@69: void Rev_Complement jpayne@69: (char * s); jpayne@69: void Rev_Display_Alignment jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct); jpayne@69: int Rev_Prefix_Edit_Dist jpayne@69: (char A [], int m, char T [], int n, int Error_Limit, jpayne@69: int * A_End, int * T_End, int * Match_To_End, jpayne@69: int Delta [MAX_ERRORS], int * Delta_Len, int extending); jpayne@69: void Rev_Show_Diffs jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, jpayne@69: long int a_ref, long int b_ref); jpayne@69: void Set_Deltas jpayne@69: (int delta [], int * delta_len, int row, int d, int e); jpayne@69: static void Set_Left_Pad jpayne@69: (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len); jpayne@69: static void Set_Right_Pad jpayne@69: (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len); jpayne@69: void Show_Coverage jpayne@69: (Cover_t * list, char * filename, char * tag, char * seq); jpayne@69: void Show_Diffs jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, jpayne@69: long int a_ref, long int b_ref); jpayne@69: int Sign jpayne@69: (int a); jpayne@69: void Usage jpayne@69: (char * command); jpayne@69: jpayne@69: jpayne@69: jpayne@69: int main jpayne@69: (int argc, char * argv []) jpayne@69: jpayne@69: { jpayne@69: FILE * fp, * match_fp, * error_fp; jpayne@69: char ref_header [MAX_HDR_LEN]; jpayne@69: char header [MAX_HDR_LEN], ref_tag [MAX_HDR_LEN], query_tag [MAX_HDR_LEN]; jpayne@69: char line [MAX_LINE], filename [MAX_LINE]; jpayne@69: char * left_pad [3], * right_pad [3]; jpayne@69: long int ref_pos, ref_lo, ref_hi; jpayne@69: long int query_pos, query_lo, query_hi, query_size; jpayne@69: long int match_len, prior_match_len = 0, max_len; jpayne@69: double error = 0.0; jpayne@69: long int total_errors = 0; jpayne@69: int match_ct = 0; jpayne@69: double match_total = 0.0; jpayne@69: int is_forward = FALSE; jpayne@69: int first_match = TRUE; jpayne@69: Cover_t * ref_cover_list = NULL; jpayne@69: Cover_t * query_cover_list = NULL; jpayne@69: char * p; jpayne@69: int i; jpayne@69: jpayne@69: Parse_Command_Line (argc, argv); jpayne@69: jpayne@69: if(UserScoring){ jpayne@69: // percentID = mismatch penalty / (match bonus + mismatch penalty) jpayne@69: // or, more compactly, p = mm / (m+mm) jpayne@69: // Since we require that m+mm = 1, m=1-mm jpayne@69: // and so we get the trivial mm = p jpayne@69: // jpayne@69: // This means that Branch_Pt_Error_Value = -p jpayne@69: // and then Branch_Pt_Match_Value = 1-p jpayne@69: // jpayne@69: // Make it so: jpayne@69: Branch_Pt_Error_Value = - Percent_ID; jpayne@69: Branch_Pt_Match_Value = 1.0 - Percent_ID; jpayne@69: } jpayne@69: jpayne@69: Initialize_Globals (); jpayne@69: jpayne@69: p = strrchr (Ref_File_Path, '/'); jpayne@69: if (p == NULL) jpayne@69: strcpy (ref_tag, Ref_File_Path); jpayne@69: else jpayne@69: strcpy (ref_tag, p + 1); jpayne@69: p = strrchr (ref_tag, '.'); jpayne@69: if (p != NULL) jpayne@69: (* p) = '\0'; jpayne@69: strcat (ref_tag, Ref_Suffix); jpayne@69: jpayne@69: p = strrchr (Match_File_Path, '/'); jpayne@69: if (p == NULL) jpayne@69: strcpy (query_tag, Match_File_Path); jpayne@69: else jpayne@69: strcpy (query_tag, p + 1); jpayne@69: p = strrchr (query_tag, '.'); jpayne@69: if (p != NULL) jpayne@69: (* p) = '\0'; jpayne@69: strcat (query_tag, Query_Suffix); jpayne@69: jpayne@69: fp = Local_File_Open (Ref_File_Path, "r"); jpayne@69: Ref_Size = 100000; jpayne@69: Ref = (char *) Safe_malloc (Ref_Size); jpayne@69: jpayne@69: assert (Read_String (fp, & Ref, & Ref_Size, ref_header)); jpayne@69: if (Tag_From_Fasta_Line) jpayne@69: strcpy (ref_tag, strtok (ref_header, " \t\n>")); jpayne@69: fclose (fp); jpayne@69: jpayne@69: Ref_Len = strlen (Ref + 1); jpayne@69: Ref = (char *) Safe_realloc (Ref, 1 + Ref_Len); jpayne@69: for (i = 1; i <= Ref_Len; i ++) jpayne@69: switch (Ref [i]) jpayne@69: { jpayne@69: case 'a' : jpayne@69: case 'c' : jpayne@69: case 'g' : jpayne@69: case 't' : jpayne@69: break; jpayne@69: default : jpayne@69: if (Nucleotides_Only) jpayne@69: Ref [i] = '2'; jpayne@69: Fill_Ct ++; jpayne@69: } jpayne@69: jpayne@69: for (i = 0; i < 3; i ++) jpayne@69: { jpayne@69: left_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1); jpayne@69: right_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1); jpayne@69: } jpayne@69: jpayne@69: fp = Local_File_Open (Gaps_File_Path, "r"); jpayne@69: match_fp = Local_File_Open (Match_File_Path, "r"); jpayne@69: query_size = 100000; jpayne@69: Query = (char *) Safe_malloc (query_size); jpayne@69: jpayne@69: error_fp = Local_File_Open (Error_File_Name, "w"); jpayne@69: jpayne@69: while (fgets (line, MAX_LINE, fp) != NULL) jpayne@69: { jpayne@69: int line_len = strlen (line); jpayne@69: jpayne@69: if (line [0] == '>') jpayne@69: { jpayne@69: if (! first_match) jpayne@69: { jpayne@69: total_errors += Extend_Forward (& ref_hi, & query_hi); jpayne@69: max_len = 1 + ref_hi - ref_lo; jpayne@69: if (1 + abs (query_hi - query_lo) > max_len) jpayne@69: max_len = 1 + abs (query_hi - query_lo); jpayne@69: error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); jpayne@69: if (! Only_Difference_Positions) jpayne@69: printf jpayne@69: ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", jpayne@69: ref_lo, ref_hi, jpayne@69: is_forward ? query_lo : 1 + Query_Len - query_lo, jpayne@69: is_forward ? query_hi : 1 + Query_Len - query_hi, jpayne@69: total_errors, max_len, 100.0 * error); jpayne@69: jpayne@69: match_ct ++; jpayne@69: match_total += 1 + ref_hi - ref_lo; jpayne@69: total_errors = 0; jpayne@69: Add_Coverage (& ref_cover_list, ref_lo, ref_hi); jpayne@69: if (is_forward) jpayne@69: Add_Coverage (& query_cover_list, query_lo, query_hi); jpayne@69: else jpayne@69: Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, jpayne@69: 1 + Query_Len - query_lo); jpayne@69: } jpayne@69: if (Tag_From_Fasta_Line) jpayne@69: { jpayne@69: char buff [MAX_LINE]; jpayne@69: char * p; jpayne@69: jpayne@69: strcpy (buff, line + 1); jpayne@69: p = strtok (buff, " >\t\n"); jpayne@69: if (strlen (p) > 0) jpayne@69: strcpy (query_tag, p); jpayne@69: else jpayne@69: { jpayne@69: fprintf (stderr, "No tag on line %s", line); jpayne@69: strcpy (query_tag, "???"); jpayne@69: } jpayne@69: } jpayne@69: is_forward = ! is_forward; jpayne@69: if (is_forward) jpayne@69: { jpayne@69: assert (Read_String (match_fp, & Query, & query_size, header)); jpayne@69: Query_Len = strlen (Query + 1); jpayne@69: if (Nucleotides_Only) jpayne@69: { jpayne@69: for (i = 1; i <= Query_Len; i ++) jpayne@69: switch (Query [i]) jpayne@69: { jpayne@69: case 'a' : jpayne@69: case 'c' : jpayne@69: case 'g' : jpayne@69: case 't' : jpayne@69: break; jpayne@69: default : jpayne@69: Query [i] = '1'; jpayne@69: } jpayne@69: } jpayne@69: } jpayne@69: else jpayne@69: Rev_Complement (Query + 1); jpayne@69: first_match = TRUE; jpayne@69: printf ("%s", line); jpayne@69: fprintf (error_fp, "%s", line); jpayne@69: } jpayne@69: else if (line [0] == '#') jpayne@69: { jpayne@69: if (! first_match) jpayne@69: { jpayne@69: total_errors += Extend_Forward (& ref_hi, & query_hi); jpayne@69: max_len = 1 + ref_hi - ref_lo; jpayne@69: if (1 + abs (query_hi - query_lo) > max_len) jpayne@69: max_len = 1 + abs (query_hi - query_lo); jpayne@69: error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); jpayne@69: if (! Only_Difference_Positions) jpayne@69: printf jpayne@69: ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", jpayne@69: ref_lo, ref_hi, jpayne@69: is_forward ? query_lo : 1 + Query_Len - query_lo, jpayne@69: is_forward ? query_hi : 1 + Query_Len - query_hi, jpayne@69: total_errors, max_len, 100.0 * error); jpayne@69: jpayne@69: match_ct ++; jpayne@69: match_total += 1 + ref_hi - ref_lo; jpayne@69: total_errors = 0; jpayne@69: Add_Coverage (& ref_cover_list, ref_lo, ref_hi); jpayne@69: if (is_forward) jpayne@69: Add_Coverage (& query_cover_list, query_lo, query_hi); jpayne@69: else jpayne@69: Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, jpayne@69: 1 + Query_Len - query_lo); jpayne@69: } jpayne@69: first_match = TRUE; jpayne@69: printf ("%s", line); jpayne@69: fprintf (error_fp, "%s", line); jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: assert (sscanf (line, "%ld %ld %ld", jpayne@69: & ref_pos, & query_pos, & match_len) == 3); jpayne@69: if (first_match) jpayne@69: { jpayne@69: ref_lo = ref_pos; jpayne@69: query_lo = query_pos; jpayne@69: total_errors += Extend_Backward (& ref_lo, & query_lo); jpayne@69: if (! Only_Difference_Positions) jpayne@69: printf ("%s", line); jpayne@69: if (line_len > 0) jpayne@69: line [line_len - 1] = '\0'; jpayne@69: fprintf (error_fp, "%s %7s\n", line, "-"); jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: int errors; jpayne@69: int a_len, b_len; jpayne@69: int a_end, b_end, match_to_end; jpayne@69: int delta [MAX_ERRORS], delta_len; jpayne@69: jpayne@69: a_len = ref_pos - ref_hi - 1; jpayne@69: b_len = query_pos - query_hi - 1; jpayne@69: jpayne@69: errors = Prefix_Edit_Dist jpayne@69: (Ref + ref_hi + 1, a_len, jpayne@69: Query + query_hi + 1, b_len, jpayne@69: MAX_ERRORS - 1, & a_end, & b_end, jpayne@69: & match_to_end, delta, & delta_len, FALSE); jpayne@69: if (Show_Differences) jpayne@69: Show_Diffs (Ref + ref_hi + 1, a_end, jpayne@69: Query + query_hi + 1, b_end, jpayne@69: delta, delta_len, ref_hi + 1, query_hi + 1); jpayne@69: jpayne@69: if (! Only_Difference_Positions) jpayne@69: printf ("%s", line); jpayne@69: jpayne@69: Set_Left_Pad (left_pad, ref_hi, query_hi, prior_match_len, jpayne@69: DEFAULT_PAD); jpayne@69: Set_Right_Pad (right_pad, ref_pos, query_pos, match_len, jpayne@69: DEFAULT_PAD); jpayne@69: jpayne@69: if (a_end == 0 && b_end == 0) jpayne@69: { jpayne@69: if (! Only_Difference_Positions) jpayne@69: printf (" No extension\n"); jpayne@69: jpayne@69: if (line_len > 0) jpayne@69: line [line_len - 1] = '\0'; jpayne@69: fprintf (error_fp, "%s %7s\n", line, "-"); jpayne@69: } jpayne@69: else if (Only_Difference_Positions) jpayne@69: { jpayne@69: int q_start, b_incr; jpayne@69: jpayne@69: if (is_forward) jpayne@69: { jpayne@69: q_start = query_hi + 1; jpayne@69: b_incr = 1; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: q_start = Query_Len - query_hi; jpayne@69: b_incr = -1; jpayne@69: } jpayne@69: Display_Difference_Positions jpayne@69: (Ref + ref_hi + 1, ref_hi + 1, a_end, jpayne@69: Query + query_hi + 1, q_start, b_end, jpayne@69: b_incr, delta, delta_len, ref_tag, jpayne@69: query_tag); jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: printf (" Errors = %d\n", errors); jpayne@69: if (! Only_Difference_Positions) jpayne@69: Display_Alignment_With_Pad jpayne@69: (Ref + ref_hi + 1, a_end, jpayne@69: Query + query_hi + 1, b_end, jpayne@69: delta, delta_len, left_pad, right_pad, jpayne@69: DEFAULT_PAD); jpayne@69: if (line_len > 0) jpayne@69: line [line_len - 1] = '\0'; jpayne@69: fprintf (error_fp, "%s %7d\n", line, errors); jpayne@69: } jpayne@69: jpayne@69: total_errors += errors; jpayne@69: jpayne@69: if (! match_to_end) jpayne@69: { jpayne@69: ref_hi += a_end; jpayne@69: query_hi += b_end; jpayne@69: max_len = 1 + ref_hi - ref_lo; jpayne@69: if (1 + abs (query_hi - query_lo) > max_len) jpayne@69: max_len = 1 + abs (query_hi - query_lo); jpayne@69: error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); jpayne@69: if (! Only_Difference_Positions) jpayne@69: printf ( jpayne@69: "Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", jpayne@69: ref_lo, ref_hi, jpayne@69: is_forward ? query_lo : 1 + Query_Len - query_lo, jpayne@69: is_forward ? query_hi : 1 + Query_Len - query_hi, jpayne@69: total_errors, max_len, 100.0 * error); jpayne@69: jpayne@69: match_ct ++; jpayne@69: match_total += 1 + ref_hi - ref_lo; jpayne@69: total_errors = 0; jpayne@69: Add_Coverage (& ref_cover_list, ref_lo, ref_hi); jpayne@69: if (is_forward) jpayne@69: Add_Coverage (& query_cover_list, query_lo, query_hi); jpayne@69: else jpayne@69: Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, jpayne@69: 1 + Query_Len - query_lo); jpayne@69: ref_lo = ref_pos; jpayne@69: query_lo = query_pos; jpayne@69: total_errors += Extend_Backward (& ref_lo, & query_lo); jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: ref_hi = ref_pos + match_len - 1; jpayne@69: query_hi = query_pos + match_len - 1; jpayne@69: prior_match_len = match_len; jpayne@69: first_match = FALSE; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: if (! first_match) jpayne@69: { jpayne@69: total_errors += Extend_Forward (& ref_hi, & query_hi); jpayne@69: max_len = 1 + ref_hi - ref_lo; jpayne@69: if (1 + abs (query_hi - query_lo) > max_len) jpayne@69: max_len = 1 + abs (query_hi - query_lo); jpayne@69: error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len); jpayne@69: if (! Only_Difference_Positions) jpayne@69: printf jpayne@69: ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n", jpayne@69: ref_lo, ref_hi, jpayne@69: is_forward ? query_lo : 1 + Query_Len - query_lo, jpayne@69: is_forward ? query_hi : 1 + Query_Len - query_hi, jpayne@69: total_errors, max_len, 100.0 * error); jpayne@69: jpayne@69: match_ct ++; jpayne@69: match_total += 1 + ref_hi - ref_lo; jpayne@69: Add_Coverage (& ref_cover_list, ref_lo, ref_hi); jpayne@69: if (is_forward) jpayne@69: Add_Coverage (& query_cover_list, query_lo, query_hi); jpayne@69: else jpayne@69: Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi, jpayne@69: 1 + Query_Len - query_lo); jpayne@69: } jpayne@69: jpayne@69: fprintf (stderr, " Ref len = %ld\n", Ref_Len); jpayne@69: fprintf (stderr, " acgt's = %ld\n", Ref_Len - Fill_Ct); jpayne@69: fprintf (stderr, " Non acgt's = %d\n", Fill_Ct); jpayne@69: fprintf (stderr, " Number of matches = %d\n", match_ct); jpayne@69: fprintf (stderr, "Sum of match bases = %.0f\n", match_total); jpayne@69: fprintf (stderr, " Avg match bases = %.0f\n", jpayne@69: match_ct == 0 ? 0.0 : match_total / match_ct); jpayne@69: jpayne@69: fclose (error_fp); jpayne@69: jpayne@69: if (Output_Cover_Files) jpayne@69: { jpayne@69: strcpy (filename, ref_tag); jpayne@69: strcat (filename, ".cover"); jpayne@69: Show_Coverage (ref_cover_list, filename, ref_tag, Ref); jpayne@69: jpayne@69: strcpy (filename, query_tag); jpayne@69: strcat (filename, ".cover"); jpayne@69: Rev_Complement (Query + 1); jpayne@69: Show_Coverage (query_cover_list, filename, query_tag, Query); jpayne@69: } jpayne@69: jpayne@69: return 0; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Add_Coverage jpayne@69: (Cover_t * * list, long int lo, long int hi) jpayne@69: jpayne@69: // Add lo .. hi to list of regions covered in (* list) . jpayne@69: // Combine nodes when appropriate. jpayne@69: jpayne@69: { jpayne@69: Cover_t * new_node, * p, * prev = NULL; jpayne@69: jpayne@69: if ((* list) == NULL || hi + 1 < (* list) -> lo) jpayne@69: { jpayne@69: new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t)); jpayne@69: new_node -> lo = lo; jpayne@69: new_node -> hi = hi; jpayne@69: new_node -> next = (* list); jpayne@69: (* list) = new_node; jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: for (p = (* list); p != NULL && lo - 1 > p -> hi; p = p -> next) jpayne@69: prev = p; jpayne@69: jpayne@69: if (p == NULL || hi + 1 < p -> lo) jpayne@69: { // insert between or on end jpayne@69: assert (prev != NULL); jpayne@69: new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t)); jpayne@69: new_node -> lo = lo; jpayne@69: new_node -> hi = hi; jpayne@69: new_node -> next = p; jpayne@69: prev -> next = new_node; jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: if (lo < p -> lo) jpayne@69: p -> lo = lo; jpayne@69: while (p -> next != NULL && hi + 1 >= p -> next -> lo) jpayne@69: { jpayne@69: Cover_t * save; jpayne@69: jpayne@69: p -> hi = p -> next -> hi; jpayne@69: save = p -> next; jpayne@69: p -> next = save -> next; jpayne@69: free (save); jpayne@69: } jpayne@69: if (hi > p -> hi) jpayne@69: p -> hi = hi; jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Binomial_Bound jpayne@69: (int e, double p, int Start, double Limit) jpayne@69: jpayne@69: // Return the smallest n >= Start s.t. jpayne@69: // prob [>= e errors in n binomial trials (p = error prob)] jpayne@69: // > Limit jpayne@69: jpayne@69: { jpayne@69: double Normal_Z, Mu_Power, Factorial, Poisson_Coeff; jpayne@69: double q, Sum, P_Power, Q_Power, X; jpayne@69: int k, n, Bin_Coeff, Ct; jpayne@69: jpayne@69: q = 1.0 - p; jpayne@69: if (Start < e) jpayne@69: Start = e; jpayne@69: jpayne@69: for (n = Start; n < MAX_FRAG_LEN; n ++) jpayne@69: { jpayne@69: if (n <= 35) jpayne@69: { jpayne@69: Sum = 0.0; jpayne@69: Bin_Coeff = 1; jpayne@69: Ct = 0; jpayne@69: P_Power = 1.0; jpayne@69: Q_Power = pow (q, (double) n); jpayne@69: jpayne@69: for (k = 0; k < e && 1.0 - Sum > Limit; k ++) jpayne@69: { jpayne@69: X = Bin_Coeff * P_Power * Q_Power; jpayne@69: Sum += X; jpayne@69: Bin_Coeff *= n - Ct; jpayne@69: Bin_Coeff /= ++ Ct; jpayne@69: P_Power *= p; jpayne@69: Q_Power /= q; jpayne@69: } jpayne@69: if (1.0 - Sum > Limit) jpayne@69: return n; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: Normal_Z = (e - 0.5 - n * p) / sqrt ((double)(n * p * q)); jpayne@69: if (Normal_Z <= NORMAL_DISTRIB_THOLD) jpayne@69: return n; jpayne@69: Sum = 0.0; jpayne@69: Mu_Power = 1.0; jpayne@69: Factorial = 1.0; jpayne@69: Poisson_Coeff = exp (- (double) n * p); jpayne@69: for (k = 0; k < e; k ++) jpayne@69: { jpayne@69: Sum += Mu_Power * Poisson_Coeff / Factorial; jpayne@69: Mu_Power *= n * p; jpayne@69: Factorial *= k + 1; jpayne@69: } jpayne@69: if (1.0 - Sum > Limit) jpayne@69: return n; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: return MAX_FRAG_LEN; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: #define DISPLAY_WIDTH 60 jpayne@69: jpayne@69: void Display_Alignment jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct) jpayne@69: jpayne@69: // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] jpayne@69: // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] . jpayne@69: jpayne@69: { jpayne@69: int i, j, k, m, top_len, bottom_len; jpayne@69: char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; jpayne@69: char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; jpayne@69: jpayne@69: i = j = top_len = bottom_len = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: top [top_len ++] = a [i ++]; jpayne@69: j ++; jpayne@69: } jpayne@69: if (delta [k] < 0) jpayne@69: { jpayne@69: top [top_len ++] = '-'; jpayne@69: j ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: top [top_len ++] = a [i ++]; jpayne@69: } jpayne@69: } jpayne@69: while (i < a_len && j < b_len) jpayne@69: { jpayne@69: top [top_len ++] = a [i ++]; jpayne@69: j ++; jpayne@69: } jpayne@69: top [top_len] = '\0'; jpayne@69: jpayne@69: jpayne@69: i = j = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: bottom [bottom_len ++] = b [j ++]; jpayne@69: i ++; jpayne@69: } jpayne@69: if (delta [k] > 0) jpayne@69: { jpayne@69: bottom [bottom_len ++] = '-'; jpayne@69: i ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: bottom [bottom_len ++] = b [j ++]; jpayne@69: } jpayne@69: } jpayne@69: while (j < b_len && i < a_len) jpayne@69: { jpayne@69: bottom [bottom_len ++] = b [j ++]; jpayne@69: i ++; jpayne@69: } jpayne@69: bottom [bottom_len] = '\0'; jpayne@69: jpayne@69: jpayne@69: for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) jpayne@69: { jpayne@69: printf ("A: "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) jpayne@69: putchar (top [i + j]); jpayne@69: putchar ('\n'); jpayne@69: printf ("B: "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) jpayne@69: putchar (bottom [i + j]); jpayne@69: putchar ('\n'); jpayne@69: printf (" "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; jpayne@69: j ++) jpayne@69: if (top [i + j] != ' ' && bottom [i + j] != ' ' jpayne@69: && top [i + j] != bottom [i + j]) jpayne@69: putchar ('^'); jpayne@69: else jpayne@69: putchar (' '); jpayne@69: putchar ('\n'); jpayne@69: } jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Display_Alignment_With_Pad jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, jpayne@69: char * left_pad [3], char * right_pad [3], int pad_len) jpayne@69: jpayne@69: // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] jpayne@69: // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] . jpayne@69: // Attach pad strings in left_pad to the beginning of the alignment jpayne@69: // and the strings in right_pad to the end of the alignment. jpayne@69: // pad_len is the width of the pad strings jpayne@69: jpayne@69: { jpayne@69: int i, j, k, m, top_len, bottom_len; jpayne@69: #if 0 jpayne@69: char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; jpayne@69: char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; jpayne@69: #else jpayne@69: string top, bottom; jpayne@69: #endif jpayne@69: jpayne@69: for (k = 0; k < pad_len; k ++) jpayne@69: { jpayne@69: top . push_back (left_pad [0] [k]); jpayne@69: bottom . push_back (left_pad [1] [k]); jpayne@69: } jpayne@69: jpayne@69: i = j = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: top . push_back (a [i ++]); jpayne@69: j ++; jpayne@69: } jpayne@69: if (delta [k] < 0) jpayne@69: { jpayne@69: top . push_back ('-'); jpayne@69: j ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: top . push_back (a [i ++]); jpayne@69: } jpayne@69: } jpayne@69: while (i < a_len && j < b_len) jpayne@69: { jpayne@69: top . push_back (a [i ++]); jpayne@69: j ++; jpayne@69: } jpayne@69: jpayne@69: i = j = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: bottom . push_back (b [j ++]); jpayne@69: i ++; jpayne@69: } jpayne@69: if (delta [k] > 0) jpayne@69: { jpayne@69: bottom . push_back ('-'); jpayne@69: i ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: bottom . push_back (b [j ++]); jpayne@69: } jpayne@69: } jpayne@69: while (j < b_len && i < a_len) jpayne@69: { jpayne@69: bottom . push_back (b [j ++]); jpayne@69: i ++; jpayne@69: } jpayne@69: jpayne@69: for (k = 0; k < pad_len; k ++) jpayne@69: { jpayne@69: top . push_back (right_pad [0] [k]); jpayne@69: bottom . push_back (right_pad [1] [k]); jpayne@69: } jpayne@69: jpayne@69: top_len = top . length (); jpayne@69: bottom_len = bottom . length (); jpayne@69: jpayne@69: if (top_len != bottom_len) jpayne@69: { jpayne@69: fprintf (stderr, "ERROR: Alignment length mismatch top = %d bottom = %d\n", jpayne@69: top_len, bottom_len); jpayne@69: exit (EXIT_FAILURE); jpayne@69: } jpayne@69: jpayne@69: for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) jpayne@69: { jpayne@69: printf ("A: "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) jpayne@69: putchar (top [i + j]); jpayne@69: putchar ('\n'); jpayne@69: printf ("B: "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) jpayne@69: putchar (bottom [i + j]); jpayne@69: putchar ('\n'); jpayne@69: printf (" "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; jpayne@69: j ++) jpayne@69: if (i + j < pad_len) jpayne@69: putchar (left_pad [2] [i + j]); jpayne@69: else if (top_len - i - j <= pad_len) jpayne@69: putchar (right_pad [2] [i + j - (top_len - pad_len)]); jpayne@69: else if (top [i + j] != ' ' && bottom [i + j] != ' ' jpayne@69: && top [i + j] != bottom [i + j]) jpayne@69: putchar ('^'); jpayne@69: else jpayne@69: putchar (' '); jpayne@69: putchar ('\n'); jpayne@69: } jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Display_Difference_Positions jpayne@69: (char * a, int a_start, int a_len, char * b, int b_start, int b_len, jpayne@69: int b_incr, int delta [], int delta_ct, const char * a_tag, jpayne@69: const char * b_tag) jpayne@69: jpayne@69: // Show (to stdout ) the positions indicated in delta [0 .. (delta_ct - 1)] jpayne@69: // between strings a [0 .. (a_len - 1)] and jpayne@69: // b [0 .. (b_len - 1))] . The positions of the starts of the strings jpayne@69: // are at a_start and b_start , respectively, and b_incr indicates jpayne@69: // if the b postions increase or decrease. b_incr must be 1 or -1. jpayne@69: // Use a_tag and b_tag to label the positions. jpayne@69: jpayne@69: { jpayne@69: int i, j, k, m; jpayne@69: jpayne@69: assert (b_incr == 1 || b_incr == -1); jpayne@69: jpayne@69: i = j = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: if (a [i] != b [j]) jpayne@69: printf ("%-15s %-15s %8d %8d %c %c\n", jpayne@69: a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]); jpayne@69: i ++; jpayne@69: j ++; jpayne@69: } jpayne@69: if (delta [k] < 0) jpayne@69: { jpayne@69: printf ("%-15s %-15s %8d %8d %c %c\n", jpayne@69: a_tag, b_tag, a_start + i, b_start + j * b_incr, '-', b [j]); jpayne@69: j ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: if (b_incr > 0) jpayne@69: printf ("%-15s %-15s %8d %8d %c %c\n", jpayne@69: a_tag, b_tag, a_start + i, b_start + j, a [i], '-'); jpayne@69: else jpayne@69: printf ("%-15s %-15s %8d %8d %c %c\n", jpayne@69: a_tag, b_tag, a_start + i, b_start - j + 1, a [i], '-'); jpayne@69: i ++; jpayne@69: } jpayne@69: } jpayne@69: while (i < a_len && j < b_len) jpayne@69: { jpayne@69: if (a [i] != b [j]) jpayne@69: printf ("%-15s %-15s %8d %8d %c %c\n", jpayne@69: a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]); jpayne@69: i ++; jpayne@69: j ++; jpayne@69: } jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Extend_Backward jpayne@69: (long int * ref_lo, long int * query_lo) jpayne@69: jpayne@69: // Do edit distance off end of match beginning at (* ref_lo) and jpayne@69: // (* query_lo) in the reference and query sequences, resp., jpayne@69: // in the reverse direction. jpayne@69: // Return the number of errors in the extension and set (* ref_hi) jpayne@69: // and (* query_hi) to the end of the extension. jpayne@69: jpayne@69: { jpayne@69: int ct = 0, errors, sum = 0; jpayne@69: int a_len, b_len; jpayne@69: int a_end, b_end, match_to_end; jpayne@69: int delta [MAX_ERRORS], delta_len; jpayne@69: jpayne@69: do jpayne@69: { jpayne@69: a_len = (* ref_lo) - 1; jpayne@69: if (a_len > MAX_EXTENSION) jpayne@69: a_len = MAX_EXTENSION; jpayne@69: b_len = (* query_lo) - 1; jpayne@69: if (b_len > MAX_EXTENSION) jpayne@69: b_len = MAX_EXTENSION; jpayne@69: jpayne@69: errors = Rev_Prefix_Edit_Dist jpayne@69: (Ref + (* ref_lo) - 1, a_len, jpayne@69: Query + (* query_lo) - 1, b_len, jpayne@69: MAX_ERRORS - 1, & a_end, & b_end, jpayne@69: & match_to_end, delta, & delta_len, TRUE); jpayne@69: if (Show_Differences) jpayne@69: Rev_Show_Diffs jpayne@69: (Ref + (* ref_lo - 1), a_end, jpayne@69: Query + (* query_lo - 1), b_end, jpayne@69: delta, delta_len, (* ref_lo) - 1, (* query_lo) - 1); jpayne@69: if (Verbose > 0) jpayne@69: { jpayne@69: printf ("revextend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n", jpayne@69: ++ ct, (* ref_lo) - 1, (* query_lo) - 1, a_len, b_len, jpayne@69: errors, match_to_end ? 'T' : 'F', a_end, b_end); jpayne@69: Rev_Display_Alignment jpayne@69: (Ref + (* ref_lo) - 1, a_end, Query + (* query_lo) - 1, b_end, jpayne@69: delta, delta_len); jpayne@69: } jpayne@69: jpayne@69: (* ref_lo) -= a_end; jpayne@69: (* query_lo) -= b_end; jpayne@69: sum += errors; jpayne@69: } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION); jpayne@69: jpayne@69: return sum; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Extend_Forward jpayne@69: (long int * ref_hi, long int * query_hi) jpayne@69: jpayne@69: // Do edit distance off end of match ending at (* ref_hi) and jpayne@69: // (* query_hi) in the reference and query sequences, resp. jpayne@69: // Return the number of errors in the extension and set (* ref_hi) jpayne@69: // and (* query_hi) to the end of the extension. jpayne@69: jpayne@69: { jpayne@69: int ct = 0, errors, sum = 0; jpayne@69: int a_end, b_end, match_to_end; jpayne@69: int a_len, b_len; jpayne@69: int delta [MAX_ERRORS], delta_len; jpayne@69: jpayne@69: do jpayne@69: { jpayne@69: a_len = Ref_Len - (* ref_hi); jpayne@69: if (a_len > MAX_EXTENSION) jpayne@69: a_len = MAX_EXTENSION; jpayne@69: b_len = Query_Len - (* query_hi); jpayne@69: if (b_len > MAX_EXTENSION) jpayne@69: b_len = MAX_EXTENSION; jpayne@69: jpayne@69: errors = Prefix_Edit_Dist jpayne@69: (Ref + (* ref_hi) + 1, a_len, jpayne@69: Query + (* query_hi) + 1, b_len, jpayne@69: MAX_ERRORS - 1, & a_end, & b_end, jpayne@69: & match_to_end, delta, & delta_len, TRUE); jpayne@69: if (Show_Differences) jpayne@69: Show_Diffs (Ref + (* ref_hi) + 1, a_end, jpayne@69: Query + (* query_hi) + 1, b_end, jpayne@69: delta, delta_len, (* ref_hi) + 1, (* query_hi) + 1); jpayne@69: if (Verbose > 0) jpayne@69: { jpayne@69: printf ("extend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n", jpayne@69: ++ ct, (* ref_hi) + 1, (* query_hi) + 1, a_len, b_len, jpayne@69: errors, match_to_end ? 'T' : 'F', a_end, b_end); jpayne@69: Display_Alignment (Ref + (* ref_hi) + 1, a_end, jpayne@69: Query + (* query_hi) + 1, b_end, jpayne@69: delta, delta_len); jpayne@69: } jpayne@69: jpayne@69: (* ref_hi) += a_end; jpayne@69: (* query_hi) += b_end; jpayne@69: sum += errors; jpayne@69: } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION); jpayne@69: jpayne@69: return sum; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: void Initialize_Globals jpayne@69: (void) jpayne@69: jpayne@69: // Initialize global variables used in this program jpayne@69: jpayne@69: { jpayne@69: int i, offset, del; jpayne@69: int e, start; jpayne@69: jpayne@69: offset = 2; jpayne@69: del = 6; jpayne@69: for (i = 0; i < MAX_ERRORS; i ++) jpayne@69: { jpayne@69: Edit_Array [i] = Edit_Space + offset; jpayne@69: offset += del; jpayne@69: del += 2; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: for (i = 0; i <= ERRORS_FOR_FREE; i ++) jpayne@69: Edit_Match_Limit [i] = 0; jpayne@69: jpayne@69: start = 1; jpayne@69: for (e = ERRORS_FOR_FREE + 1; e < MAX_ERRORS; e ++) jpayne@69: { jpayne@69: start = Binomial_Bound (e - ERRORS_FOR_FREE, MAX_ERROR_RATE, jpayne@69: start, EDIT_DIST_PROB_BOUND); jpayne@69: Edit_Match_Limit [e] = start - 1; jpayne@69: assert (Edit_Match_Limit [e] >= Edit_Match_Limit [e - 1]); jpayne@69: } jpayne@69: jpayne@69: for (i = 0; i <= MAX_FRAG_LEN; i ++) jpayne@69: Error_Bound [i] = Max_int (10, 1 + (int) (2 * i * MAX_ERROR_RATE)); jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: FILE * Local_File_Open jpayne@69: (const char * filename, const char * mode) jpayne@69: jpayne@69: /* Open Filename in Mode and return a pointer to its control jpayne@69: * block. If fail, print a message and exit. */ jpayne@69: jpayne@69: { jpayne@69: FILE * fp; jpayne@69: jpayne@69: fp = fopen (filename, mode); jpayne@69: if (fp == NULL) jpayne@69: { jpayne@69: fprintf (stderr, "ERROR %d: Could not open file %s \n", jpayne@69: errno, filename); jpayne@69: perror (strerror (errno)); jpayne@69: exit (EXIT_FAILURE); jpayne@69: } jpayne@69: jpayne@69: return fp; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Max_int jpayne@69: (int a, int b) jpayne@69: jpayne@69: // Return the larger of a and b . jpayne@69: jpayne@69: { jpayne@69: if (a < b) jpayne@69: return b; jpayne@69: jpayne@69: return a; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Min_int jpayne@69: (int a, int b) jpayne@69: jpayne@69: // Return the smaller of a and b . jpayne@69: jpayne@69: { jpayne@69: if (a < b) jpayne@69: return a; jpayne@69: jpayne@69: return b; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Parse_Command_Line jpayne@69: (int argc, char * argv []) jpayne@69: jpayne@69: // Get options and parameters from command line with argc jpayne@69: // arguments in argv [0 .. (argc - 1)] . jpayne@69: jpayne@69: { jpayne@69: int ch, errflg = FALSE; jpayne@69: jpayne@69: optarg = NULL; jpayne@69: jpayne@69: while (! errflg jpayne@69: && ((ch = getopt (argc, argv, "De:nN:q:r:Stv:xW:")) != EOF)) jpayne@69: switch (ch) jpayne@69: { jpayne@69: case 'D' : jpayne@69: Only_Difference_Positions = true; jpayne@69: break; jpayne@69: jpayne@69: case 'e' : jpayne@69: Percent_ID = 1.0 - atof (optarg); jpayne@69: UserScoring = TRUE; jpayne@69: break; jpayne@69: jpayne@69: case 'n' : jpayne@69: Nucleotides_Only = TRUE; jpayne@69: break; jpayne@69: jpayne@69: case 'N' : jpayne@69: Consec_Non_ACGT = strtol (optarg, NULL, 10); jpayne@69: break; jpayne@69: jpayne@69: case 'q' : jpayne@69: Query_Suffix = optarg; jpayne@69: break; jpayne@69: jpayne@69: case 'r' : jpayne@69: Ref_Suffix = optarg; jpayne@69: break; jpayne@69: jpayne@69: case 'S' : jpayne@69: Show_Differences = TRUE; jpayne@69: break; jpayne@69: jpayne@69: case 't' : jpayne@69: Tag_From_Fasta_Line = TRUE; jpayne@69: break; jpayne@69: jpayne@69: case 'v' : jpayne@69: Verbose = strtol (optarg, NULL, 10); jpayne@69: break; jpayne@69: jpayne@69: case 'x' : jpayne@69: Output_Cover_Files = FALSE; jpayne@69: break; jpayne@69: jpayne@69: case 'W' : jpayne@69: Error_File_Name = optarg; jpayne@69: break; jpayne@69: jpayne@69: case '?' : jpayne@69: fprintf (stderr, "Unrecognized option -%c\n", optopt); jpayne@69: jpayne@69: default : jpayne@69: errflg = TRUE; jpayne@69: } jpayne@69: jpayne@69: if (errflg || optind != argc - 3) jpayne@69: { jpayne@69: Usage (argv [0]); jpayne@69: exit (EXIT_FAILURE); jpayne@69: } jpayne@69: jpayne@69: Ref_File_Path = argv [optind ++]; jpayne@69: jpayne@69: Match_File_Path = argv [optind ++]; jpayne@69: jpayne@69: Gaps_File_Path = argv [optind ++]; jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Prefix_Edit_Dist jpayne@69: (char A [], int m, char T [], int n, int Error_Limit, jpayne@69: int * A_End, int * T_End, int * Match_To_End, jpayne@69: int Delta [MAX_ERRORS], int * Delta_Len, int extending) jpayne@69: jpayne@69: // Return the minimum number of changes (inserts, deletes, replacements) jpayne@69: // needed to match string A [0 .. (m-1)] with a prefix of string jpayne@69: // T [0 .. (n-1)] if it's not more than Error_Limit . jpayne@69: // Put delta description of alignment in Delta and set jpayne@69: // (* Delta_Len) to the number of entries there if it's a complete jpayne@69: // match. jpayne@69: // Set A_End and T_End to the rightmost positions where the jpayne@69: // alignment ended in A and T , respectively. jpayne@69: // Set Match_To_End true if the match extended to the end jpayne@69: // of at least one string; otherwise, set it false to indicate jpayne@69: // a branch point. jpayne@69: // extending indicates whether the match is for a fixed region (FALSE) jpayne@69: // or is extending off the end of a match as far as possible (TRUE) jpayne@69: jpayne@69: { jpayne@69: double Score, Max_Score; jpayne@69: int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0; jpayne@69: int Best_d, Best_e, Longest, Row; jpayne@69: int Left, Right; jpayne@69: int d, e, i, j, shorter; jpayne@69: jpayne@69: // assert (m <= n); jpayne@69: Best_d = Best_e = Longest = 0; jpayne@69: (* Delta_Len) = 0; jpayne@69: jpayne@69: if (Consec_Non_ACGT > 0) jpayne@69: { jpayne@69: int ct; jpayne@69: jpayne@69: for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++) jpayne@69: { jpayne@69: switch (A [i]) jpayne@69: { jpayne@69: case 'a' : jpayne@69: case 'c' : jpayne@69: case 'g' : jpayne@69: case 't' : jpayne@69: ct = 0; jpayne@69: break; jpayne@69: default : jpayne@69: ct ++; jpayne@69: } jpayne@69: } jpayne@69: if (ct >= Consec_Non_ACGT) jpayne@69: { jpayne@69: m = i - ct; jpayne@69: extending = TRUE; jpayne@69: } jpayne@69: jpayne@69: for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++) jpayne@69: { jpayne@69: switch (T [i]) jpayne@69: { jpayne@69: case 'a' : jpayne@69: case 'c' : jpayne@69: case 'g' : jpayne@69: case 't' : jpayne@69: ct = 0; jpayne@69: break; jpayne@69: default : jpayne@69: ct ++; jpayne@69: } jpayne@69: } jpayne@69: if (ct >= Consec_Non_ACGT) jpayne@69: { jpayne@69: n = i - ct; jpayne@69: extending = TRUE; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: shorter = Min_int (m, n); jpayne@69: for (Row = 0; Row < shorter && A [Row] == T [Row]; Row ++) jpayne@69: ; jpayne@69: jpayne@69: Edit_Array [0] [0] = Row; jpayne@69: jpayne@69: if (Row == m && Row == n) // Exact match jpayne@69: { jpayne@69: (* Delta_Len) = 0; jpayne@69: (* A_End) = (* T_End) = Row; jpayne@69: (* Match_To_End) = ! extending; jpayne@69: return 0; jpayne@69: } jpayne@69: jpayne@69: Left = Right = 0; jpayne@69: Max_Score = Row * Branch_Pt_Match_Value; jpayne@69: Max_Score_Len = Row; jpayne@69: Max_Score_Best_d = 0; jpayne@69: Max_Score_Best_e = 0; jpayne@69: jpayne@69: for (e = 1; e <= Error_Limit; e ++) jpayne@69: { jpayne@69: int cutoff; jpayne@69: jpayne@69: Left = Max_int (Left - 1, -e); jpayne@69: Right = Min_int (Right + 1, e); jpayne@69: Edit_Array [e - 1] [Left] = -2; jpayne@69: Edit_Array [e - 1] [Left - 1] = -2; jpayne@69: Edit_Array [e - 1] [Right] = -2; jpayne@69: Edit_Array [e - 1] [Right + 1] = -2; jpayne@69: jpayne@69: for (d = Left; d <= Right; d ++) jpayne@69: { jpayne@69: Row = 1 + Edit_Array [e - 1] [d]; jpayne@69: if ((j = Edit_Array [e - 1] [d - 1]) > Row) jpayne@69: Row = j; jpayne@69: if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row) jpayne@69: Row = j; jpayne@69: while (Row < m && Row + d < n jpayne@69: && A [Row] == T [Row + d]) jpayne@69: Row ++; jpayne@69: jpayne@69: Edit_Array [e] [d] = Row; jpayne@69: jpayne@69: if (Row == m && Row + d == n) jpayne@69: { jpayne@69: if (extending) jpayne@69: { jpayne@69: for (j = Left; j <= d; j ++) jpayne@69: if (Edit_Array [e] [j] > Longest) jpayne@69: { jpayne@69: Best_d = j; jpayne@69: Longest = Edit_Array [e] [j]; jpayne@69: } jpayne@69: Score = Longest * Branch_Pt_Match_Value - e; jpayne@69: if (Score < Max_Score) jpayne@69: { jpayne@69: (* A_End) = Max_Score_Len; jpayne@69: (* T_End) = Max_Score_Len + Max_Score_Best_d; jpayne@69: Set_Deltas (Delta, Delta_Len, Max_Score_Len, jpayne@69: Max_Score_Best_d, Max_Score_Best_e); jpayne@69: (* Match_To_End) = FALSE; jpayne@69: return Max_Score_Best_e; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: (* A_End) = Row; // One past last align position jpayne@69: (* T_End) = Row + d; jpayne@69: Set_Deltas (Delta, Delta_Len, Row, d, e); jpayne@69: (* Match_To_End) = ! extending; jpayne@69: return e; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: cutoff = Longest - GIVE_UP_LEN; jpayne@69: while (Left <= Right && Edit_Array [e] [Left] < cutoff) jpayne@69: Left ++; jpayne@69: if (Left > Right) jpayne@69: break; jpayne@69: while (Right > Left && Edit_Array [e] [Right] < cutoff) jpayne@69: Right --; jpayne@69: jpayne@69: assert (Left <= Right); jpayne@69: jpayne@69: for (d = Left; d <= Right; d ++) jpayne@69: if (Edit_Array [e] [d] > Longest) jpayne@69: { jpayne@69: Best_d = d; jpayne@69: Best_e = e; jpayne@69: Longest = Edit_Array [e] [d]; jpayne@69: } jpayne@69: jpayne@69: Score = Longest * Branch_Pt_Match_Value - e; jpayne@69: // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0 jpayne@69: if (Score > Max_Score) jpayne@69: { jpayne@69: Max_Score = Score; jpayne@69: Max_Score_Len = Longest; jpayne@69: Max_Score_Best_d = Best_d; jpayne@69: Max_Score_Best_e = Best_e; jpayne@69: } jpayne@69: jpayne@69: if (Longest - Max_Score_Len >= GIVE_UP_LEN) jpayne@69: break; jpayne@69: } jpayne@69: jpayne@69: (* A_End) = Max_Score_Len; jpayne@69: (* T_End) = Max_Score_Len + Max_Score_Best_d; jpayne@69: Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e); jpayne@69: (* Match_To_End) = FALSE; jpayne@69: jpayne@69: return Max_Score_Best_e; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Read_String jpayne@69: (FILE * fp, char * * T, long int * Size, char header []) jpayne@69: jpayne@69: /* Read next string from fp (assuming FASTA format) into (* T) [1 ..] jpayne@69: * which has Size characters. Allocate extra memory if needed jpayne@69: * and adjust Size accordingly. Return TRUE if successful, FALSE jpayne@69: * otherwise (e.g., EOF). Set header to the contents of the FASTA jpayne@69: * header line. */ jpayne@69: jpayne@69: { jpayne@69: char Line [MAX_LINE]; jpayne@69: long int Len; jpayne@69: int Ch, Ct; jpayne@69: jpayne@69: if (feof (fp)) jpayne@69: return FALSE; jpayne@69: jpayne@69: while ((Ch = fgetc (fp)) != EOF && Ch != '>') jpayne@69: ; jpayne@69: jpayne@69: if (Ch != '>') jpayne@69: return FALSE; jpayne@69: jpayne@69: fgets (Line, MAX_LINE, fp); jpayne@69: Len = strlen (Line); jpayne@69: assert (Len > 0 && Line [Len - 1] == '\n'); jpayne@69: Line [Len - 1] = '\0'; jpayne@69: strcpy (header, Line); jpayne@69: jpayne@69: jpayne@69: Ct = 0; jpayne@69: Len = 0; jpayne@69: while ((Ch = fgetc (fp)) != EOF && Ch != '>') jpayne@69: { jpayne@69: if (isspace (Ch)) jpayne@69: continue; jpayne@69: jpayne@69: Ct ++; jpayne@69: jpayne@69: if (Ct >= (* Size) - 1) jpayne@69: { jpayne@69: (* Size) = (long int) ((* Size) * EXPANSION_FACTOR); jpayne@69: (* T) = (char *) Safe_realloc ((* T), (* Size)); jpayne@69: } jpayne@69: Ch = tolower (Ch); jpayne@69: if (! isalpha (Ch)) jpayne@69: { jpayne@69: fprintf (stderr, "Unexpected character `%c\' in string %s\n", jpayne@69: Ch, header); jpayne@69: Ch = 'x'; jpayne@69: } jpayne@69: (* T) [++ Len] = Ch; jpayne@69: } jpayne@69: jpayne@69: (* T) [Len + 1] = '\0'; jpayne@69: if (Ch == '>') jpayne@69: ungetc (Ch, fp); jpayne@69: jpayne@69: return TRUE; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Rev_Complement jpayne@69: (char * s) jpayne@69: jpayne@69: /* Set string s to its DNA reverse complement. */ jpayne@69: jpayne@69: { jpayne@69: char ch; jpayne@69: int i, j, len; jpayne@69: jpayne@69: len = strlen (s); jpayne@69: jpayne@69: for (i = 0, j = len - 1; i < j; i ++, j --) jpayne@69: { jpayne@69: ch = Complement (s [i]); jpayne@69: s [i] = Complement (s [j]); jpayne@69: s [j] = ch; jpayne@69: } jpayne@69: jpayne@69: if (i == j) jpayne@69: s [i] = Complement (s [i]); jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Rev_Display_Alignment jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct) jpayne@69: jpayne@69: // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)] jpayne@69: // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] jpayne@69: // in the reverse direction. jpayne@69: jpayne@69: { jpayne@69: int i, j, k, m, top_len, bottom_len; jpayne@69: char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; jpayne@69: char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1]; jpayne@69: jpayne@69: i = j = top_len = bottom_len = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: top [top_len ++] = a [- i ++]; jpayne@69: j ++; jpayne@69: } jpayne@69: if (delta [k] < 0) jpayne@69: { jpayne@69: top [top_len ++] = '-'; jpayne@69: j ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: top [top_len ++] = a [- i ++]; jpayne@69: } jpayne@69: } jpayne@69: while (i < a_len && j < b_len) jpayne@69: { jpayne@69: top [top_len ++] = a [- i ++]; jpayne@69: j ++; jpayne@69: } jpayne@69: top [top_len] = '\0'; jpayne@69: jpayne@69: jpayne@69: i = j = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: bottom [bottom_len ++] = b [- j ++]; jpayne@69: i ++; jpayne@69: } jpayne@69: if (delta [k] > 0) jpayne@69: { jpayne@69: bottom [bottom_len ++] = '-'; jpayne@69: i ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: bottom [bottom_len ++] = b [- j ++]; jpayne@69: } jpayne@69: } jpayne@69: while (j < b_len && i < a_len) jpayne@69: { jpayne@69: bottom [bottom_len ++] = b [- j ++]; jpayne@69: i ++; jpayne@69: } jpayne@69: bottom [bottom_len] = '\0'; jpayne@69: jpayne@69: jpayne@69: for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH) jpayne@69: { jpayne@69: putchar ('\n'); jpayne@69: printf ("A: "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++) jpayne@69: putchar (top [i + j]); jpayne@69: putchar ('\n'); jpayne@69: printf ("B: "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++) jpayne@69: putchar (bottom [i + j]); jpayne@69: putchar ('\n'); jpayne@69: printf (" "); jpayne@69: for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len; jpayne@69: j ++) jpayne@69: if (top [i + j] != ' ' && bottom [i + j] != ' ' jpayne@69: && top [i + j] != bottom [i + j]) jpayne@69: putchar ('^'); jpayne@69: else jpayne@69: putchar (' '); jpayne@69: putchar ('\n'); jpayne@69: } jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Rev_Prefix_Edit_Dist jpayne@69: (char A [], int m, char T [], int n, int Error_Limit, jpayne@69: int * A_End, int * T_End, int * Match_To_End, jpayne@69: int Delta [MAX_ERRORS], int * Delta_Len, int extending) jpayne@69: jpayne@69: // Return the minimum number of changes (inserts, deletes, replacements) jpayne@69: // needed to match string A [0 .. -(m-1)] with a prefix of string jpayne@69: // T [0 .. -(n-1)] if it's not more than Error_Limit . jpayne@69: // Note: Match is reverse direction, right to left. jpayne@69: // Put delta description of alignment in Delta and set jpayne@69: // (* Delta_Len) to the number of entries there if it's a complete jpayne@69: // match. jpayne@69: // Set A_End and T_End to the rightmost positions where the jpayne@69: // alignment ended in A and T , respectively. jpayne@69: // Set Match_To_End true if the match extended to the end jpayne@69: // of at least one string; otherwise, set it false to indicate jpayne@69: // a branch point. jpayne@69: // extending indicates whether the match is for a fixed region (FALSE) jpayne@69: // or is extending off the end of a match as far as possible (TRUE) jpayne@69: jpayne@69: { jpayne@69: double Score, Max_Score; jpayne@69: int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0; jpayne@69: int Best_d, Best_e, Longest, Row; jpayne@69: int Left, Right; jpayne@69: int d, e, i, j, shorter; jpayne@69: jpayne@69: // assert (m <= n); jpayne@69: Best_d = Best_e = Longest = 0; jpayne@69: (* Delta_Len) = 0; jpayne@69: jpayne@69: if (Consec_Non_ACGT > 0) jpayne@69: { jpayne@69: int ct; jpayne@69: jpayne@69: for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++) jpayne@69: { jpayne@69: switch (A [- i]) jpayne@69: { jpayne@69: case 'a' : jpayne@69: case 'c' : jpayne@69: case 'g' : jpayne@69: case 't' : jpayne@69: ct = 0; jpayne@69: break; jpayne@69: default : jpayne@69: ct ++; jpayne@69: } jpayne@69: } jpayne@69: if (ct >= Consec_Non_ACGT) jpayne@69: { jpayne@69: m = i - ct; jpayne@69: extending = TRUE; jpayne@69: } jpayne@69: jpayne@69: for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++) jpayne@69: { jpayne@69: switch (T [- i]) jpayne@69: { jpayne@69: case 'a' : jpayne@69: case 'c' : jpayne@69: case 'g' : jpayne@69: case 't' : jpayne@69: ct = 0; jpayne@69: break; jpayne@69: default : jpayne@69: ct ++; jpayne@69: } jpayne@69: } jpayne@69: if (ct >= Consec_Non_ACGT) jpayne@69: { jpayne@69: n = i - ct; jpayne@69: extending = TRUE; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: shorter = Min_int (m, n); jpayne@69: for (Row = 0; Row < shorter && A [- Row] == T [- Row]; Row ++) jpayne@69: ; jpayne@69: jpayne@69: Edit_Array [0] [0] = Row; jpayne@69: jpayne@69: if (Row == m && Row == n) // Exact match jpayne@69: { jpayne@69: (* Delta_Len) = 0; jpayne@69: (* A_End) = (* T_End) = Row; jpayne@69: (* Match_To_End) = ! extending; jpayne@69: return 0; jpayne@69: } jpayne@69: jpayne@69: Left = Right = 0; jpayne@69: Max_Score = Row * Branch_Pt_Match_Value; jpayne@69: Max_Score_Len = Row; jpayne@69: Max_Score_Best_d = 0; jpayne@69: Max_Score_Best_e = 0; jpayne@69: jpayne@69: for (e = 1; e <= Error_Limit; e ++) jpayne@69: { jpayne@69: int cutoff; jpayne@69: jpayne@69: Left = Max_int (Left - 1, -e); jpayne@69: Right = Min_int (Right + 1, e); jpayne@69: Edit_Array [e - 1] [Left] = -2; jpayne@69: Edit_Array [e - 1] [Left - 1] = -2; jpayne@69: Edit_Array [e - 1] [Right] = -2; jpayne@69: Edit_Array [e - 1] [Right + 1] = -2; jpayne@69: jpayne@69: for (d = Left; d <= Right; d ++) jpayne@69: { jpayne@69: Row = 1 + Edit_Array [e - 1] [d]; jpayne@69: if ((j = Edit_Array [e - 1] [d - 1]) > Row) jpayne@69: Row = j; jpayne@69: if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row) jpayne@69: Row = j; jpayne@69: while (Row < m && Row + d < n jpayne@69: && A [- Row] == T [- Row - d]) jpayne@69: Row ++; jpayne@69: jpayne@69: Edit_Array [e] [d] = Row; jpayne@69: jpayne@69: if (Row == m && Row + d == n) jpayne@69: { jpayne@69: if (extending) jpayne@69: { jpayne@69: for (j = Left; j <= d; j ++) jpayne@69: if (Edit_Array [e] [j] > Longest) jpayne@69: { jpayne@69: Best_d = j; jpayne@69: Longest = Edit_Array [e] [j]; jpayne@69: } jpayne@69: Score = Longest * Branch_Pt_Match_Value - e; jpayne@69: if (Score < Max_Score) jpayne@69: { jpayne@69: (* A_End) = Max_Score_Len; jpayne@69: (* T_End) = Max_Score_Len + Max_Score_Best_d; jpayne@69: Set_Deltas (Delta, Delta_Len, Max_Score_Len, jpayne@69: Max_Score_Best_d, Max_Score_Best_e); jpayne@69: (* Match_To_End) = FALSE; jpayne@69: return Max_Score_Best_e; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: (* A_End) = Row; // One past last align position jpayne@69: (* T_End) = Row + d; jpayne@69: Set_Deltas (Delta, Delta_Len, Row, d, e); jpayne@69: (* Match_To_End) = ! extending; jpayne@69: return e; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: cutoff = Longest - GIVE_UP_LEN; jpayne@69: while (Left <= Right && Edit_Array [e] [Left] < cutoff) jpayne@69: Left ++; jpayne@69: if (Left > Right) jpayne@69: break; jpayne@69: while (Right > Left && Edit_Array [e] [Right] < cutoff) jpayne@69: Right --; jpayne@69: assert (Left <= Right); jpayne@69: jpayne@69: for (d = Left; d <= Right; d ++) jpayne@69: if (Edit_Array [e] [d] > Longest) jpayne@69: { jpayne@69: Best_d = d; jpayne@69: Best_e = e; jpayne@69: Longest = Edit_Array [e] [d]; jpayne@69: } jpayne@69: jpayne@69: Score = Longest * Branch_Pt_Match_Value - e; jpayne@69: // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0 jpayne@69: if (Score > Max_Score) jpayne@69: { jpayne@69: Max_Score = Score; jpayne@69: Max_Score_Len = Longest; jpayne@69: Max_Score_Best_d = Best_d; jpayne@69: Max_Score_Best_e = Best_e; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: if (Longest - Max_Score_Len >= GIVE_UP_LEN) jpayne@69: break; jpayne@69: } jpayne@69: jpayne@69: (* A_End) = Max_Score_Len; jpayne@69: (* T_End) = Max_Score_Len + Max_Score_Best_d; jpayne@69: Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e); jpayne@69: (* Match_To_End) = FALSE; jpayne@69: jpayne@69: return Max_Score_Best_e; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Rev_Show_Diffs jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, jpayne@69: long int a_ref, long int b_ref) jpayne@69: jpayne@69: // Show (to stdout ) the differences jpayne@69: // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] jpayne@69: // encoded in delta [0 .. (delta_ct - 1)] jpayne@69: // in the reverse direction. a_ref is the position of the beginning jpayne@69: // of string a . b_ref is the offset to the start of jpayne@69: // string b . jpayne@69: jpayne@69: { jpayne@69: int i, j, k, m; jpayne@69: jpayne@69: i = j = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: if (a [- i] != b [- j]) jpayne@69: printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]); jpayne@69: i ++; jpayne@69: j ++; jpayne@69: } jpayne@69: if (delta [k] < 0) jpayne@69: { jpayne@69: printf ("%8ld %8ld: ins %c\n", a_ref - i, b_ref - j, b [- j]); jpayne@69: j ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: printf ("%8ld %8ld: del %c\n", a_ref - i, b_ref - j, a [- i]); jpayne@69: i ++; jpayne@69: } jpayne@69: } jpayne@69: while (i < a_len && j < b_len) jpayne@69: { jpayne@69: if (a [- i] != b [- j]) jpayne@69: printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]); jpayne@69: i ++; jpayne@69: j ++; jpayne@69: } jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Set_Deltas jpayne@69: (int delta [], int * delta_len, int row, int d, int e) jpayne@69: jpayne@69: // Set delta to values that show the alignment recorded in jpayne@69: // global Edit_Array . Set (* delta_len) to the number of jpayne@69: // entries set. row is the length of the match in the first jpayne@69: // string. d is the offset at the end of the match between jpayne@69: // the first and second string. e is the number of errors. jpayne@69: jpayne@69: { jpayne@69: int delta_stack [MAX_ERRORS]; jpayne@69: int from, last, max; jpayne@69: int i, j, k; jpayne@69: jpayne@69: last = row; jpayne@69: (* delta_len) = 0; jpayne@69: jpayne@69: for (k = e; k > 0; k --) jpayne@69: { jpayne@69: from = d; jpayne@69: max = 1 + Edit_Array [k - 1] [d]; jpayne@69: if ((j = Edit_Array [k - 1] [d - 1]) > max) jpayne@69: { jpayne@69: from = d - 1; jpayne@69: max = j; jpayne@69: } jpayne@69: if ((j = 1 + Edit_Array [k - 1] [d + 1]) > max) jpayne@69: { jpayne@69: from = d + 1; jpayne@69: max = j; jpayne@69: } jpayne@69: if (from == d - 1) jpayne@69: { jpayne@69: delta_stack [(* delta_len) ++] = max - last - 1; jpayne@69: d --; jpayne@69: last = Edit_Array [k - 1] [from]; jpayne@69: } jpayne@69: else if (from == d + 1) jpayne@69: { jpayne@69: delta_stack [(* delta_len) ++] = last - (max - 1); jpayne@69: d ++; jpayne@69: last = Edit_Array [k - 1] [from]; jpayne@69: } jpayne@69: } jpayne@69: delta_stack [(* delta_len) ++] = last + 1; jpayne@69: jpayne@69: k = 0; jpayne@69: for (i = (* delta_len) - 1; i > 0; i --) jpayne@69: delta [k ++] jpayne@69: = abs (delta_stack [i]) * Sign (delta_stack [i - 1]); jpayne@69: (* delta_len) --; jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: static void Set_Left_Pad jpayne@69: (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len) jpayne@69: jpayne@69: // Set pad to a triple of strings, each of length pad_len , jpayne@69: // representing a MUM of length mum_len ending at r_hi in Ref jpayne@69: // and q_hi in Query . Each string in pad must have been allocated jpayne@69: // at least pad_len + 1 bytes of memory. The purpose is to serve as jpayne@69: // the left-end padding for an alignment with this MUM as its left jpayne@69: // boundary. jpayne@69: jpayne@69: { jpayne@69: int i; jpayne@69: jpayne@69: for (i = 0; i < 3; i ++) jpayne@69: pad [i] [pad_len] = '\0'; jpayne@69: jpayne@69: for (i = 0; i < pad_len; i ++) jpayne@69: { jpayne@69: pad [0] [pad_len - i - 1] = r_hi - i > 0 ? Ref [r_hi - i] : '.'; jpayne@69: pad [1] [pad_len - i - 1] = q_hi - i > 0 ? Query [q_hi - i] : '.'; jpayne@69: pad [2] [pad_len - i - 1] = i < mum_len ? '=' : ' '; jpayne@69: } jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: static void Set_Right_Pad jpayne@69: (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len) jpayne@69: jpayne@69: // Set pad to a triple of strings, each of length pad_len , jpayne@69: // representing a MUM of length mum_len beginning at r_lo in Ref jpayne@69: // and q_lo in Query . Each string in pad must have been allocated jpayne@69: // at least pad_len + 1 bytes of memory. The purpose is to serve as jpayne@69: // the right-end padding for an alignment with this MUM as its right jpayne@69: // boundary. jpayne@69: jpayne@69: { jpayne@69: int i; jpayne@69: jpayne@69: for (i = 0; i < 3; i ++) jpayne@69: pad [i] [pad_len] = '\0'; jpayne@69: jpayne@69: for (i = 0; i < pad_len; i ++) jpayne@69: { jpayne@69: pad [0] [i] = r_lo + i <= Ref_Len ? Ref [r_lo + i] : '.'; jpayne@69: pad [1] [i] = q_lo + i <= Query_Len ? Query [q_lo + i] : '.'; jpayne@69: pad [2] [i] = i < mum_len ? '=' : ' '; jpayne@69: } jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Show_Coverage jpayne@69: (Cover_t * list, char * filename, char * tag, char * seq) jpayne@69: jpayne@69: // Print to stderr summary stats of regions in list . jpayne@69: // Send to filename a list of region info suitable for celagram . jpayne@69: // Use tag to print the headings. Regions refer to seq . jpayne@69: jpayne@69: { jpayne@69: FILE * fp; jpayne@69: Cover_t * p; jpayne@69: int ct = 0; jpayne@69: long int i, prev_hi = 0, n_ct, len, seq_len; jpayne@69: long int cov_ns = 0, gap_ns = 0, total_ns; jpayne@69: double cov_total = 0.0, gap_total = 0.0; jpayne@69: jpayne@69: fp = Local_File_Open (filename, "w"); jpayne@69: fprintf (fp, "%-9s %9s %9s %8s %8s %6s\n", jpayne@69: "Region", "Start", "End", "Len", "N's", "%N's"); jpayne@69: jpayne@69: for (p = list; p != NULL; p = p -> next) jpayne@69: { jpayne@69: n_ct = 0; jpayne@69: for (i = prev_hi + 1; i < p -> lo; i ++) jpayne@69: if (strchr ("acgt", seq [i]) == NULL) jpayne@69: n_ct ++; jpayne@69: len = p -> lo - prev_hi - 1; jpayne@69: fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", jpayne@69: ct, prev_hi + 1, p -> lo - 1, len, n_ct, jpayne@69: len == 0 ? 0.0 : 100.0 * n_ct / len); jpayne@69: gap_total += len; jpayne@69: gap_ns += n_ct; jpayne@69: jpayne@69: ct ++; jpayne@69: jpayne@69: n_ct = 0; jpayne@69: for (i = p -> lo; i <= p -> hi; i ++) jpayne@69: if (strchr ("acgt", seq [i]) == NULL) jpayne@69: n_ct ++; jpayne@69: len = 1 + p -> hi - p -> lo; jpayne@69: fprintf (fp, "cov%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", jpayne@69: ct, p -> lo, p -> hi, len, n_ct, jpayne@69: len == 0 ? 0.0 : 100.0 * n_ct / len); jpayne@69: cov_total += len; jpayne@69: cov_ns += n_ct; jpayne@69: jpayne@69: prev_hi = p -> hi; jpayne@69: } jpayne@69: jpayne@69: n_ct = 0; jpayne@69: seq_len = strlen (seq + 1); jpayne@69: for (i = prev_hi + 1; i <= seq_len; i ++) jpayne@69: if (strchr ("acgt", seq [i]) == NULL) jpayne@69: n_ct ++; jpayne@69: len = seq_len - prev_hi; jpayne@69: fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n", jpayne@69: ct, prev_hi + 1, seq_len, len, n_ct, jpayne@69: len == 0 ? 0.0 : 100.0 * n_ct / len); jpayne@69: gap_total += len; jpayne@69: gap_ns += n_ct; jpayne@69: jpayne@69: total_ns = cov_ns + gap_ns; jpayne@69: jpayne@69: fclose (fp); jpayne@69: jpayne@69: fprintf (stderr, "\n%s Sequence Coverage:\n", tag); jpayne@69: fprintf (stderr, " Sequence length = %ld\n", seq_len); jpayne@69: fprintf (stderr, " acgt's = %ld\n", seq_len - total_ns); jpayne@69: fprintf (stderr, " Non acgt's = %ld\n", total_ns); jpayne@69: fprintf (stderr, " Number of regions = %d\n", ct); jpayne@69: fprintf (stderr, " Matched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n", jpayne@69: cov_total, jpayne@69: seq_len == 0.0 ? 0.0 : 100.0 * cov_total / seq_len, jpayne@69: seq_len - total_ns == 0.0 ? 0.0 : jpayne@69: 100.0 * (cov_total - cov_ns) / (seq_len - total_ns)); jpayne@69: fprintf (stderr, " Avg match len = %.0f\n", jpayne@69: ct == 0 ? 0.0 : cov_total / ct); jpayne@69: fprintf (stderr, " N's in matches = %ld (%.1f%%)\n", jpayne@69: cov_ns, jpayne@69: cov_total == 0.0 ? 0.0 : 100.0 * cov_ns / cov_total); jpayne@69: fprintf (stderr, " Unmatched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n", jpayne@69: gap_total, jpayne@69: seq_len == 0.0 ? 0.0 : 100.0 * gap_total / seq_len, jpayne@69: seq_len - total_ns == 0.0 ? 0.0 : jpayne@69: 100.0 * (gap_total - gap_ns) / (seq_len - total_ns)); jpayne@69: fprintf (stderr, " Avg gap len = %.0f\n", jpayne@69: gap_total / (1.0 + ct)); jpayne@69: fprintf (stderr, " N's in gaps = %ld (%.1f%%)\n", jpayne@69: gap_ns, jpayne@69: gap_total == 0.0 ? 0.0 : 100.0 * gap_ns / gap_total); jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Show_Diffs jpayne@69: (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct, jpayne@69: long int a_ref, long int b_ref) jpayne@69: jpayne@69: // Show (to stdout ) the differences jpayne@69: // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] jpayne@69: // encoded in delta [0 .. (delta_ct - 1)] . a_ref is the offset to jpayne@69: // the start of string a . b_ref is the offset to the start of jpayne@69: // string b . jpayne@69: jpayne@69: { jpayne@69: int i, j, k, m; jpayne@69: jpayne@69: i = j = 0; jpayne@69: for (k = 0; k < delta_ct; k ++) jpayne@69: { jpayne@69: for (m = 1; m < abs (delta [k]); m ++) jpayne@69: { jpayne@69: if (a [i] != b [j]) jpayne@69: printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]); jpayne@69: i ++; jpayne@69: j ++; jpayne@69: } jpayne@69: if (delta [k] < 0) jpayne@69: { jpayne@69: printf ("%8ld %8ld: ins %c\n", a_ref + i - 1, b_ref + j, b [j]); jpayne@69: j ++; jpayne@69: } jpayne@69: else jpayne@69: { jpayne@69: printf ("%8ld %8ld: del %c\n", a_ref + i, b_ref + j - 1, a [i]); jpayne@69: i ++; jpayne@69: } jpayne@69: } jpayne@69: while (i < a_len && j < b_len) jpayne@69: { jpayne@69: if (a [i] != b [j]) jpayne@69: printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]); jpayne@69: i ++; jpayne@69: j ++; jpayne@69: } jpayne@69: jpayne@69: return; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: int Sign jpayne@69: (int a) jpayne@69: jpayne@69: // Return the algebraic sign of a . jpayne@69: jpayne@69: { jpayne@69: if (a > 0) jpayne@69: return 1; jpayne@69: else if (a < 0) jpayne@69: return -1; jpayne@69: jpayne@69: return 0; jpayne@69: } jpayne@69: jpayne@69: jpayne@69: jpayne@69: void Usage jpayne@69: (char * command) jpayne@69: jpayne@69: // Print to stderr description of options and command line for jpayne@69: // this program. command is the command that was used to jpayne@69: // invoke it. jpayne@69: jpayne@69: { jpayne@69: fprintf (stderr, jpayne@69: "USAGE: %s \n" jpayne@69: "\n" jpayne@69: "Combines MUMs in by extending matches off\n" jpayne@69: "ends and between MUMs. is a fasta file\n" jpayne@69: "of the reference sequence. is a\n" jpayne@69: "multi-fasta file of the sequences matched against the\n" jpayne@69: "reference\n" jpayne@69: "\n" jpayne@69: "Options:\n" jpayne@69: "-D Only output to stdout the difference positions\n" jpayne@69: " and characters\n" jpayne@69: "-n Allow matches only between nucleotides, i.e., ACGTs\n" jpayne@69: "-N num Break matches at or more consecutive non-ACGTs \n" jpayne@69: "-q tag Used to label query match\n" jpayne@69: "-r tag Used to label reference match\n" jpayne@69: "-S Output all differences in strings\n" jpayne@69: "-t Label query matches with query fasta header\n" jpayne@69: "-v num Set verbose level for extra output\n" jpayne@69: "-W file Reset the default output filename witherrors.gaps\n" jpayne@69: "-x Don't output .cover files\n" jpayne@69: "-e Set error-rate cutoff to e (e.g. 0.02 is two percent)\n", jpayne@69: jpayne@69: command); jpayne@69: jpayne@69: return; jpayne@69: }