jpayne@69
|
1 //------------------------------------------------------------------------------
|
jpayne@69
|
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
|
jpayne@69
|
3 // File: postnuc.cc
|
jpayne@69
|
4 // Date: 07 / 16 / 2002
|
jpayne@69
|
5 //
|
jpayne@69
|
6 // Revision: 08 / 01 / 2002
|
jpayne@69
|
7 // Added MUM extension functionality
|
jpayne@69
|
8 //
|
jpayne@69
|
9 // Purpose: To translate the coordinates referencing the concatenated
|
jpayne@69
|
10 // reference sequences back to the individual sequences and deal
|
jpayne@69
|
11 // with any conflict that may have arisen (i.e. a MUM that spans
|
jpayne@69
|
12 // the boundry between two sequences). Then to extend each cluster
|
jpayne@69
|
13 // via Smith-Waterman techniques to expand the alignment coverage.
|
jpayne@69
|
14 // Alignments which encounter each other will be fused to form one
|
jpayne@69
|
15 // encompasing alignment when appropriate.
|
jpayne@69
|
16 //
|
jpayne@69
|
17 // Input: Input is the output of the .mgaps program from stdin. On the
|
jpayne@69
|
18 // command line, the file names of the two original sequence files
|
jpayne@69
|
19 // should come first, followed by the prefix <pfx> that should be
|
jpayne@69
|
20 // placed in front of the two output filenames <pfx>.cluster and
|
jpayne@69
|
21 // <pfx>.delta
|
jpayne@69
|
22 //
|
jpayne@69
|
23 // NOTE: Cluster file is now suppressed by default (see -d option).
|
jpayne@69
|
24 //
|
jpayne@69
|
25 // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta.
|
jpayne@69
|
26 // <pfx>.cluster lists MUM clusters as identified by "mgaps".
|
jpayne@69
|
27 // However, the clusters now reference their corresponding
|
jpayne@69
|
28 // sequence and are all listed under headers that specify this
|
jpayne@69
|
29 // sequence. The <pfx>.delta file is the alignment object that
|
jpayne@69
|
30 // contains all the information necessary to reproduce the alignments
|
jpayne@69
|
31 // generated by the MUM extension process. Please refer to the
|
jpayne@69
|
32 // output file documentation for an in-depth description of these
|
jpayne@69
|
33 // file formats.
|
jpayne@69
|
34 //
|
jpayne@69
|
35 // Usage: postnuc <reference> <query> <pfx> < <input>
|
jpayne@69
|
36 // Where <reference> and <query> are the original input sequences of
|
jpayne@69
|
37 // NUCmer and <pfx> is the prefix that should be added to the
|
jpayne@69
|
38 // beginning of the <pfx>.cluster and <pfx>.delta output filenames.
|
jpayne@69
|
39 //
|
jpayne@69
|
40 //------------------------------------------------------------------------------
|
jpayne@69
|
41
|
jpayne@69
|
42 //-- NOTE: this option will significantly hamper program performance,
|
jpayne@69
|
43 // mostly the alignment extension performance (sw_align.h)
|
jpayne@69
|
44 //#define _DEBUG_ASSERT // self testing assert functions
|
jpayne@69
|
45
|
jpayne@69
|
46 #include "tigrinc.hh"
|
jpayne@69
|
47 #include "sw_align.hh"
|
jpayne@69
|
48 #include <vector>
|
jpayne@69
|
49 #include <algorithm>
|
jpayne@69
|
50 using namespace std;
|
jpayne@69
|
51
|
jpayne@69
|
52
|
jpayne@69
|
53 //------------------------------------------------------ Globals -------------//
|
jpayne@69
|
54 const signed char FORWARD_CHAR = 1;
|
jpayne@69
|
55 const signed char REVERSE_CHAR = -1;
|
jpayne@69
|
56
|
jpayne@69
|
57
|
jpayne@69
|
58 bool DO_DELTA = true;
|
jpayne@69
|
59 bool DO_EXTEND = true;
|
jpayne@69
|
60 bool TO_SEQEND = false;
|
jpayne@69
|
61 bool DO_SHADOWS = false;
|
jpayne@69
|
62
|
jpayne@69
|
63
|
jpayne@69
|
64 //------------------------------------------------------ Type Definitions ----//
|
jpayne@69
|
65 enum LineType
|
jpayne@69
|
66 //-- The type of input line from <stdin>
|
jpayne@69
|
67 {
|
jpayne@69
|
68 NO_LINE, HEADER_LINE, MATCH_LINE
|
jpayne@69
|
69 };
|
jpayne@69
|
70
|
jpayne@69
|
71
|
jpayne@69
|
72 struct FastaRecord
|
jpayne@69
|
73 //-- The essential data of a sequence
|
jpayne@69
|
74 {
|
jpayne@69
|
75 char * Id; // the fasta ID header tag
|
jpayne@69
|
76 long int len; // the length of the sequence
|
jpayne@69
|
77 char * seq; // the sequence data
|
jpayne@69
|
78 };
|
jpayne@69
|
79
|
jpayne@69
|
80
|
jpayne@69
|
81 struct Match
|
jpayne@69
|
82 //-- An exact match between two sequences A and B
|
jpayne@69
|
83 {
|
jpayne@69
|
84 long int sA, sB, len; // start coordinate in A, in B and the length
|
jpayne@69
|
85 };
|
jpayne@69
|
86
|
jpayne@69
|
87
|
jpayne@69
|
88 struct Cluster
|
jpayne@69
|
89 //-- An ordered list of matches between two sequences A and B
|
jpayne@69
|
90 {
|
jpayne@69
|
91 bool wasFused; // have the cluster matches been fused yet?
|
jpayne@69
|
92 signed char dirB; // the query sequence direction
|
jpayne@69
|
93 // FORWARD_CHAR or REVERSE_CHAR
|
jpayne@69
|
94 vector<Match> matches; // the ordered set of matches in the cluster
|
jpayne@69
|
95 };
|
jpayne@69
|
96
|
jpayne@69
|
97
|
jpayne@69
|
98 struct Synteny
|
jpayne@69
|
99 //-- An ordered list of clusters between two sequences A and B
|
jpayne@69
|
100 {
|
jpayne@69
|
101 FastaRecord * AfP; // a pointer to the reference sequence record
|
jpayne@69
|
102 FastaRecord Bf; // the query sequence record (w/o the sequence)
|
jpayne@69
|
103 vector<Cluster> clusters; // the ordered set of clusters between A and B
|
jpayne@69
|
104 };
|
jpayne@69
|
105
|
jpayne@69
|
106
|
jpayne@69
|
107 struct Alignment
|
jpayne@69
|
108 //-- An alignment object between two sequences A and B
|
jpayne@69
|
109 {
|
jpayne@69
|
110 signed char dirB; // the query sequence direction
|
jpayne@69
|
111 long int sA, sB, eA, eB; // the start in A, B and the end in A, B
|
jpayne@69
|
112 vector<long int> delta; // the delta values, with NO zero at the end
|
jpayne@69
|
113 long int deltaApos; // sum of abs(deltas) - #of negative deltas
|
jpayne@69
|
114 // trust me, it is a very helpful value
|
jpayne@69
|
115 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
|
jpayne@69
|
116 };
|
jpayne@69
|
117
|
jpayne@69
|
118
|
jpayne@69
|
119 struct AscendingClusterSort
|
jpayne@69
|
120 //-- For sorting clusters in ascending order of their sA coordinate
|
jpayne@69
|
121 {
|
jpayne@69
|
122 bool operator() (const Cluster & pA, const Cluster & pB)
|
jpayne@69
|
123 {
|
jpayne@69
|
124 return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA );
|
jpayne@69
|
125 }
|
jpayne@69
|
126 };
|
jpayne@69
|
127
|
jpayne@69
|
128
|
jpayne@69
|
129
|
jpayne@69
|
130
|
jpayne@69
|
131 //------------------------------------------------- Function Declarations ----//
|
jpayne@69
|
132 void addNewAlignment
|
jpayne@69
|
133 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
|
jpayne@69
|
134 vector<Match>::iterator Mp);
|
jpayne@69
|
135
|
jpayne@69
|
136 bool extendBackward
|
jpayne@69
|
137 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
|
jpayne@69
|
138 vector<Alignment>::iterator TargetAp, const char * A, const char * B);
|
jpayne@69
|
139
|
jpayne@69
|
140 bool extendForward
|
jpayne@69
|
141 (vector<Alignment>::iterator Ap, const char * A, long int targetA,
|
jpayne@69
|
142 const char * B, long int targetB, unsigned int m_o);
|
jpayne@69
|
143
|
jpayne@69
|
144 void extendClusters
|
jpayne@69
|
145 (vector<Cluster> & Clusters,
|
jpayne@69
|
146 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile);
|
jpayne@69
|
147
|
jpayne@69
|
148 void flushAlignments
|
jpayne@69
|
149 (vector<Alignment> & Alignments,
|
jpayne@69
|
150 const FastaRecord * Af, const FastaRecord * Bf,
|
jpayne@69
|
151 FILE * DeltaFile);
|
jpayne@69
|
152
|
jpayne@69
|
153 void flushSyntenys
|
jpayne@69
|
154 (vector<Synteny> & Syntenys, FILE * ClusterFile);
|
jpayne@69
|
155
|
jpayne@69
|
156 vector<Cluster>::iterator getForwardTargetCluster
|
jpayne@69
|
157 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
|
jpayne@69
|
158 long int & targetA, long int & targetB);
|
jpayne@69
|
159
|
jpayne@69
|
160 vector<Alignment>::iterator getReverseTargetAlignment
|
jpayne@69
|
161 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp);
|
jpayne@69
|
162
|
jpayne@69
|
163 bool isShadowedCluster
|
jpayne@69
|
164 (vector<Cluster>::iterator CurrCp,
|
jpayne@69
|
165 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap);
|
jpayne@69
|
166
|
jpayne@69
|
167 void parseAbort
|
jpayne@69
|
168 (const char *);
|
jpayne@69
|
169
|
jpayne@69
|
170 void parseDelta
|
jpayne@69
|
171 (vector<Alignment> & Alignments,
|
jpayne@69
|
172 const FastaRecord * Af, const FastaRecord *Bf);
|
jpayne@69
|
173
|
jpayne@69
|
174 void processSyntenys
|
jpayne@69
|
175 (vector<Synteny> & Syntenys,
|
jpayne@69
|
176 FastaRecord * Af, long int As,
|
jpayne@69
|
177 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile);
|
jpayne@69
|
178
|
jpayne@69
|
179 inline long int revC
|
jpayne@69
|
180 (long int Coord, long int Len);
|
jpayne@69
|
181
|
jpayne@69
|
182 void printHelp
|
jpayne@69
|
183 (const char *);
|
jpayne@69
|
184
|
jpayne@69
|
185 void printUsage
|
jpayne@69
|
186 (const char *);
|
jpayne@69
|
187
|
jpayne@69
|
188 void validateData
|
jpayne@69
|
189 (vector<Alignment> Alignments, vector<Cluster> Clusters,
|
jpayne@69
|
190 const FastaRecord * Af, const FastaRecord * Bf);
|
jpayne@69
|
191
|
jpayne@69
|
192
|
jpayne@69
|
193
|
jpayne@69
|
194
|
jpayne@69
|
195 //-------------------------------------------------- Function Definitions ----//
|
jpayne@69
|
196 int main
|
jpayne@69
|
197 (int argc, char ** argv)
|
jpayne@69
|
198 {
|
jpayne@69
|
199 FastaRecord * Af; // array of all the reference sequences
|
jpayne@69
|
200
|
jpayne@69
|
201 vector<Synteny> Syntenys; // vector of all sets of clusters
|
jpayne@69
|
202 vector<Synteny>::reverse_iterator CurrSp; // current set of clusters
|
jpayne@69
|
203 vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer
|
jpayne@69
|
204
|
jpayne@69
|
205 Synteny Asyn; // a single set of clusters
|
jpayne@69
|
206 Cluster Aclu; // a single cluster of matches
|
jpayne@69
|
207 Match Amat; // a single match
|
jpayne@69
|
208
|
jpayne@69
|
209 LineType PrevLine; // the current input line
|
jpayne@69
|
210
|
jpayne@69
|
211 bool Found; // temporary search flag
|
jpayne@69
|
212
|
jpayne@69
|
213 char Line [MAX_LINE]; // a single line of input
|
jpayne@69
|
214 char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers
|
jpayne@69
|
215 char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names
|
jpayne@69
|
216 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
|
jpayne@69
|
217
|
jpayne@69
|
218 signed char DirB = FORWARD_CHAR; // the current query strand direction
|
jpayne@69
|
219
|
jpayne@69
|
220 long int i; // temporary counter
|
jpayne@69
|
221 long int Seqi; // current reference sequence index
|
jpayne@69
|
222 long int InitSize; // initial sequence memory space
|
jpayne@69
|
223 long int As = 0; // the size of the reference array
|
jpayne@69
|
224 long int Ac = 100; // the capacity of the reference array
|
jpayne@69
|
225 long int sA, sB, len; // current match start in A, B and length
|
jpayne@69
|
226
|
jpayne@69
|
227 FILE * RefFile, * QryFile; // reference and query input files
|
jpayne@69
|
228 FILE * ClusterFile, * DeltaFile; // cluster and delta output files
|
jpayne@69
|
229
|
jpayne@69
|
230 //-- Set the alignment data type and break length (sw_align.h)
|
jpayne@69
|
231 setMatrixType ( NUCLEOTIDE );
|
jpayne@69
|
232 setBreakLen ( 200 );
|
jpayne@69
|
233 setBanding ( 0 );
|
jpayne@69
|
234
|
jpayne@69
|
235 //-- Parse the command line arguments
|
jpayne@69
|
236 {
|
jpayne@69
|
237 optarg = NULL;
|
jpayne@69
|
238 int ch, errflg = 0;
|
jpayne@69
|
239 while ( !errflg && ((ch = getopt (argc, argv, "dehB:b:st")) != EOF) )
|
jpayne@69
|
240 switch (ch)
|
jpayne@69
|
241 {
|
jpayne@69
|
242 case 'b' :
|
jpayne@69
|
243 setBreakLen( atoi (optarg) );
|
jpayne@69
|
244 break;
|
jpayne@69
|
245
|
jpayne@69
|
246 case 'B' :
|
jpayne@69
|
247 setBanding( atoi (optarg) );
|
jpayne@69
|
248 break;
|
jpayne@69
|
249
|
jpayne@69
|
250 case 'd' :
|
jpayne@69
|
251 DO_DELTA = false;
|
jpayne@69
|
252 break;
|
jpayne@69
|
253
|
jpayne@69
|
254 case 'e' :
|
jpayne@69
|
255 DO_EXTEND = false;
|
jpayne@69
|
256 break;
|
jpayne@69
|
257
|
jpayne@69
|
258 case 'h' :
|
jpayne@69
|
259 printHelp (argv[0]);
|
jpayne@69
|
260 exit (EXIT_SUCCESS);
|
jpayne@69
|
261 break;
|
jpayne@69
|
262
|
jpayne@69
|
263 case 's' :
|
jpayne@69
|
264 DO_SHADOWS = true;
|
jpayne@69
|
265 break;
|
jpayne@69
|
266
|
jpayne@69
|
267 case 't' :
|
jpayne@69
|
268 TO_SEQEND = true;
|
jpayne@69
|
269 break;
|
jpayne@69
|
270
|
jpayne@69
|
271 default :
|
jpayne@69
|
272 errflg ++;
|
jpayne@69
|
273 }
|
jpayne@69
|
274
|
jpayne@69
|
275 if ( errflg > 0 || argc - optind != 3 )
|
jpayne@69
|
276 {
|
jpayne@69
|
277 printUsage (argv[0]);
|
jpayne@69
|
278 exit (EXIT_FAILURE);
|
jpayne@69
|
279 }
|
jpayne@69
|
280 }
|
jpayne@69
|
281
|
jpayne@69
|
282 //-- Read and create the I/O file names
|
jpayne@69
|
283 strcpy (RefFileName, argv[optind ++]);
|
jpayne@69
|
284 strcpy (QryFileName, argv[optind ++]);
|
jpayne@69
|
285 strcpy (ClusterFileName, argv[optind ++]);
|
jpayne@69
|
286 strcpy (DeltaFileName, ClusterFileName);
|
jpayne@69
|
287 strcat (ClusterFileName, ".cluster");
|
jpayne@69
|
288 strcat (DeltaFileName, ".delta");
|
jpayne@69
|
289
|
jpayne@69
|
290 //-- Open all the files
|
jpayne@69
|
291 RefFile = File_Open (RefFileName, "r");
|
jpayne@69
|
292 QryFile = File_Open (QryFileName, "r");
|
jpayne@69
|
293 if ( DO_DELTA )
|
jpayne@69
|
294 {
|
jpayne@69
|
295 ClusterFile = NULL;
|
jpayne@69
|
296 DeltaFile = File_Open (DeltaFileName, "w");
|
jpayne@69
|
297 }
|
jpayne@69
|
298 else
|
jpayne@69
|
299 {
|
jpayne@69
|
300 ClusterFile = File_Open (ClusterFileName, "w");
|
jpayne@69
|
301 DeltaFile = NULL;
|
jpayne@69
|
302 }
|
jpayne@69
|
303
|
jpayne@69
|
304 //-- Print the headers of the output files
|
jpayne@69
|
305 if ( DO_DELTA )
|
jpayne@69
|
306 fprintf (DeltaFile, "%s %s\nNUCMER\n", RefFileName, QryFileName);
|
jpayne@69
|
307 else
|
jpayne@69
|
308 fprintf (ClusterFile, "%s %s\nNUCMER\n", RefFileName, QryFileName);
|
jpayne@69
|
309
|
jpayne@69
|
310 //-- Generate the array of the reference sequences
|
jpayne@69
|
311 InitSize = INIT_SIZE;
|
jpayne@69
|
312 Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac );
|
jpayne@69
|
313 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
|
jpayne@69
|
314 while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) )
|
jpayne@69
|
315 {
|
jpayne@69
|
316 Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1));
|
jpayne@69
|
317 strcpy (Af[As].Id, IdA);
|
jpayne@69
|
318
|
jpayne@69
|
319 Af[As].len = strlen (Af[As].seq + 1);
|
jpayne@69
|
320
|
jpayne@69
|
321 if ( ++ As >= Ac )
|
jpayne@69
|
322 {
|
jpayne@69
|
323 Ac *= 2;
|
jpayne@69
|
324 Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac );
|
jpayne@69
|
325 }
|
jpayne@69
|
326
|
jpayne@69
|
327 InitSize = INIT_SIZE;
|
jpayne@69
|
328 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
|
jpayne@69
|
329 }
|
jpayne@69
|
330 fclose (RefFile);
|
jpayne@69
|
331 if ( As <= 0 )
|
jpayne@69
|
332 parseAbort ( RefFileName );
|
jpayne@69
|
333
|
jpayne@69
|
334
|
jpayne@69
|
335
|
jpayne@69
|
336 //-- Process the input from <stdin> line by line
|
jpayne@69
|
337 PrevLine = NO_LINE;
|
jpayne@69
|
338 IdA[0] = '\0';
|
jpayne@69
|
339 IdB[0] = '\0';
|
jpayne@69
|
340 CurrIdB[0] = '\0';
|
jpayne@69
|
341 while ( fgets(Line, MAX_LINE, stdin) != NULL )
|
jpayne@69
|
342 {
|
jpayne@69
|
343
|
jpayne@69
|
344 //-- If the current line is a fasta HEADER_LINE
|
jpayne@69
|
345 if ( Line[0] == '>' )
|
jpayne@69
|
346 {
|
jpayne@69
|
347 if ( sscanf (Line + 1, "%s", CurrIdB) != 1 )
|
jpayne@69
|
348 parseAbort ("stdin");
|
jpayne@69
|
349 DirB = strstr (Line," Reverse") == NULL ? FORWARD_CHAR : REVERSE_CHAR;
|
jpayne@69
|
350
|
jpayne@69
|
351 PrevLine = HEADER_LINE;
|
jpayne@69
|
352 }
|
jpayne@69
|
353
|
jpayne@69
|
354
|
jpayne@69
|
355 //-- If the current line is a cluster HEADER_LINE
|
jpayne@69
|
356 else if ( Line[0] == '#' )
|
jpayne@69
|
357 {
|
jpayne@69
|
358 PrevLine = HEADER_LINE;
|
jpayne@69
|
359 }
|
jpayne@69
|
360
|
jpayne@69
|
361
|
jpayne@69
|
362 //-- If the current line is a MATCH_LINE
|
jpayne@69
|
363 else
|
jpayne@69
|
364 {
|
jpayne@69
|
365 if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 )
|
jpayne@69
|
366 parseAbort ("stdin");
|
jpayne@69
|
367
|
jpayne@69
|
368 //-- Re-map the reference coordinate back to its original sequence
|
jpayne@69
|
369 for ( Seqi = 0; sA > Af[Seqi].len; Seqi ++ )
|
jpayne@69
|
370 sA -= Af[Seqi].len + 1; // extra +1 for the x at the end of each seq
|
jpayne@69
|
371 if ( Seqi >= As )
|
jpayne@69
|
372 {
|
jpayne@69
|
373 fprintf (stderr,
|
jpayne@69
|
374 "ERROR: A MUM was found with a start coordinate greater than\n"
|
jpayne@69
|
375 " the sequence length, a serious error has occured.\n"
|
jpayne@69
|
376 " Please file a bug report\n");
|
jpayne@69
|
377 exit (EXIT_FAILURE);
|
jpayne@69
|
378 }
|
jpayne@69
|
379
|
jpayne@69
|
380 //-- If the match spans across a sequence boundry
|
jpayne@69
|
381 if ( sA + len - 1 > Af[Seqi].len || sA <= 0)
|
jpayne@69
|
382 {
|
jpayne@69
|
383 fprintf (stderr,
|
jpayne@69
|
384 "WARNING: A MUM was found extending beyond the boundry of:\n"
|
jpayne@69
|
385 " Reference sequence '>%s'\n\n"
|
jpayne@69
|
386 "Please check that the '-n' option is activated on 'mummer2'\n"
|
jpayne@69
|
387 "and try again, or file a bug report\n"
|
jpayne@69
|
388 "Attempting to continue.\n", Af[Seqi].Id);
|
jpayne@69
|
389 continue;
|
jpayne@69
|
390 }
|
jpayne@69
|
391
|
jpayne@69
|
392 //-- Check and update the current synteny region
|
jpayne@69
|
393 if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 )
|
jpayne@69
|
394 {
|
jpayne@69
|
395 Found = false;
|
jpayne@69
|
396 if ( strcmp (IdB, CurrIdB) == 0 )
|
jpayne@69
|
397 {
|
jpayne@69
|
398 //-- Has this header been seen before?
|
jpayne@69
|
399 for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ )
|
jpayne@69
|
400 {
|
jpayne@69
|
401 if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 )
|
jpayne@69
|
402 {
|
jpayne@69
|
403 if ( Sp->AfP->len != Af[Seqi].len )
|
jpayne@69
|
404 {
|
jpayne@69
|
405 fprintf (stderr,
|
jpayne@69
|
406 "ERROR: The reference file may contain"
|
jpayne@69
|
407 " sequences with non-unique\n"
|
jpayne@69
|
408 " header Ids, please check your input"
|
jpayne@69
|
409 " files and try again\n");
|
jpayne@69
|
410 exit (EXIT_FAILURE);
|
jpayne@69
|
411 }
|
jpayne@69
|
412 assert ( strcmp (Sp->Bf.Id, IdB) == 0 );
|
jpayne@69
|
413 CurrSp = Sp;
|
jpayne@69
|
414 Found = true;
|
jpayne@69
|
415 break;
|
jpayne@69
|
416 }
|
jpayne@69
|
417 }
|
jpayne@69
|
418 }
|
jpayne@69
|
419 else
|
jpayne@69
|
420 {
|
jpayne@69
|
421 //-- New B sequence header, process all the old synteny's
|
jpayne@69
|
422 processSyntenys (Syntenys, Af, As,
|
jpayne@69
|
423 QryFile, ClusterFile, DeltaFile);
|
jpayne@69
|
424 }
|
jpayne@69
|
425
|
jpayne@69
|
426 strcpy (IdA, Af[Seqi].Id);
|
jpayne@69
|
427 strcpy (IdB, CurrIdB);
|
jpayne@69
|
428
|
jpayne@69
|
429 //-- If not seen yet, create a new synteny region
|
jpayne@69
|
430 if ( ! Found )
|
jpayne@69
|
431 {
|
jpayne@69
|
432 Asyn.AfP = Af + Seqi;
|
jpayne@69
|
433 Asyn.Bf.len = -1;
|
jpayne@69
|
434 Asyn.Bf.Id = (char *) Safe_malloc
|
jpayne@69
|
435 (sizeof(char) * (strlen(IdB) + 1));
|
jpayne@69
|
436 strcpy (Asyn.Bf.Id, IdB);
|
jpayne@69
|
437
|
jpayne@69
|
438 Syntenys.push_back (Asyn);
|
jpayne@69
|
439 CurrSp = Syntenys.rbegin( );
|
jpayne@69
|
440 }
|
jpayne@69
|
441
|
jpayne@69
|
442 //-- Add a new cluster to the current synteny
|
jpayne@69
|
443 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
|
jpayne@69
|
444 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
|
jpayne@69
|
445 CurrSp->clusters.pop_back( ); // hack to remove empties
|
jpayne@69
|
446 Aclu.wasFused = false;
|
jpayne@69
|
447 Aclu.dirB = DirB;
|
jpayne@69
|
448 CurrSp->clusters.push_back (Aclu);
|
jpayne@69
|
449 }
|
jpayne@69
|
450 else if ( PrevLine == HEADER_LINE )
|
jpayne@69
|
451 {
|
jpayne@69
|
452 //-- Add a new cluster to the current synteny
|
jpayne@69
|
453 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
|
jpayne@69
|
454 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
|
jpayne@69
|
455 CurrSp->clusters.pop_back( );
|
jpayne@69
|
456 Aclu.wasFused = false;
|
jpayne@69
|
457 Aclu.dirB = DirB;
|
jpayne@69
|
458 CurrSp->clusters.push_back (Aclu);
|
jpayne@69
|
459 }
|
jpayne@69
|
460
|
jpayne@69
|
461 //-- Add a new match to the current cluster
|
jpayne@69
|
462 if ( len > 1 )
|
jpayne@69
|
463 {
|
jpayne@69
|
464 Amat.sA = sA;
|
jpayne@69
|
465 Amat.sB = sB;
|
jpayne@69
|
466 Amat.len = len;
|
jpayne@69
|
467 CurrSp->clusters.rbegin( )->matches.push_back (Amat);
|
jpayne@69
|
468 }
|
jpayne@69
|
469
|
jpayne@69
|
470 PrevLine = MATCH_LINE;
|
jpayne@69
|
471 }
|
jpayne@69
|
472 }
|
jpayne@69
|
473
|
jpayne@69
|
474 //-- Process the left-over syntenys
|
jpayne@69
|
475 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
|
jpayne@69
|
476 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
|
jpayne@69
|
477 CurrSp->clusters.pop_back( );
|
jpayne@69
|
478
|
jpayne@69
|
479 processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile);
|
jpayne@69
|
480 fclose (QryFile);
|
jpayne@69
|
481
|
jpayne@69
|
482 //-- Free the reference sequences
|
jpayne@69
|
483 for ( i = 0; i < As; i ++ )
|
jpayne@69
|
484 {
|
jpayne@69
|
485 free (Af[i].Id);
|
jpayne@69
|
486 free (Af[i].seq);
|
jpayne@69
|
487 }
|
jpayne@69
|
488 free (Af);
|
jpayne@69
|
489
|
jpayne@69
|
490 return EXIT_SUCCESS;
|
jpayne@69
|
491 }
|
jpayne@69
|
492
|
jpayne@69
|
493
|
jpayne@69
|
494
|
jpayne@69
|
495
|
jpayne@69
|
496 void addNewAlignment
|
jpayne@69
|
497 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
|
jpayne@69
|
498 vector<Match>::iterator Mp)
|
jpayne@69
|
499
|
jpayne@69
|
500 // Create and add a new alignment object based on the current match
|
jpayne@69
|
501 // and cluster information pointed to by Cp and Mp.
|
jpayne@69
|
502
|
jpayne@69
|
503 {
|
jpayne@69
|
504 Alignment Align;
|
jpayne@69
|
505
|
jpayne@69
|
506 //-- Initialize the data
|
jpayne@69
|
507 Align.sA = Mp->sA;
|
jpayne@69
|
508 Align.sB = Mp->sB;
|
jpayne@69
|
509 Align.eA = Mp->sA + Mp->len - 1;
|
jpayne@69
|
510 Align.eB = Mp->sB + Mp->len - 1;
|
jpayne@69
|
511 Align.dirB = Cp->dirB;
|
jpayne@69
|
512 Align.delta.clear( );
|
jpayne@69
|
513 Align.deltaApos = 0;
|
jpayne@69
|
514
|
jpayne@69
|
515 //-- Add the alignment object
|
jpayne@69
|
516 Alignments.push_back (Align);
|
jpayne@69
|
517
|
jpayne@69
|
518 return;
|
jpayne@69
|
519 }
|
jpayne@69
|
520
|
jpayne@69
|
521
|
jpayne@69
|
522
|
jpayne@69
|
523
|
jpayne@69
|
524 bool extendBackward
|
jpayne@69
|
525 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
|
jpayne@69
|
526 vector<Alignment>::iterator TargetAp, const char * A, const char * B)
|
jpayne@69
|
527
|
jpayne@69
|
528 // Extend an alignment backwards off of the current alignment object.
|
jpayne@69
|
529 // The current alignment object must be freshly created and consist
|
jpayne@69
|
530 // only of an exact match (i.e. the delta vector MUST be empty).
|
jpayne@69
|
531 // If the TargetAp alignment object is reached by the extension, it will
|
jpayne@69
|
532 // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is
|
jpayne@69
|
533 // NULL the function will extend as far as possible. It is a strange
|
jpayne@69
|
534 // and dangerous function because it can delete CurrAp, so edit with
|
jpayne@69
|
535 // caution. Returns true if TargetAp was reached and merged, else false
|
jpayne@69
|
536 // Designed only as a subroutine for extendClusters, should be used
|
jpayne@69
|
537 // nowhere else.
|
jpayne@69
|
538
|
jpayne@69
|
539 {
|
jpayne@69
|
540 bool target_reached = false;
|
jpayne@69
|
541 bool overflow_flag = false;
|
jpayne@69
|
542 bool double_flag = false;
|
jpayne@69
|
543
|
jpayne@69
|
544 vector<long int>::iterator Dp;
|
jpayne@69
|
545
|
jpayne@69
|
546 unsigned int m_o;
|
jpayne@69
|
547 long int targetA, targetB;
|
jpayne@69
|
548
|
jpayne@69
|
549 m_o = BACKWARD_SEARCH;
|
jpayne@69
|
550
|
jpayne@69
|
551 //-- Set the target coordinates
|
jpayne@69
|
552 if ( TargetAp != Alignments.end( ) )
|
jpayne@69
|
553 {
|
jpayne@69
|
554 targetA = TargetAp->eA;
|
jpayne@69
|
555 targetB = TargetAp->eB;
|
jpayne@69
|
556 }
|
jpayne@69
|
557 else
|
jpayne@69
|
558 {
|
jpayne@69
|
559 targetA = 1;
|
jpayne@69
|
560 targetB = 1;
|
jpayne@69
|
561 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
562 }
|
jpayne@69
|
563
|
jpayne@69
|
564 //-- If alignment is too long, bring with bounds and set overflow_flag true
|
jpayne@69
|
565 if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH )
|
jpayne@69
|
566 {
|
jpayne@69
|
567 targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1;
|
jpayne@69
|
568 overflow_flag = true;
|
jpayne@69
|
569 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
570 }
|
jpayne@69
|
571 if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH )
|
jpayne@69
|
572 {
|
jpayne@69
|
573 targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1;
|
jpayne@69
|
574 if ( overflow_flag )
|
jpayne@69
|
575 double_flag = true;
|
jpayne@69
|
576 else
|
jpayne@69
|
577 overflow_flag = true;
|
jpayne@69
|
578 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
579 }
|
jpayne@69
|
580
|
jpayne@69
|
581
|
jpayne@69
|
582 if ( TO_SEQEND && !double_flag )
|
jpayne@69
|
583 m_o |= SEQEND_BIT;
|
jpayne@69
|
584
|
jpayne@69
|
585
|
jpayne@69
|
586 //-- Attempt to reach the target
|
jpayne@69
|
587 target_reached = alignSearch (A, CurrAp->sA, targetA,
|
jpayne@69
|
588 B, CurrAp->sB, targetB, m_o);
|
jpayne@69
|
589
|
jpayne@69
|
590 if ( overflow_flag || TargetAp == Alignments.end( ) )
|
jpayne@69
|
591 target_reached = false;
|
jpayne@69
|
592
|
jpayne@69
|
593 if ( target_reached )
|
jpayne@69
|
594 {
|
jpayne@69
|
595 //-- Merge the two alignment objects
|
jpayne@69
|
596 extendForward (TargetAp, A, CurrAp->sA,
|
jpayne@69
|
597 B, CurrAp->sB, FORCED_FORWARD_ALIGN);
|
jpayne@69
|
598 TargetAp->eA = CurrAp->eA;
|
jpayne@69
|
599 TargetAp->eB = CurrAp->eB;
|
jpayne@69
|
600 Alignments.pop_back( );
|
jpayne@69
|
601 }
|
jpayne@69
|
602 else
|
jpayne@69
|
603 {
|
jpayne@69
|
604 alignTarget (A, targetA, CurrAp->sA,
|
jpayne@69
|
605 B, targetB, CurrAp->sB,
|
jpayne@69
|
606 CurrAp->delta, FORCED_FORWARD_ALIGN);
|
jpayne@69
|
607 CurrAp->sA = targetA;
|
jpayne@69
|
608 CurrAp->sB = targetB;
|
jpayne@69
|
609
|
jpayne@69
|
610 //-- Update the deltaApos value for the alignment object
|
jpayne@69
|
611 for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ )
|
jpayne@69
|
612 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
|
jpayne@69
|
613 }
|
jpayne@69
|
614
|
jpayne@69
|
615 return target_reached;
|
jpayne@69
|
616 }
|
jpayne@69
|
617
|
jpayne@69
|
618
|
jpayne@69
|
619
|
jpayne@69
|
620
|
jpayne@69
|
621 bool extendForward
|
jpayne@69
|
622 (vector<Alignment>::iterator CurrAp, const char * A, long int targetA,
|
jpayne@69
|
623 const char * B, long int targetB, unsigned int m_o)
|
jpayne@69
|
624
|
jpayne@69
|
625 // Extend an alignment forwards off the current alignment object until
|
jpayne@69
|
626 // target or end of sequence is reached, and merge the delta values of the
|
jpayne@69
|
627 // alignment object with the new delta values generated by the extension.
|
jpayne@69
|
628 // Return true if the target was reached, else false
|
jpayne@69
|
629
|
jpayne@69
|
630 {
|
jpayne@69
|
631 long int ValA;
|
jpayne@69
|
632 long int ValB;
|
jpayne@69
|
633 unsigned int Di;
|
jpayne@69
|
634 bool target_reached;
|
jpayne@69
|
635 bool overflow_flag = false;
|
jpayne@69
|
636 bool double_flag = false;
|
jpayne@69
|
637 vector<long int>::iterator Dp;
|
jpayne@69
|
638
|
jpayne@69
|
639 //-- Set Di to the end of the delta vector
|
jpayne@69
|
640 Di = CurrAp->delta.size( );
|
jpayne@69
|
641
|
jpayne@69
|
642 //-- Calculate the distance between the start and end positions
|
jpayne@69
|
643 ValA = targetA - CurrAp->eA + 1;
|
jpayne@69
|
644 ValB = targetB - CurrAp->eB + 1;
|
jpayne@69
|
645
|
jpayne@69
|
646 //-- If the distance is too long, shrink it and set the overflow_flag
|
jpayne@69
|
647 if ( ValA > MAX_ALIGNMENT_LENGTH )
|
jpayne@69
|
648 {
|
jpayne@69
|
649 targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1;
|
jpayne@69
|
650 overflow_flag = true;
|
jpayne@69
|
651 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
652 }
|
jpayne@69
|
653 if ( ValB > MAX_ALIGNMENT_LENGTH )
|
jpayne@69
|
654 {
|
jpayne@69
|
655 targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1;
|
jpayne@69
|
656 if ( overflow_flag )
|
jpayne@69
|
657 double_flag = true;
|
jpayne@69
|
658 else
|
jpayne@69
|
659 overflow_flag = true;
|
jpayne@69
|
660 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
661 }
|
jpayne@69
|
662
|
jpayne@69
|
663 if ( double_flag )
|
jpayne@69
|
664 m_o &= ~SEQEND_BIT;
|
jpayne@69
|
665
|
jpayne@69
|
666 target_reached = alignTarget (A, CurrAp->eA, targetA,
|
jpayne@69
|
667 B, CurrAp->eB, targetB,
|
jpayne@69
|
668 CurrAp->delta, m_o);
|
jpayne@69
|
669
|
jpayne@69
|
670 //-- Notify user if alignment was chopped short
|
jpayne@69
|
671 if ( target_reached && overflow_flag )
|
jpayne@69
|
672 target_reached = false;
|
jpayne@69
|
673
|
jpayne@69
|
674 //-- Pick up delta where left off (Di) and merge with new deltas
|
jpayne@69
|
675 if ( Di < CurrAp->delta.size( ) )
|
jpayne@69
|
676 {
|
jpayne@69
|
677 //-- Merge the deltas
|
jpayne@69
|
678 ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1;
|
jpayne@69
|
679 CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA);
|
jpayne@69
|
680 if ( CurrAp->delta[Di] == 0 || ValA < 0 )
|
jpayne@69
|
681 {
|
jpayne@69
|
682 fprintf(stderr,
|
jpayne@69
|
683 "ERROR: failed to merge alignments at position %ld\n"
|
jpayne@69
|
684 " Please file a bug report\n",
|
jpayne@69
|
685 CurrAp->eA);
|
jpayne@69
|
686 exit (EXIT_FAILURE);
|
jpayne@69
|
687 }
|
jpayne@69
|
688
|
jpayne@69
|
689 //-- Update the deltaApos
|
jpayne@69
|
690 for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ )
|
jpayne@69
|
691 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
|
jpayne@69
|
692 }
|
jpayne@69
|
693
|
jpayne@69
|
694 //-- Set the alignment coordinates
|
jpayne@69
|
695 CurrAp->eA = targetA;
|
jpayne@69
|
696 CurrAp->eB = targetB;
|
jpayne@69
|
697
|
jpayne@69
|
698 return target_reached;
|
jpayne@69
|
699 }
|
jpayne@69
|
700
|
jpayne@69
|
701
|
jpayne@69
|
702
|
jpayne@69
|
703
|
jpayne@69
|
704 void extendClusters
|
jpayne@69
|
705 (vector<Cluster> & Clusters,
|
jpayne@69
|
706 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile)
|
jpayne@69
|
707
|
jpayne@69
|
708 // Connect all the matches in every cluster between sequences A and B.
|
jpayne@69
|
709 // Also, extend alignments off of the front and back of each cluster to
|
jpayne@69
|
710 // expand total alignment coverage. When these extensions encounter an
|
jpayne@69
|
711 // adjacent cluster, fuse the two regions to create one single
|
jpayne@69
|
712 // encompassing region. This routine will create alignment objects from
|
jpayne@69
|
713 // these extensions and output the resulting delta information to the
|
jpayne@69
|
714 // delta output file.
|
jpayne@69
|
715
|
jpayne@69
|
716 {
|
jpayne@69
|
717 //-- Sort the clusters (ascending) by their start coordinate in sequence A
|
jpayne@69
|
718 sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( ));
|
jpayne@69
|
719
|
jpayne@69
|
720
|
jpayne@69
|
721 //-- If no delta file is requested
|
jpayne@69
|
722 if ( ! DO_DELTA )
|
jpayne@69
|
723 return;
|
jpayne@69
|
724
|
jpayne@69
|
725
|
jpayne@69
|
726 bool target_reached = false; // reached the adjacent match or cluster
|
jpayne@69
|
727
|
jpayne@69
|
728 char * A, * B; // the sequences A and B
|
jpayne@69
|
729 char * Brev = NULL; // the reverse complement of B
|
jpayne@69
|
730
|
jpayne@69
|
731 unsigned int m_o;
|
jpayne@69
|
732 long int targetA, targetB; // alignment extension targets in A and B
|
jpayne@69
|
733
|
jpayne@69
|
734 vector<Match>::iterator Mp; // match pointer
|
jpayne@69
|
735
|
jpayne@69
|
736 vector<Cluster>::iterator PrevCp; // where the extensions last left off
|
jpayne@69
|
737 vector<Cluster>::iterator CurrCp; // the current cluster being extended
|
jpayne@69
|
738 vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster
|
jpayne@69
|
739
|
jpayne@69
|
740 vector<Alignment> Alignments; // the vector of alignment objects
|
jpayne@69
|
741 vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align
|
jpayne@69
|
742 vector<Alignment>::iterator TargetAp; // target align
|
jpayne@69
|
743
|
jpayne@69
|
744
|
jpayne@69
|
745 //-- Extend each cluster
|
jpayne@69
|
746 A = Af->seq;
|
jpayne@69
|
747 PrevCp = Clusters.begin( );
|
jpayne@69
|
748 CurrCp = Clusters.begin( );
|
jpayne@69
|
749 while ( CurrCp < Clusters.end( ) )
|
jpayne@69
|
750 {
|
jpayne@69
|
751 if ( DO_EXTEND )
|
jpayne@69
|
752 {
|
jpayne@69
|
753 //-- Ignore if shadowed or already extended
|
jpayne@69
|
754 if ( ! target_reached )
|
jpayne@69
|
755 if ( CurrCp->wasFused
|
jpayne@69
|
756 ||
|
jpayne@69
|
757 (!DO_SHADOWS &&
|
jpayne@69
|
758 isShadowedCluster (CurrCp, Alignments, CurrAp)) )
|
jpayne@69
|
759 {
|
jpayne@69
|
760 CurrCp->wasFused = true;
|
jpayne@69
|
761 CurrCp = ++ PrevCp;
|
jpayne@69
|
762 continue;
|
jpayne@69
|
763 }
|
jpayne@69
|
764 }
|
jpayne@69
|
765
|
jpayne@69
|
766 //-- Pick the right directional sequence for B
|
jpayne@69
|
767 if ( CurrCp->dirB == FORWARD_CHAR )
|
jpayne@69
|
768 B = Bf->seq;
|
jpayne@69
|
769 else if ( Brev != NULL )
|
jpayne@69
|
770 B = Brev;
|
jpayne@69
|
771 else
|
jpayne@69
|
772 {
|
jpayne@69
|
773 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
|
jpayne@69
|
774 strcpy ( Brev + 1, Bf->seq + 1 );
|
jpayne@69
|
775 Brev[0] = '\0';
|
jpayne@69
|
776 Reverse_Complement (Brev, 1, Bf->len);
|
jpayne@69
|
777 B = Brev;
|
jpayne@69
|
778 }
|
jpayne@69
|
779
|
jpayne@69
|
780 //-- Extend each match in the cluster
|
jpayne@69
|
781 for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ )
|
jpayne@69
|
782 {
|
jpayne@69
|
783 //-- Try to extend the current match backwards
|
jpayne@69
|
784 if ( target_reached )
|
jpayne@69
|
785 {
|
jpayne@69
|
786 //-- Merge with the previous match
|
jpayne@69
|
787 if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB )
|
jpayne@69
|
788 {
|
jpayne@69
|
789 if ( Mp >= CurrCp->matches.end( ) - 1 )
|
jpayne@69
|
790 {
|
jpayne@69
|
791 fprintf (stderr,
|
jpayne@69
|
792 "ERROR: Target match does not exist, please\n"
|
jpayne@69
|
793 " file a bug report\n");
|
jpayne@69
|
794 exit (EXIT_FAILURE);
|
jpayne@69
|
795 }
|
jpayne@69
|
796 continue;
|
jpayne@69
|
797 }
|
jpayne@69
|
798 CurrAp->eA += Mp->len - 1;
|
jpayne@69
|
799 CurrAp->eB += Mp->len - 1;
|
jpayne@69
|
800 }
|
jpayne@69
|
801 else
|
jpayne@69
|
802 {
|
jpayne@69
|
803 //-- Create a new alignment object
|
jpayne@69
|
804 addNewAlignment (Alignments, CurrCp, Mp);
|
jpayne@69
|
805 CurrAp = Alignments.end( ) - 1;
|
jpayne@69
|
806
|
jpayne@69
|
807 if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) )
|
jpayne@69
|
808 {
|
jpayne@69
|
809 //-- Target the closest/best alignment object
|
jpayne@69
|
810 TargetAp = getReverseTargetAlignment (Alignments, CurrAp);
|
jpayne@69
|
811
|
jpayne@69
|
812 //-- Extend the new alignment object backwards
|
jpayne@69
|
813 if ( extendBackward (Alignments, CurrAp, TargetAp, A, B) )
|
jpayne@69
|
814 CurrAp = TargetAp;
|
jpayne@69
|
815 }
|
jpayne@69
|
816 }
|
jpayne@69
|
817
|
jpayne@69
|
818 m_o = FORWARD_ALIGN;
|
jpayne@69
|
819
|
jpayne@69
|
820 //-- Try to extend the current match forwards
|
jpayne@69
|
821 if ( Mp < CurrCp->matches.end( ) - 1 )
|
jpayne@69
|
822 {
|
jpayne@69
|
823 //-- Target the next match in the cluster
|
jpayne@69
|
824 targetA = (Mp + 1)->sA;
|
jpayne@69
|
825 targetB = (Mp + 1)->sB;
|
jpayne@69
|
826
|
jpayne@69
|
827 //-- Extend the current alignment object forward
|
jpayne@69
|
828 target_reached = extendForward (CurrAp, A, targetA,
|
jpayne@69
|
829 B, targetB, m_o);
|
jpayne@69
|
830 }
|
jpayne@69
|
831 else if ( DO_EXTEND )
|
jpayne@69
|
832 {
|
jpayne@69
|
833 targetA = Af->len;
|
jpayne@69
|
834 targetB = Bf->len;
|
jpayne@69
|
835
|
jpayne@69
|
836 //-- Target the closest/best match in a future cluster
|
jpayne@69
|
837 TargetCp = getForwardTargetCluster (Clusters, CurrCp,
|
jpayne@69
|
838 targetA, targetB);
|
jpayne@69
|
839 if ( TargetCp == Clusters.end( ) )
|
jpayne@69
|
840 {
|
jpayne@69
|
841 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
842 if ( TO_SEQEND )
|
jpayne@69
|
843 m_o |= SEQEND_BIT;
|
jpayne@69
|
844 }
|
jpayne@69
|
845
|
jpayne@69
|
846 //-- Extend the current alignment object forward
|
jpayne@69
|
847 target_reached = extendForward (CurrAp, A, targetA,
|
jpayne@69
|
848 B, targetB, m_o);
|
jpayne@69
|
849 }
|
jpayne@69
|
850 }
|
jpayne@69
|
851 if ( TargetCp == Clusters.end( ) )
|
jpayne@69
|
852 target_reached = false;
|
jpayne@69
|
853
|
jpayne@69
|
854 CurrCp->wasFused = true;
|
jpayne@69
|
855
|
jpayne@69
|
856 if ( target_reached == false )
|
jpayne@69
|
857 CurrCp = ++ PrevCp;
|
jpayne@69
|
858 else
|
jpayne@69
|
859 CurrCp = TargetCp;
|
jpayne@69
|
860 }
|
jpayne@69
|
861
|
jpayne@69
|
862 #ifdef _DEBUG_ASSERT
|
jpayne@69
|
863 validateData (Alignments, Clusters, Af, Bf);
|
jpayne@69
|
864 #endif
|
jpayne@69
|
865
|
jpayne@69
|
866 //-- Output the alignment data to the delta file
|
jpayne@69
|
867 flushAlignments (Alignments, Af, Bf, DeltaFile);
|
jpayne@69
|
868
|
jpayne@69
|
869 if ( Brev != NULL )
|
jpayne@69
|
870 free (Brev);
|
jpayne@69
|
871
|
jpayne@69
|
872 return;
|
jpayne@69
|
873 }
|
jpayne@69
|
874
|
jpayne@69
|
875
|
jpayne@69
|
876
|
jpayne@69
|
877
|
jpayne@69
|
878 void flushAlignments
|
jpayne@69
|
879 (vector<Alignment> & Alignments,
|
jpayne@69
|
880 const FastaRecord * Af, const FastaRecord * Bf,
|
jpayne@69
|
881 FILE * DeltaFile)
|
jpayne@69
|
882
|
jpayne@69
|
883 // Simply output the delta information stored in Alignments to the
|
jpayne@69
|
884 // given delta file. Free the memory used by Alignments once the
|
jpayne@69
|
885 // data is successfully output to the file.
|
jpayne@69
|
886
|
jpayne@69
|
887 {
|
jpayne@69
|
888 vector<Alignment>::iterator Ap; // alignment pointer
|
jpayne@69
|
889 vector<long int>::iterator Dp; // delta pointer
|
jpayne@69
|
890
|
jpayne@69
|
891 fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len);
|
jpayne@69
|
892
|
jpayne@69
|
893 //-- Generate the error counts
|
jpayne@69
|
894 parseDelta (Alignments, Af, Bf);
|
jpayne@69
|
895
|
jpayne@69
|
896 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
|
jpayne@69
|
897 {
|
jpayne@69
|
898 if ( Ap->dirB == FORWARD_CHAR )
|
jpayne@69
|
899 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
|
jpayne@69
|
900 Ap->sA, Ap->eA,
|
jpayne@69
|
901 Ap->sB, Ap->eB,
|
jpayne@69
|
902 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
|
jpayne@69
|
903 else
|
jpayne@69
|
904 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
|
jpayne@69
|
905 Ap->sA, Ap->eA,
|
jpayne@69
|
906 revC (Ap->sB, Bf->len), revC (Ap->eB, Bf->len),
|
jpayne@69
|
907 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
|
jpayne@69
|
908
|
jpayne@69
|
909 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
|
jpayne@69
|
910 fprintf (DeltaFile, "%ld\n", *Dp);
|
jpayne@69
|
911 fprintf (DeltaFile, "0\n");
|
jpayne@69
|
912
|
jpayne@69
|
913 Ap->delta.clear( );
|
jpayne@69
|
914 }
|
jpayne@69
|
915
|
jpayne@69
|
916 Alignments.clear( );
|
jpayne@69
|
917
|
jpayne@69
|
918 return;
|
jpayne@69
|
919 }
|
jpayne@69
|
920
|
jpayne@69
|
921
|
jpayne@69
|
922
|
jpayne@69
|
923
|
jpayne@69
|
924 void flushSyntenys
|
jpayne@69
|
925 (vector<Synteny> & Syntenys, FILE * ClusterFile)
|
jpayne@69
|
926
|
jpayne@69
|
927 // Simply output the synteny/cluster information generated by the mgaps
|
jpayne@69
|
928 // program. However, now the coordinates reference their appropriate
|
jpayne@69
|
929 // reference sequence, and the reference sequecne header is added to
|
jpayne@69
|
930 // the appropriate lines. Free the memory used by Syntenys once the
|
jpayne@69
|
931 // data is successfully output to the file.
|
jpayne@69
|
932
|
jpayne@69
|
933 {
|
jpayne@69
|
934 vector<Synteny>::iterator Sp; // synteny pointer
|
jpayne@69
|
935 vector<Cluster>::iterator Cp; // cluster pointer
|
jpayne@69
|
936 vector<Match>::iterator Mp; // match pointer
|
jpayne@69
|
937
|
jpayne@69
|
938 if ( ClusterFile )
|
jpayne@69
|
939 {
|
jpayne@69
|
940 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
|
jpayne@69
|
941 {
|
jpayne@69
|
942 fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id,
|
jpayne@69
|
943 Sp->AfP->len, Sp->Bf.len);
|
jpayne@69
|
944 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
|
jpayne@69
|
945 {
|
jpayne@69
|
946 fprintf (ClusterFile, "%2d %2d\n", FORWARD_CHAR, Cp->dirB);
|
jpayne@69
|
947 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
|
jpayne@69
|
948 {
|
jpayne@69
|
949 if ( Cp->dirB == FORWARD_CHAR )
|
jpayne@69
|
950 fprintf (ClusterFile, "%8ld %8ld %6ld",
|
jpayne@69
|
951 Mp->sA, Mp->sB, Mp->len);
|
jpayne@69
|
952 else
|
jpayne@69
|
953 fprintf (ClusterFile, "%8ld %8ld %6ld",
|
jpayne@69
|
954 Mp->sA, revC (Mp->sB, Sp->Bf.len), Mp->len);
|
jpayne@69
|
955 if ( Mp != Cp->matches.begin( ) )
|
jpayne@69
|
956 fprintf (ClusterFile, "%6ld %6ld\n",
|
jpayne@69
|
957 Mp->sA - (Mp - 1)->sA - (Mp - 1)->len,
|
jpayne@69
|
958 Mp->sB - (Mp - 1)->sB - (Mp - 1)->len);
|
jpayne@69
|
959 else
|
jpayne@69
|
960 fprintf (ClusterFile, "%6s %6s\n", "-", "-");
|
jpayne@69
|
961 }
|
jpayne@69
|
962 Cp->matches.clear( );
|
jpayne@69
|
963 }
|
jpayne@69
|
964 Sp->clusters.clear( );
|
jpayne@69
|
965 }
|
jpayne@69
|
966 }
|
jpayne@69
|
967 else
|
jpayne@69
|
968 {
|
jpayne@69
|
969 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
|
jpayne@69
|
970 {
|
jpayne@69
|
971 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
|
jpayne@69
|
972 {
|
jpayne@69
|
973 Cp->matches.clear( );
|
jpayne@69
|
974 }
|
jpayne@69
|
975 Sp->clusters.clear( );
|
jpayne@69
|
976 }
|
jpayne@69
|
977 }
|
jpayne@69
|
978
|
jpayne@69
|
979 Syntenys.clear( );
|
jpayne@69
|
980
|
jpayne@69
|
981 return;
|
jpayne@69
|
982 }
|
jpayne@69
|
983
|
jpayne@69
|
984
|
jpayne@69
|
985
|
jpayne@69
|
986
|
jpayne@69
|
987 vector<Cluster>::iterator getForwardTargetCluster
|
jpayne@69
|
988 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
|
jpayne@69
|
989 long int & targetA, long int & targetB)
|
jpayne@69
|
990
|
jpayne@69
|
991 // Return the cluster that is most likely to successfully join (in a
|
jpayne@69
|
992 // forward direction) with the current cluster. The returned cluster
|
jpayne@69
|
993 // must contain 1 or more matches that are strictly greater than the end
|
jpayne@69
|
994 // of the current cluster. The targeted cluster must also be on a
|
jpayne@69
|
995 // diagonal close enough to the current cluster, so that a connection
|
jpayne@69
|
996 // could possibly be made by the alignment extender. Assumes clusters
|
jpayne@69
|
997 // have been sorted via AscendingClusterSort. Returns targeted cluster
|
jpayne@69
|
998 // and stores the target coordinates in targetA and targetB. If no
|
jpayne@69
|
999 // suitable cluster was found, the function will return NULL and target
|
jpayne@69
|
1000 // A and targetB will remain unchanged.
|
jpayne@69
|
1001
|
jpayne@69
|
1002 {
|
jpayne@69
|
1003 vector<Match>::iterator Mip; // match iteratrive pointer
|
jpayne@69
|
1004 vector<Cluster>::iterator Cp; // cluster pointer
|
jpayne@69
|
1005 vector<Cluster>::iterator Cip; // cluster iterative pointer
|
jpayne@69
|
1006 long int eA, eB; // possible target
|
jpayne@69
|
1007 long int greater, lesser; // gap sizes between two clusters
|
jpayne@69
|
1008 long int sA = CurrCp->matches.rbegin( )->sA +
|
jpayne@69
|
1009 CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster
|
jpayne@69
|
1010 long int sB = CurrCp->matches.rbegin( )->sB +
|
jpayne@69
|
1011 CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster
|
jpayne@69
|
1012
|
jpayne@69
|
1013 //-- End of sequences is the default target, set distance accordingly
|
jpayne@69
|
1014 long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB);
|
jpayne@69
|
1015
|
jpayne@69
|
1016 //-- For all clusters greater than the current cluster (on sequence A)
|
jpayne@69
|
1017 Cp = Clusters.end( );
|
jpayne@69
|
1018 for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ )
|
jpayne@69
|
1019 {
|
jpayne@69
|
1020 //-- If the cluster is on the same direction
|
jpayne@69
|
1021 if ( CurrCp->dirB == Cip->dirB )
|
jpayne@69
|
1022 {
|
jpayne@69
|
1023 eA = Cip->matches.begin( )->sA;
|
jpayne@69
|
1024 eB = Cip->matches.begin( )->sB;
|
jpayne@69
|
1025
|
jpayne@69
|
1026 //-- If the cluster overlaps the current cluster, strip some matches
|
jpayne@69
|
1027 if ( ( eA < sA || eB < sB ) &&
|
jpayne@69
|
1028 Cip->matches.rbegin( )->sA >= sA &&
|
jpayne@69
|
1029 Cip->matches.rbegin( )->sB >= sB )
|
jpayne@69
|
1030 {
|
jpayne@69
|
1031 for ( Mip = Cip->matches.begin( );
|
jpayne@69
|
1032 Mip < Cip->matches.end( ) && ( eA < sA || eB < sB );
|
jpayne@69
|
1033 Mip ++ )
|
jpayne@69
|
1034 {
|
jpayne@69
|
1035 eA = Mip->sA;
|
jpayne@69
|
1036 eB = Mip->sB;
|
jpayne@69
|
1037 }
|
jpayne@69
|
1038 }
|
jpayne@69
|
1039
|
jpayne@69
|
1040 //-- If the cluster is strictly greater than current cluster
|
jpayne@69
|
1041 if ( eA >= sA && eB >= sB )
|
jpayne@69
|
1042 {
|
jpayne@69
|
1043 if ( eA - sA > eB - sB )
|
jpayne@69
|
1044 {
|
jpayne@69
|
1045 greater = eA - sA;
|
jpayne@69
|
1046 lesser = eB - sB;
|
jpayne@69
|
1047 }
|
jpayne@69
|
1048 else
|
jpayne@69
|
1049 {
|
jpayne@69
|
1050 lesser = eA - sA;
|
jpayne@69
|
1051 greater = eB - sB;
|
jpayne@69
|
1052 }
|
jpayne@69
|
1053
|
jpayne@69
|
1054 //-- If the cluster is close enough
|
jpayne@69
|
1055 if ( greater < getBreakLen( ) ||
|
jpayne@69
|
1056 (lesser) * GOOD_SCORE [getMatrixType( )] +
|
jpayne@69
|
1057 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 )
|
jpayne@69
|
1058 {
|
jpayne@69
|
1059 Cp = Cip;
|
jpayne@69
|
1060 targetA = eA;
|
jpayne@69
|
1061 targetB = eB;
|
jpayne@69
|
1062 break;
|
jpayne@69
|
1063 }
|
jpayne@69
|
1064 else if ( (greater << 1) - lesser < dist )
|
jpayne@69
|
1065 {
|
jpayne@69
|
1066 Cp = Cip;
|
jpayne@69
|
1067 targetA = eA;
|
jpayne@69
|
1068 targetB = eB;
|
jpayne@69
|
1069 dist = (greater << 1) - lesser;
|
jpayne@69
|
1070 }
|
jpayne@69
|
1071 }
|
jpayne@69
|
1072 }
|
jpayne@69
|
1073 }
|
jpayne@69
|
1074
|
jpayne@69
|
1075
|
jpayne@69
|
1076 return Cp;
|
jpayne@69
|
1077 }
|
jpayne@69
|
1078
|
jpayne@69
|
1079
|
jpayne@69
|
1080
|
jpayne@69
|
1081
|
jpayne@69
|
1082 vector<Alignment>::iterator getReverseTargetAlignment
|
jpayne@69
|
1083 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp)
|
jpayne@69
|
1084
|
jpayne@69
|
1085 // Return the alignment that is most likely to successfully join (in a
|
jpayne@69
|
1086 // reverse direction) with the current alignment. The returned alignment
|
jpayne@69
|
1087 // must be strictly less than the current cluster and be on a diagonal
|
jpayne@69
|
1088 // close enough to the current alignment, so that a connection
|
jpayne@69
|
1089 // could possibly be made by the alignment extender. Assumes clusters
|
jpayne@69
|
1090 // have been sorted via AscendingClusterSort and processed in order, so
|
jpayne@69
|
1091 // therefore all alignments are in order by their start A coordinate.
|
jpayne@69
|
1092
|
jpayne@69
|
1093 {
|
jpayne@69
|
1094 vector<Alignment>::iterator Ap; // alignment pointer
|
jpayne@69
|
1095 vector<Alignment>::iterator Aip; // alignment iterative pointer
|
jpayne@69
|
1096 long int eA, eB; // possible targets
|
jpayne@69
|
1097 long int greater, lesser; // gap sizes between the two alignments
|
jpayne@69
|
1098 long int sA = CurrAp->sA; // the startA of the current alignment
|
jpayne@69
|
1099 long int sB = CurrAp->sB; // the startB of the current alignment
|
jpayne@69
|
1100
|
jpayne@69
|
1101 //-- Beginning of sequences is the default target, set distance accordingly
|
jpayne@69
|
1102 long int dist = (sA < sB ? sA : sB);
|
jpayne@69
|
1103
|
jpayne@69
|
1104 //-- For all alignments less than the current alignment (on sequence A)
|
jpayne@69
|
1105 Ap = Alignments.end( );
|
jpayne@69
|
1106 for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- )
|
jpayne@69
|
1107 {
|
jpayne@69
|
1108 //-- If the alignment is on the same direction
|
jpayne@69
|
1109 if ( CurrAp->dirB == Aip->dirB )
|
jpayne@69
|
1110 {
|
jpayne@69
|
1111 eA = Aip->eA;
|
jpayne@69
|
1112 eB = Aip->eB;
|
jpayne@69
|
1113
|
jpayne@69
|
1114 //-- If the alignment is strictly less than current cluster
|
jpayne@69
|
1115 if ( eA <= sA && eB <= sB )
|
jpayne@69
|
1116 {
|
jpayne@69
|
1117 if ( sA - eA > sB - eB )
|
jpayne@69
|
1118 {
|
jpayne@69
|
1119 greater = sA - eA;
|
jpayne@69
|
1120 lesser = sB - eB;
|
jpayne@69
|
1121 }
|
jpayne@69
|
1122 else
|
jpayne@69
|
1123 {
|
jpayne@69
|
1124 lesser = sA - eA;
|
jpayne@69
|
1125 greater = sB - eB;
|
jpayne@69
|
1126 }
|
jpayne@69
|
1127
|
jpayne@69
|
1128 //-- If the cluster is close enough
|
jpayne@69
|
1129 if ( greater < getBreakLen( ) ||
|
jpayne@69
|
1130 (lesser) * GOOD_SCORE [getMatrixType( )] +
|
jpayne@69
|
1131 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 )
|
jpayne@69
|
1132 {
|
jpayne@69
|
1133 Ap = Aip;
|
jpayne@69
|
1134 break;
|
jpayne@69
|
1135 }
|
jpayne@69
|
1136 else if ( (greater << 1) - lesser < dist )
|
jpayne@69
|
1137 {
|
jpayne@69
|
1138 Ap = Aip;
|
jpayne@69
|
1139 dist = (greater << 1) - lesser;
|
jpayne@69
|
1140 }
|
jpayne@69
|
1141 }
|
jpayne@69
|
1142 }
|
jpayne@69
|
1143 }
|
jpayne@69
|
1144
|
jpayne@69
|
1145
|
jpayne@69
|
1146 return Ap;
|
jpayne@69
|
1147 }
|
jpayne@69
|
1148
|
jpayne@69
|
1149
|
jpayne@69
|
1150
|
jpayne@69
|
1151
|
jpayne@69
|
1152 bool isShadowedCluster
|
jpayne@69
|
1153 (vector<Cluster>::iterator CurrCp,
|
jpayne@69
|
1154 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap)
|
jpayne@69
|
1155
|
jpayne@69
|
1156 // Check if the current cluster is shadowed by a previously produced
|
jpayne@69
|
1157 // alignment region. Return true if it is, else false.
|
jpayne@69
|
1158
|
jpayne@69
|
1159 {
|
jpayne@69
|
1160 vector<Alignment>::iterator Aip; // alignment pointer
|
jpayne@69
|
1161
|
jpayne@69
|
1162 long int sA = CurrCp->matches.begin( )->sA;
|
jpayne@69
|
1163 long int eA = CurrCp->matches.rbegin( )->sA +
|
jpayne@69
|
1164 CurrCp->matches.rbegin( )->len - 1;
|
jpayne@69
|
1165 long int sB = CurrCp->matches.begin( )->sB;
|
jpayne@69
|
1166 long int eB = CurrCp->matches.rbegin( )->sB +
|
jpayne@69
|
1167 CurrCp->matches.rbegin( )->len - 1;
|
jpayne@69
|
1168
|
jpayne@69
|
1169 if ( ! Alignments.empty( ) ) // if there are alignments to use
|
jpayne@69
|
1170 {
|
jpayne@69
|
1171 //-- Look backwards in hope of finding a shadowing alignment
|
jpayne@69
|
1172 for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- )
|
jpayne@69
|
1173 {
|
jpayne@69
|
1174 //-- If in the same direction and shadowing the current cluster, break
|
jpayne@69
|
1175 if ( Aip->dirB == CurrCp->dirB )
|
jpayne@69
|
1176 if ( Aip->eA >= eA && Aip->eB >= eB )
|
jpayne@69
|
1177 if ( Aip->sA <= sA && Aip->sB <= sB )
|
jpayne@69
|
1178 break;
|
jpayne@69
|
1179 }
|
jpayne@69
|
1180
|
jpayne@69
|
1181 //-- Return true if the loop was broken, i.e. shadow found
|
jpayne@69
|
1182 if ( Aip >= Alignments.begin( ) )
|
jpayne@69
|
1183 return true;
|
jpayne@69
|
1184 }
|
jpayne@69
|
1185
|
jpayne@69
|
1186 //-- Return false if Alignments was empty or loop was not broken
|
jpayne@69
|
1187 return false;
|
jpayne@69
|
1188 }
|
jpayne@69
|
1189
|
jpayne@69
|
1190
|
jpayne@69
|
1191
|
jpayne@69
|
1192
|
jpayne@69
|
1193 void parseAbort
|
jpayne@69
|
1194 (const char * s)
|
jpayne@69
|
1195
|
jpayne@69
|
1196 // Abort the program if there was an error in parsing file 's'
|
jpayne@69
|
1197
|
jpayne@69
|
1198 {
|
jpayne@69
|
1199 fprintf (stderr,
|
jpayne@69
|
1200 "ERROR: Could not parse input from '%s'. \n"
|
jpayne@69
|
1201 "Please check the filename and format, or file a bug report\n", s);
|
jpayne@69
|
1202 exit (EXIT_FAILURE);
|
jpayne@69
|
1203 }
|
jpayne@69
|
1204
|
jpayne@69
|
1205
|
jpayne@69
|
1206
|
jpayne@69
|
1207
|
jpayne@69
|
1208 void parseDelta
|
jpayne@69
|
1209 (vector<Alignment> & Alignments,
|
jpayne@69
|
1210 const FastaRecord * Af, const FastaRecord *Bf)
|
jpayne@69
|
1211
|
jpayne@69
|
1212 // Use the delta information to generate the error counts for each
|
jpayne@69
|
1213 // alignment, and fill this information into the data type
|
jpayne@69
|
1214
|
jpayne@69
|
1215 {
|
jpayne@69
|
1216 char * A, * B, * Brev = NULL;
|
jpayne@69
|
1217 char ch1, ch2;
|
jpayne@69
|
1218 long int Delta;
|
jpayne@69
|
1219 int Sign;
|
jpayne@69
|
1220 long int i, Apos, Bpos;
|
jpayne@69
|
1221 long int Remain, Total;
|
jpayne@69
|
1222 long int Errors, SimErrors;
|
jpayne@69
|
1223 long int NonAlphas;
|
jpayne@69
|
1224 vector<Alignment>::iterator Ap;
|
jpayne@69
|
1225 vector<long int>::iterator Dp;
|
jpayne@69
|
1226
|
jpayne@69
|
1227 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
|
jpayne@69
|
1228 {
|
jpayne@69
|
1229 A = Af->seq;
|
jpayne@69
|
1230 B = Bf->seq;
|
jpayne@69
|
1231
|
jpayne@69
|
1232 if ( Ap->dirB == REVERSE_CHAR )
|
jpayne@69
|
1233 {
|
jpayne@69
|
1234 if ( Brev == NULL )
|
jpayne@69
|
1235 {
|
jpayne@69
|
1236 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
|
jpayne@69
|
1237 strcpy ( Brev + 1, Bf->seq + 1 );
|
jpayne@69
|
1238 Brev[0] = '\0';
|
jpayne@69
|
1239 Reverse_Complement (Brev, 1, Bf->len);
|
jpayne@69
|
1240 B = Brev;
|
jpayne@69
|
1241 }
|
jpayne@69
|
1242 B = Brev;
|
jpayne@69
|
1243 }
|
jpayne@69
|
1244
|
jpayne@69
|
1245 Apos = Ap->sA;
|
jpayne@69
|
1246 Bpos = Ap->sB;
|
jpayne@69
|
1247
|
jpayne@69
|
1248 Errors = 0;
|
jpayne@69
|
1249 SimErrors = 0;
|
jpayne@69
|
1250 NonAlphas = 0;
|
jpayne@69
|
1251 Remain = Ap->eA - Ap->sA + 1;
|
jpayne@69
|
1252 Total = Remain;
|
jpayne@69
|
1253
|
jpayne@69
|
1254 //-- For all delta's in this alignment
|
jpayne@69
|
1255 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
|
jpayne@69
|
1256 {
|
jpayne@69
|
1257 Delta = *Dp;
|
jpayne@69
|
1258 Sign = Delta > 0 ? 1 : -1;
|
jpayne@69
|
1259 Delta = labs ( Delta );
|
jpayne@69
|
1260
|
jpayne@69
|
1261 //-- For all the bases before the next indel
|
jpayne@69
|
1262 for ( i = 1; i < Delta; i ++ )
|
jpayne@69
|
1263 {
|
jpayne@69
|
1264 ch1 = A [Apos ++];
|
jpayne@69
|
1265 ch2 = B [Bpos ++];
|
jpayne@69
|
1266
|
jpayne@69
|
1267 if ( !isalpha (ch1) )
|
jpayne@69
|
1268 {
|
jpayne@69
|
1269 ch1 = STOP_CHAR;
|
jpayne@69
|
1270 NonAlphas ++;
|
jpayne@69
|
1271 }
|
jpayne@69
|
1272 if ( !isalpha (ch2) )
|
jpayne@69
|
1273 {
|
jpayne@69
|
1274 ch2 = STOP_CHAR;
|
jpayne@69
|
1275 NonAlphas ++;
|
jpayne@69
|
1276 }
|
jpayne@69
|
1277
|
jpayne@69
|
1278 ch1 = toupper(ch1);
|
jpayne@69
|
1279 ch2 = toupper(ch2);
|
jpayne@69
|
1280 if ( 1 > MATCH_SCORE
|
jpayne@69
|
1281 [getMatrixType( )]
|
jpayne@69
|
1282 [ch1 - 'A']
|
jpayne@69
|
1283 [ch2 - 'A'] )
|
jpayne@69
|
1284 SimErrors ++;
|
jpayne@69
|
1285 if ( ch1 != ch2 )
|
jpayne@69
|
1286 Errors ++;
|
jpayne@69
|
1287 }
|
jpayne@69
|
1288
|
jpayne@69
|
1289 //-- Process the current indel
|
jpayne@69
|
1290 Remain -= i - 1;
|
jpayne@69
|
1291 Errors ++;
|
jpayne@69
|
1292 SimErrors ++;
|
jpayne@69
|
1293
|
jpayne@69
|
1294 if ( Sign == 1 )
|
jpayne@69
|
1295 {
|
jpayne@69
|
1296 Apos ++;
|
jpayne@69
|
1297 Remain --;
|
jpayne@69
|
1298 }
|
jpayne@69
|
1299 else
|
jpayne@69
|
1300 {
|
jpayne@69
|
1301 Bpos ++;
|
jpayne@69
|
1302 Total ++;
|
jpayne@69
|
1303 }
|
jpayne@69
|
1304 }
|
jpayne@69
|
1305
|
jpayne@69
|
1306 //-- For all the bases after the final indel
|
jpayne@69
|
1307 for ( i = 0; i < Remain; i ++ )
|
jpayne@69
|
1308 {
|
jpayne@69
|
1309 //-- Score character match and update error counters
|
jpayne@69
|
1310 ch1 = A [Apos ++];
|
jpayne@69
|
1311 ch2 = B [Bpos ++];
|
jpayne@69
|
1312
|
jpayne@69
|
1313 if ( !isalpha (ch1) )
|
jpayne@69
|
1314 {
|
jpayne@69
|
1315 ch1 = STOP_CHAR;
|
jpayne@69
|
1316 NonAlphas ++;
|
jpayne@69
|
1317 }
|
jpayne@69
|
1318 if ( !isalpha (ch2) )
|
jpayne@69
|
1319 {
|
jpayne@69
|
1320 ch2 = STOP_CHAR;
|
jpayne@69
|
1321 NonAlphas ++;
|
jpayne@69
|
1322 }
|
jpayne@69
|
1323
|
jpayne@69
|
1324 ch1 = toupper(ch1);
|
jpayne@69
|
1325 ch2 = toupper(ch2);
|
jpayne@69
|
1326 if ( 1 > MATCH_SCORE
|
jpayne@69
|
1327 [getMatrixType( )]
|
jpayne@69
|
1328 [ch1 - 'A']
|
jpayne@69
|
1329 [ch2 - 'A'] )
|
jpayne@69
|
1330 SimErrors ++;
|
jpayne@69
|
1331 if ( ch1 != ch2 )
|
jpayne@69
|
1332 Errors ++;
|
jpayne@69
|
1333 }
|
jpayne@69
|
1334
|
jpayne@69
|
1335 Ap->Errors = Errors;
|
jpayne@69
|
1336 Ap->SimErrors = SimErrors;
|
jpayne@69
|
1337 Ap->NonAlphas = NonAlphas;
|
jpayne@69
|
1338 }
|
jpayne@69
|
1339
|
jpayne@69
|
1340 if ( Brev != NULL )
|
jpayne@69
|
1341 free ( Brev );
|
jpayne@69
|
1342
|
jpayne@69
|
1343 return;
|
jpayne@69
|
1344 }
|
jpayne@69
|
1345
|
jpayne@69
|
1346
|
jpayne@69
|
1347
|
jpayne@69
|
1348
|
jpayne@69
|
1349 void processSyntenys
|
jpayne@69
|
1350 (vector<Synteny> & Syntenys, FastaRecord * Af, long int As,
|
jpayne@69
|
1351 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile)
|
jpayne@69
|
1352
|
jpayne@69
|
1353 // For each syntenic region with clusters, read in the B sequence and
|
jpayne@69
|
1354 // extend the clusters to expand total alignment coverage. Only should
|
jpayne@69
|
1355 // be called once all the clusters for the contained syntenic regions have
|
jpayne@69
|
1356 // been stored in the data structure. Frees the memory used by the
|
jpayne@69
|
1357 // the syntenic regions once the output of extendClusters and
|
jpayne@69
|
1358 // flushSyntenys has been produced.
|
jpayne@69
|
1359
|
jpayne@69
|
1360 {
|
jpayne@69
|
1361 FastaRecord Bf; // the current B sequence
|
jpayne@69
|
1362 Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1));
|
jpayne@69
|
1363
|
jpayne@69
|
1364 long int InitSize = INIT_SIZE; // the initial memory capacity of B
|
jpayne@69
|
1365 vector<Synteny>::iterator CurrSp; // current synteny pointer
|
jpayne@69
|
1366
|
jpayne@69
|
1367 //-- Initialize the B sequence storage
|
jpayne@69
|
1368 Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
|
jpayne@69
|
1369
|
jpayne@69
|
1370 //-- For all the contained syntenys
|
jpayne@69
|
1371 for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ )
|
jpayne@69
|
1372 {
|
jpayne@69
|
1373 //-- If no clusters, ignore
|
jpayne@69
|
1374 if ( CurrSp->clusters.empty( ) )
|
jpayne@69
|
1375 continue;
|
jpayne@69
|
1376
|
jpayne@69
|
1377 //-- If a B sequence not seen yet, read it in
|
jpayne@69
|
1378 //-- IMPORTANT: The B sequences in the synteny object are assumed to be
|
jpayne@69
|
1379 // ordered as output by mgaps, if they are not in order the program
|
jpayne@69
|
1380 // will fail. (All like tags must be adjacent and in the same order
|
jpayne@69
|
1381 // as the query file)
|
jpayne@69
|
1382 if ( CurrSp == Syntenys.begin( ) ||
|
jpayne@69
|
1383 strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 )
|
jpayne@69
|
1384 {
|
jpayne@69
|
1385 //-- Read in the B sequence
|
jpayne@69
|
1386 while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) )
|
jpayne@69
|
1387 if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 )
|
jpayne@69
|
1388 break;
|
jpayne@69
|
1389 if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 )
|
jpayne@69
|
1390 parseAbort ( "Query File" );
|
jpayne@69
|
1391 Bf.len = strlen (Bf.seq + 1);
|
jpayne@69
|
1392 }
|
jpayne@69
|
1393
|
jpayne@69
|
1394 //-- Extend clusters and create the alignment information
|
jpayne@69
|
1395 CurrSp->Bf.len = Bf.len;
|
jpayne@69
|
1396 extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile);
|
jpayne@69
|
1397 }
|
jpayne@69
|
1398
|
jpayne@69
|
1399 //-- Create the cluster information
|
jpayne@69
|
1400 flushSyntenys (Syntenys, ClusterFile);
|
jpayne@69
|
1401
|
jpayne@69
|
1402 free (Bf.Id);
|
jpayne@69
|
1403 free (Bf.seq);
|
jpayne@69
|
1404
|
jpayne@69
|
1405 return;
|
jpayne@69
|
1406 }
|
jpayne@69
|
1407
|
jpayne@69
|
1408
|
jpayne@69
|
1409
|
jpayne@69
|
1410
|
jpayne@69
|
1411 inline long int revC
|
jpayne@69
|
1412 (long int Coord, long int Len)
|
jpayne@69
|
1413
|
jpayne@69
|
1414 // Reverse complement the given coordinate for the given length.
|
jpayne@69
|
1415
|
jpayne@69
|
1416 {
|
jpayne@69
|
1417 assert (Len - Coord + 1 > 0);
|
jpayne@69
|
1418 return (Len - Coord + 1);
|
jpayne@69
|
1419 }
|
jpayne@69
|
1420
|
jpayne@69
|
1421
|
jpayne@69
|
1422
|
jpayne@69
|
1423
|
jpayne@69
|
1424 void printHelp
|
jpayne@69
|
1425 (const char * s)
|
jpayne@69
|
1426
|
jpayne@69
|
1427 // Display the program's help information to stderr.
|
jpayne@69
|
1428
|
jpayne@69
|
1429 {
|
jpayne@69
|
1430 fprintf (stderr,
|
jpayne@69
|
1431 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
|
jpayne@69
|
1432 fprintf (stderr,
|
jpayne@69
|
1433 "-b int set the alignment break (give-up) length to int\n"
|
jpayne@69
|
1434 "-B int set the diagonal banding for extension to int\n"
|
jpayne@69
|
1435 "-d output only match clusters rather than extended alignments\n"
|
jpayne@69
|
1436 "-e do not extend alignments outward from clusters\n"
|
jpayne@69
|
1437 "-h display help information\n"
|
jpayne@69
|
1438 "-s don't remove shadowed alignments, useful for aligning a\n"
|
jpayne@69
|
1439 " sequence to itself to identify repeats\n"
|
jpayne@69
|
1440 "-t force alignment to ends of sequence if within -b distance\n\n");
|
jpayne@69
|
1441 fprintf (stderr,
|
jpayne@69
|
1442 " Input is the output of the \"mgaps\" program from stdin, and\n"
|
jpayne@69
|
1443 "the two original NUCmer sequence files passed on the command\n"
|
jpayne@69
|
1444 "line. <pfx> is the prefix to be added to the front of the\n"
|
jpayne@69
|
1445 "output file <pfx>.delta\n"
|
jpayne@69
|
1446 " <pfx>.delta is the alignment object that catalogs the distance\n"
|
jpayne@69
|
1447 "between insertions and deletions. For further information\n"
|
jpayne@69
|
1448 "regarding this file, please refer to the documentation under\n"
|
jpayne@69
|
1449 "the .delta output description.\n\n");
|
jpayne@69
|
1450 return;
|
jpayne@69
|
1451 }
|
jpayne@69
|
1452
|
jpayne@69
|
1453
|
jpayne@69
|
1454
|
jpayne@69
|
1455
|
jpayne@69
|
1456 void printUsage
|
jpayne@69
|
1457 (const char * s)
|
jpayne@69
|
1458
|
jpayne@69
|
1459 // Display the program's usage information to stderr.
|
jpayne@69
|
1460
|
jpayne@69
|
1461 {
|
jpayne@69
|
1462 fprintf (stderr,
|
jpayne@69
|
1463 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
|
jpayne@69
|
1464 fprintf (stderr, "Try '%s -h' for more information.\n", s);
|
jpayne@69
|
1465 return;
|
jpayne@69
|
1466 }
|
jpayne@69
|
1467
|
jpayne@69
|
1468
|
jpayne@69
|
1469
|
jpayne@69
|
1470
|
jpayne@69
|
1471 void validateData
|
jpayne@69
|
1472 (vector<Alignment> Alignments, vector<Cluster> Clusters,
|
jpayne@69
|
1473 const FastaRecord * Af, const FastaRecord * Bf)
|
jpayne@69
|
1474
|
jpayne@69
|
1475 // Self test function to check that the cluster and alignment information
|
jpayne@69
|
1476 // is valid
|
jpayne@69
|
1477
|
jpayne@69
|
1478 {
|
jpayne@69
|
1479 char * A, * B, * Brev = NULL;
|
jpayne@69
|
1480 vector<Cluster>::iterator Cp;
|
jpayne@69
|
1481 vector<Match>::iterator Mp;
|
jpayne@69
|
1482 vector<Alignment>::iterator Ap;
|
jpayne@69
|
1483 long int x, y, i;
|
jpayne@69
|
1484 char Xc, Yc;
|
jpayne@69
|
1485
|
jpayne@69
|
1486 A = Af->seq;
|
jpayne@69
|
1487
|
jpayne@69
|
1488 for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ )
|
jpayne@69
|
1489 {
|
jpayne@69
|
1490 assert ( Cp->wasFused );
|
jpayne@69
|
1491
|
jpayne@69
|
1492 //-- Pick the right directional sequence for B
|
jpayne@69
|
1493 if ( Cp->dirB == FORWARD_CHAR )
|
jpayne@69
|
1494 B = Bf->seq;
|
jpayne@69
|
1495 else if ( Brev != NULL )
|
jpayne@69
|
1496 B = Brev;
|
jpayne@69
|
1497 else
|
jpayne@69
|
1498 {
|
jpayne@69
|
1499 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
|
jpayne@69
|
1500 strcpy ( Brev + 1, Bf->seq + 1 );
|
jpayne@69
|
1501 Brev[0] = '\0';
|
jpayne@69
|
1502 Reverse_Complement (Brev, 1, Bf->len);
|
jpayne@69
|
1503 B = Brev;
|
jpayne@69
|
1504 }
|
jpayne@69
|
1505
|
jpayne@69
|
1506 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
|
jpayne@69
|
1507 {
|
jpayne@69
|
1508 //-- assert for each match in cluster, it is indeed a match
|
jpayne@69
|
1509 x = Mp->sA;
|
jpayne@69
|
1510 y = Mp->sB;
|
jpayne@69
|
1511 for ( i = 0; i < Mp->len; i ++ )
|
jpayne@69
|
1512 assert ( A[x ++] == B[y ++] );
|
jpayne@69
|
1513
|
jpayne@69
|
1514 //-- assert for each match in cluster, it is contained in an alignment
|
jpayne@69
|
1515 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
|
jpayne@69
|
1516 {
|
jpayne@69
|
1517 if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB &&
|
jpayne@69
|
1518 Ap->eA >= Mp->sA + Mp->len - 1 &&
|
jpayne@69
|
1519 Ap->eB >= Mp->sB + Mp->len - 1 )
|
jpayne@69
|
1520 break;
|
jpayne@69
|
1521 }
|
jpayne@69
|
1522 assert ( Ap < Alignments.end( ) );
|
jpayne@69
|
1523 }
|
jpayne@69
|
1524 }
|
jpayne@69
|
1525
|
jpayne@69
|
1526 //-- assert alignments are optimal (quick check if first and last chars equal)
|
jpayne@69
|
1527 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
|
jpayne@69
|
1528 {
|
jpayne@69
|
1529 if ( Ap->dirB == REVERSE_CHAR )
|
jpayne@69
|
1530 {
|
jpayne@69
|
1531 assert (Brev != NULL);
|
jpayne@69
|
1532 B = Brev;
|
jpayne@69
|
1533 }
|
jpayne@69
|
1534 else
|
jpayne@69
|
1535 B = Bf->seq;
|
jpayne@69
|
1536
|
jpayne@69
|
1537 assert ( Ap->sA <= Ap->eA );
|
jpayne@69
|
1538 assert ( Ap->sB <= Ap->eB );
|
jpayne@69
|
1539
|
jpayne@69
|
1540 assert ( Ap->sA >= 1 && Ap->sA <= Af->len );
|
jpayne@69
|
1541 assert ( Ap->eA >= 1 && Ap->eA <= Af->len );
|
jpayne@69
|
1542 assert ( Ap->sB >= 1 && Ap->sB <= Bf->len );
|
jpayne@69
|
1543 assert ( Ap->eB >= 1 && Ap->eB <= Bf->len );
|
jpayne@69
|
1544
|
jpayne@69
|
1545 Xc = toupper(isalpha(A[Ap->sA]) ? A[Ap->sA] : STOP_CHAR);
|
jpayne@69
|
1546 Yc = toupper(isalpha(B[Ap->sB]) ? B[Ap->sB] : STOP_CHAR);
|
jpayne@69
|
1547 assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] );
|
jpayne@69
|
1548
|
jpayne@69
|
1549 Xc = toupper(isalpha(A[Ap->eA]) ? A[Ap->eA] : STOP_CHAR);
|
jpayne@69
|
1550 Yc = toupper(isalpha(B[Ap->eB]) ? B[Ap->eB] : STOP_CHAR);
|
jpayne@69
|
1551 assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] );
|
jpayne@69
|
1552 }
|
jpayne@69
|
1553
|
jpayne@69
|
1554 if ( Brev != NULL )
|
jpayne@69
|
1555 free (Brev);
|
jpayne@69
|
1556
|
jpayne@69
|
1557 return;
|
jpayne@69
|
1558 }
|