jpayne@69
|
1 //------------------------------------------------------------------------------
|
jpayne@69
|
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
|
jpayne@69
|
3 // File: postpro.cc
|
jpayne@69
|
4 // Date: 08 / 20 / 2002
|
jpayne@69
|
5 //
|
jpayne@69
|
6 // Purpose: To translate the coordinates referencing the concatenated
|
jpayne@69
|
7 // reference sequences back to the individual sequences and deal
|
jpayne@69
|
8 // with any conflict that may have arisen (i.e. a MUM that spans
|
jpayne@69
|
9 // the boundry between two sequences). Then to extend each cluster
|
jpayne@69
|
10 // via Smith-Waterman techniques to expand the alignment coverage.
|
jpayne@69
|
11 // Alignments which encounter each other will be fused to form one
|
jpayne@69
|
12 // encompasing alignment when appropriate.
|
jpayne@69
|
13 //
|
jpayne@69
|
14 // Input: Input is the output of the .mgaps program from stdin. On the
|
jpayne@69
|
15 // command line, the file names of the two original sequence files
|
jpayne@69
|
16 // should come first, followed by the prefix <pfx> that should be
|
jpayne@69
|
17 // placed in front of the two output filenames <pfx>.cluster and
|
jpayne@69
|
18 // <pfx>.delta
|
jpayne@69
|
19 //
|
jpayne@69
|
20 // NOTE: Cluster file is now suppressed by default (see -d option).
|
jpayne@69
|
21 //
|
jpayne@69
|
22 // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta.
|
jpayne@69
|
23 // <pfx>.cluster lists MUM clusters as identified by "mgaps".
|
jpayne@69
|
24 // However, the clusters now reference their corresponding
|
jpayne@69
|
25 // sequence and are all listed under headers that specify this
|
jpayne@69
|
26 // sequence. In addition, the coordinates now reference the original
|
jpayne@69
|
27 // DNA input and not the amino acid translations. The <pfx>.delta file
|
jpayne@69
|
28 // is the alignment object that contains all the information necessary
|
jpayne@69
|
29 // to reproduce the alignments generated by the MUM extension process.
|
jpayne@69
|
30 // The coordinates in this file reference the original DNA input, however
|
jpayne@69
|
31 // the delta values represent amino acids, so 1 delta = 3 nucleotides.
|
jpayne@69
|
32 // Please refer to the output file documentation for an in-depth
|
jpayne@69
|
33 // description of these file formats.
|
jpayne@69
|
34 //
|
jpayne@69
|
35 // Usage: postpro <reference> <query> <pfx> < <input>
|
jpayne@69
|
36 // Where <reference> and <query> are the original input sequences of
|
jpayne@69
|
37 // PROmer and <pfx> is the prefix that should be added to the
|
jpayne@69
|
38 // beginning of the <pfx>.cluster and <pfx>.delta output filenames.
|
jpayne@69
|
39 //
|
jpayne@69
|
40 //------------------------------------------------------------------------------
|
jpayne@69
|
41
|
jpayne@69
|
42 //-- NOTE: this option will significantly hamper program performance,
|
jpayne@69
|
43 // mostly the alignment extension performance (sw_align.h)
|
jpayne@69
|
44 //#define _DEBUG_ASSERT // self testing assert functions
|
jpayne@69
|
45
|
jpayne@69
|
46 #include "tigrinc.hh"
|
jpayne@69
|
47 #include "sw_align.hh"
|
jpayne@69
|
48 #include "translate.hh"
|
jpayne@69
|
49 #include <vector>
|
jpayne@69
|
50 #include <algorithm>
|
jpayne@69
|
51 using namespace std;
|
jpayne@69
|
52
|
jpayne@69
|
53
|
jpayne@69
|
54
|
jpayne@69
|
55
|
jpayne@69
|
56 //------------------------------------------------------ Globals -------------//
|
jpayne@69
|
57 bool DO_DELTA = true;
|
jpayne@69
|
58 bool DO_EXTEND = true;
|
jpayne@69
|
59 bool TO_SEQEND = false;
|
jpayne@69
|
60
|
jpayne@69
|
61
|
jpayne@69
|
62 //------------------------------------------------------ Type Definitions ----//
|
jpayne@69
|
63 enum LineType
|
jpayne@69
|
64 //-- The type of input line from <stdin>
|
jpayne@69
|
65 {
|
jpayne@69
|
66 NO_LINE, HEADER_LINE, MATCH_LINE
|
jpayne@69
|
67 };
|
jpayne@69
|
68
|
jpayne@69
|
69
|
jpayne@69
|
70 struct FastaRecord
|
jpayne@69
|
71 //-- The essential data of a sequence
|
jpayne@69
|
72 {
|
jpayne@69
|
73 char * Id; // the fasta ID header tag
|
jpayne@69
|
74 long int len; // the length of the sequence
|
jpayne@69
|
75 char * seq; // the sequence data
|
jpayne@69
|
76 };
|
jpayne@69
|
77
|
jpayne@69
|
78
|
jpayne@69
|
79 struct Match
|
jpayne@69
|
80 //-- An exact match between two sequences A and B
|
jpayne@69
|
81 {
|
jpayne@69
|
82 long int sA, sB, len; // start coordinate in A, in B and the length
|
jpayne@69
|
83 };
|
jpayne@69
|
84
|
jpayne@69
|
85
|
jpayne@69
|
86 struct Cluster
|
jpayne@69
|
87 //-- An ordered list of matches between two sequences A and B
|
jpayne@69
|
88 {
|
jpayne@69
|
89 bool wasFused; // have the cluster matches been extended yet?
|
jpayne@69
|
90 int frameA; // the reference sequence frame (1-6)
|
jpayne@69
|
91 int frameB; // the query sequence frame
|
jpayne@69
|
92 vector<Match> matches; // the ordered set of matches in the cluster
|
jpayne@69
|
93 };
|
jpayne@69
|
94
|
jpayne@69
|
95
|
jpayne@69
|
96 struct Synteny
|
jpayne@69
|
97 //-- An ordered list of clusters between two sequences A and B
|
jpayne@69
|
98 {
|
jpayne@69
|
99 FastaRecord * AfP; // a pointer to the reference sequence record
|
jpayne@69
|
100 FastaRecord Bf; // the query sequence record (w/o the sequence)
|
jpayne@69
|
101 vector<Cluster> clusters; // the ordered set of clusters between A and B
|
jpayne@69
|
102 };
|
jpayne@69
|
103
|
jpayne@69
|
104
|
jpayne@69
|
105 struct Alignment
|
jpayne@69
|
106 //-- An alignment object between two sequences A and B
|
jpayne@69
|
107 {
|
jpayne@69
|
108 int frameA; // the reference sequence frame (1-6)
|
jpayne@69
|
109 int frameB; // the query sequence frame
|
jpayne@69
|
110 long int sA, sB, eA, eB; // the start in A, B and the end in A, B
|
jpayne@69
|
111 vector<long int> delta; // the delta values, with NO zero at the end
|
jpayne@69
|
112 long int deltaApos; // sum of abs(deltas) - #of negative deltas
|
jpayne@69
|
113 // trust me, it is a very helpful value
|
jpayne@69
|
114 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
|
jpayne@69
|
115 };
|
jpayne@69
|
116
|
jpayne@69
|
117
|
jpayne@69
|
118 struct AscendingClusterSort
|
jpayne@69
|
119 //-- For sorting clusters in ascending order of their sA coordinate
|
jpayne@69
|
120 {
|
jpayne@69
|
121 bool operator() (const Cluster & pA, const Cluster & pB)
|
jpayne@69
|
122 {
|
jpayne@69
|
123 return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA );
|
jpayne@69
|
124 }
|
jpayne@69
|
125 };
|
jpayne@69
|
126
|
jpayne@69
|
127
|
jpayne@69
|
128
|
jpayne@69
|
129
|
jpayne@69
|
130
|
jpayne@69
|
131 //------------------------------------------------- Function Declarations ----//
|
jpayne@69
|
132 void addNewAlignment
|
jpayne@69
|
133 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
|
jpayne@69
|
134 vector<Match>::iterator Mp);
|
jpayne@69
|
135
|
jpayne@69
|
136 bool extendBackward
|
jpayne@69
|
137 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
|
jpayne@69
|
138 vector<Alignment>::iterator TargetAp, const char * A, const char * B);
|
jpayne@69
|
139
|
jpayne@69
|
140 bool extendForward
|
jpayne@69
|
141 (vector<Alignment>::iterator Ap, const char * A, long int targetA,
|
jpayne@69
|
142 const char * B, long int targetB, unsigned int m_o);
|
jpayne@69
|
143
|
jpayne@69
|
144 void extendClusters
|
jpayne@69
|
145 (vector<Cluster> & Clusters,
|
jpayne@69
|
146 const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile);
|
jpayne@69
|
147
|
jpayne@69
|
148 void flushAlignments
|
jpayne@69
|
149 (vector<Alignment> & Alignments,
|
jpayne@69
|
150 const FastaRecord * Af, const FastaRecord * Bf,
|
jpayne@69
|
151 FILE * DeltaFile);
|
jpayne@69
|
152
|
jpayne@69
|
153 void flushSyntenys
|
jpayne@69
|
154 (vector<Synteny> & Syntenys, FILE * ClusterFile);
|
jpayne@69
|
155
|
jpayne@69
|
156 vector<Cluster>::iterator getForwardTargetCluster
|
jpayne@69
|
157 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
|
jpayne@69
|
158 long int & targetA, long int & targetB);
|
jpayne@69
|
159
|
jpayne@69
|
160 vector<Alignment>::iterator getReverseTargetAlignment
|
jpayne@69
|
161 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp);
|
jpayne@69
|
162
|
jpayne@69
|
163 bool isShadowedCluster
|
jpayne@69
|
164 (vector<Cluster>::iterator CurrCp,
|
jpayne@69
|
165 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap);
|
jpayne@69
|
166
|
jpayne@69
|
167 void parseAbort
|
jpayne@69
|
168 (const char *);
|
jpayne@69
|
169
|
jpayne@69
|
170 void parseDelta
|
jpayne@69
|
171 (vector<Alignment> & Alignments,
|
jpayne@69
|
172 const FastaRecord * Af, const FastaRecord *Bf);
|
jpayne@69
|
173
|
jpayne@69
|
174 void processSyntenys
|
jpayne@69
|
175 (vector<Synteny> & Syntenys,
|
jpayne@69
|
176 FastaRecord * Af, long int As,
|
jpayne@69
|
177 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile);
|
jpayne@69
|
178
|
jpayne@69
|
179 inline long int transC
|
jpayne@69
|
180 (long int Coord, int Frame, long int Len);
|
jpayne@69
|
181
|
jpayne@69
|
182 inline long int refLen
|
jpayne@69
|
183 (long int ntLen);
|
jpayne@69
|
184
|
jpayne@69
|
185 inline long int revC
|
jpayne@69
|
186 (long int Coord, long int Len);
|
jpayne@69
|
187
|
jpayne@69
|
188 void printHelp
|
jpayne@69
|
189 (const char *);
|
jpayne@69
|
190
|
jpayne@69
|
191 void printUsage
|
jpayne@69
|
192 (const char *);
|
jpayne@69
|
193
|
jpayne@69
|
194 void validateData
|
jpayne@69
|
195 (vector<Alignment> Alignments, vector<Cluster> Clusters,
|
jpayne@69
|
196 const FastaRecord * Af, const FastaRecord * Bf);
|
jpayne@69
|
197
|
jpayne@69
|
198
|
jpayne@69
|
199
|
jpayne@69
|
200 //-------------------------------------------------- Function Definitions ----//
|
jpayne@69
|
201 int main
|
jpayne@69
|
202 (int argc, char ** argv)
|
jpayne@69
|
203 {
|
jpayne@69
|
204 FastaRecord * Af; // array of all the reference sequences
|
jpayne@69
|
205
|
jpayne@69
|
206 vector<Synteny> Syntenys; // vector of all sets of clusters
|
jpayne@69
|
207 vector<Synteny>::reverse_iterator CurrSp; // current set of clusters
|
jpayne@69
|
208 vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer
|
jpayne@69
|
209
|
jpayne@69
|
210 Synteny Asyn; // a single set of clusters
|
jpayne@69
|
211 Cluster Aclu; // a single cluster of matches
|
jpayne@69
|
212 Match Amat; // a single match
|
jpayne@69
|
213
|
jpayne@69
|
214 LineType PrevLine; // the current input line
|
jpayne@69
|
215
|
jpayne@69
|
216 bool Found; // temporary search flag
|
jpayne@69
|
217
|
jpayne@69
|
218 char Line [MAX_LINE]; // a single line of input
|
jpayne@69
|
219 char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers
|
jpayne@69
|
220 char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names
|
jpayne@69
|
221 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
|
jpayne@69
|
222
|
jpayne@69
|
223 int FrameA, FrameB; // reading frames (1-6)
|
jpayne@69
|
224 int CurrFrameA, CurrFrameB;
|
jpayne@69
|
225
|
jpayne@69
|
226 long int i; // temporary counter
|
jpayne@69
|
227 long int Seqi; // current reference sequence index
|
jpayne@69
|
228 long int InitSize; // initial sequence memory space
|
jpayne@69
|
229 long int As = 0; // the size of the reference array
|
jpayne@69
|
230 long int Ac = 100; // the capacity of the reference array
|
jpayne@69
|
231 long int sA, sB, len; // current match start in A, B and length
|
jpayne@69
|
232
|
jpayne@69
|
233 FILE * RefFile, * QryFile; // reference and query input files
|
jpayne@69
|
234 FILE * ClusterFile, * DeltaFile; // cluster and delta output files
|
jpayne@69
|
235
|
jpayne@69
|
236 //-- Set the alignment data type and break length (sw_align.h)
|
jpayne@69
|
237 setMatrixType ( BLOSUM62 );
|
jpayne@69
|
238 setBreakLen ( 60 );
|
jpayne@69
|
239
|
jpayne@69
|
240 //-- Parse the command line arguments
|
jpayne@69
|
241 {
|
jpayne@69
|
242 optarg = NULL;
|
jpayne@69
|
243 int ch, errflg = 0;
|
jpayne@69
|
244 while ( !errflg && ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) )
|
jpayne@69
|
245 switch (ch)
|
jpayne@69
|
246 {
|
jpayne@69
|
247 case 'b' :
|
jpayne@69
|
248 setBreakLen (atoi (optarg));
|
jpayne@69
|
249 break;
|
jpayne@69
|
250
|
jpayne@69
|
251 case 'd' :
|
jpayne@69
|
252 DO_DELTA = false;
|
jpayne@69
|
253 break;
|
jpayne@69
|
254
|
jpayne@69
|
255 case 'e' :
|
jpayne@69
|
256 DO_EXTEND = false;
|
jpayne@69
|
257 break;
|
jpayne@69
|
258
|
jpayne@69
|
259 case 'h' :
|
jpayne@69
|
260 printHelp (argv[0]);
|
jpayne@69
|
261 exit (EXIT_SUCCESS);
|
jpayne@69
|
262 break;
|
jpayne@69
|
263
|
jpayne@69
|
264 case 't' :
|
jpayne@69
|
265 TO_SEQEND = true;
|
jpayne@69
|
266 break;
|
jpayne@69
|
267
|
jpayne@69
|
268 case 'x' :
|
jpayne@69
|
269 setMatrixType ( atoi (optarg) );
|
jpayne@69
|
270 if ( getMatrixType( ) == NUCLEOTIDE )
|
jpayne@69
|
271 {
|
jpayne@69
|
272 fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n",
|
jpayne@69
|
273 getMatrixType( ));
|
jpayne@69
|
274 setMatrixType ( BLOSUM62 );
|
jpayne@69
|
275 }
|
jpayne@69
|
276 break;
|
jpayne@69
|
277
|
jpayne@69
|
278 default :
|
jpayne@69
|
279 errflg ++;
|
jpayne@69
|
280 }
|
jpayne@69
|
281
|
jpayne@69
|
282 if ( errflg > 0 || argc - optind != 3 )
|
jpayne@69
|
283 {
|
jpayne@69
|
284 printUsage (argv[0]);
|
jpayne@69
|
285 exit (EXIT_FAILURE);
|
jpayne@69
|
286 }
|
jpayne@69
|
287 }
|
jpayne@69
|
288
|
jpayne@69
|
289 //-- Read and create the I/O file names
|
jpayne@69
|
290 strcpy (RefFileName, argv[optind ++]);
|
jpayne@69
|
291 strcpy (QryFileName, argv[optind ++]);
|
jpayne@69
|
292 strcpy (ClusterFileName, argv[optind ++]);
|
jpayne@69
|
293 strcpy (DeltaFileName, ClusterFileName);
|
jpayne@69
|
294 strcat (ClusterFileName, ".cluster");
|
jpayne@69
|
295 strcat (DeltaFileName, ".delta");
|
jpayne@69
|
296
|
jpayne@69
|
297 //-- Open all the files
|
jpayne@69
|
298 RefFile = File_Open (RefFileName, "r");
|
jpayne@69
|
299 QryFile = File_Open (QryFileName, "r");
|
jpayne@69
|
300 if ( DO_DELTA )
|
jpayne@69
|
301 {
|
jpayne@69
|
302 ClusterFile = NULL;
|
jpayne@69
|
303 DeltaFile = File_Open (DeltaFileName, "w");
|
jpayne@69
|
304 }
|
jpayne@69
|
305 else
|
jpayne@69
|
306 {
|
jpayne@69
|
307 ClusterFile = File_Open (ClusterFileName, "w");
|
jpayne@69
|
308 DeltaFile = NULL;
|
jpayne@69
|
309 }
|
jpayne@69
|
310
|
jpayne@69
|
311 //-- Print the headers of the output files
|
jpayne@69
|
312 if ( DO_DELTA )
|
jpayne@69
|
313 fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
|
jpayne@69
|
314 else
|
jpayne@69
|
315 fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
|
jpayne@69
|
316
|
jpayne@69
|
317 //-- Generate the array of the reference sequences
|
jpayne@69
|
318 InitSize = INIT_SIZE;
|
jpayne@69
|
319 Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac );
|
jpayne@69
|
320 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
|
jpayne@69
|
321 while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) )
|
jpayne@69
|
322 {
|
jpayne@69
|
323 Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1));
|
jpayne@69
|
324 strcpy (Af[As].Id, IdA);
|
jpayne@69
|
325
|
jpayne@69
|
326 Af[As].len = strlen (Af[As].seq + 1);
|
jpayne@69
|
327
|
jpayne@69
|
328 if ( ++ As >= Ac )
|
jpayne@69
|
329 {
|
jpayne@69
|
330 Ac *= 2;
|
jpayne@69
|
331 Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac );
|
jpayne@69
|
332 }
|
jpayne@69
|
333
|
jpayne@69
|
334 InitSize = INIT_SIZE;
|
jpayne@69
|
335 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
|
jpayne@69
|
336 }
|
jpayne@69
|
337 fclose (RefFile);
|
jpayne@69
|
338 if ( As <= 0 )
|
jpayne@69
|
339 parseAbort ( RefFileName );
|
jpayne@69
|
340
|
jpayne@69
|
341
|
jpayne@69
|
342
|
jpayne@69
|
343 //-- Process the input from <stdin> line by line
|
jpayne@69
|
344 PrevLine = NO_LINE;
|
jpayne@69
|
345 IdA[0] = '\0';
|
jpayne@69
|
346 IdB[0] = '\0';
|
jpayne@69
|
347 FrameA = 0;
|
jpayne@69
|
348 FrameB = 0;
|
jpayne@69
|
349 CurrFrameA = -1;
|
jpayne@69
|
350 CurrFrameB = -1;
|
jpayne@69
|
351 CurrIdB[0] = '\0';
|
jpayne@69
|
352 while ( fgets(Line, MAX_LINE, stdin) != NULL )
|
jpayne@69
|
353 {
|
jpayne@69
|
354
|
jpayne@69
|
355 //-- If the current line is a fasta HEADER_LINE
|
jpayne@69
|
356 if ( Line[0] == '>' )
|
jpayne@69
|
357 {
|
jpayne@69
|
358 if ( sscanf (Line + 1, "%s", CurrIdB) != 1 )
|
jpayne@69
|
359 parseAbort ("stdin");
|
jpayne@69
|
360
|
jpayne@69
|
361 sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB);
|
jpayne@69
|
362 CurrIdB [strlen(CurrIdB) - 2] = '\0';
|
jpayne@69
|
363
|
jpayne@69
|
364 PrevLine = HEADER_LINE;
|
jpayne@69
|
365 }
|
jpayne@69
|
366
|
jpayne@69
|
367
|
jpayne@69
|
368 //-- If the current line is a cluster HEADER_LINE
|
jpayne@69
|
369 else if ( Line[0] == '#' )
|
jpayne@69
|
370 {
|
jpayne@69
|
371 PrevLine = HEADER_LINE;
|
jpayne@69
|
372 }
|
jpayne@69
|
373
|
jpayne@69
|
374
|
jpayne@69
|
375 //-- If the current line is a MATCH_LINE
|
jpayne@69
|
376 else
|
jpayne@69
|
377 {
|
jpayne@69
|
378 if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 )
|
jpayne@69
|
379 parseAbort ("stdin");
|
jpayne@69
|
380
|
jpayne@69
|
381 //-- Re-map the reference coordinate back to its original sequence
|
jpayne@69
|
382 // NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length
|
jpayne@69
|
383 // including the appended 'X' on each frame translation
|
jpayne@69
|
384 for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ )
|
jpayne@69
|
385 sA -= refLen (Af[Seqi].len);
|
jpayne@69
|
386
|
jpayne@69
|
387 assert ( Seqi < As );
|
jpayne@69
|
388
|
jpayne@69
|
389 //-- Get the correct frame, translate startA coordinate to frame
|
jpayne@69
|
390 for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ )
|
jpayne@69
|
391 sA -= (Af[Seqi].len - (i % 3)) / 3 + 1;
|
jpayne@69
|
392 CurrFrameA = i + 1;
|
jpayne@69
|
393 assert ( CurrFrameA > 0 && CurrFrameA < 7 );
|
jpayne@69
|
394
|
jpayne@69
|
395 //-- If the match spans across a frame boundry
|
jpayne@69
|
396 if ( CurrFrameA < 1 || CurrFrameA > 6 ||
|
jpayne@69
|
397 sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 ||
|
jpayne@69
|
398 sA <= 0 )
|
jpayne@69
|
399 {
|
jpayne@69
|
400 fprintf (stderr,
|
jpayne@69
|
401 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
|
jpayne@69
|
402 " Reference sequence '>%s', frame %d\n"
|
jpayne@69
|
403 "Please file a bug report\n"
|
jpayne@69
|
404 "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA);
|
jpayne@69
|
405 continue;
|
jpayne@69
|
406 }
|
jpayne@69
|
407
|
jpayne@69
|
408 //-- If the match spans across a sequence boundry
|
jpayne@69
|
409 if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 )
|
jpayne@69
|
410 {
|
jpayne@69
|
411 fprintf (stderr,
|
jpayne@69
|
412 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
|
jpayne@69
|
413 " Reference sequence '>%s'\n"
|
jpayne@69
|
414 "Please file a bug report\n"
|
jpayne@69
|
415 "Attempting to continue.\n", Af[Seqi].Id);
|
jpayne@69
|
416 continue;
|
jpayne@69
|
417 }
|
jpayne@69
|
418
|
jpayne@69
|
419 //-- Check and update the current synteny region
|
jpayne@69
|
420 if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 ||
|
jpayne@69
|
421 CurrFrameA != FrameA || CurrFrameB != FrameB )
|
jpayne@69
|
422 {
|
jpayne@69
|
423 Found = false;
|
jpayne@69
|
424 if ( strcmp (IdB, CurrIdB) == 0 )
|
jpayne@69
|
425 {
|
jpayne@69
|
426 //-- Has this header been seen before?
|
jpayne@69
|
427 for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ )
|
jpayne@69
|
428 {
|
jpayne@69
|
429 if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 )
|
jpayne@69
|
430 {
|
jpayne@69
|
431 assert ( Sp->AfP->len == Af[Seqi].len );
|
jpayne@69
|
432 assert ( strcmp (Sp->Bf.Id, IdB) == 0 );
|
jpayne@69
|
433 CurrSp = Sp;
|
jpayne@69
|
434 Found = true;
|
jpayne@69
|
435 break;
|
jpayne@69
|
436 }
|
jpayne@69
|
437 }
|
jpayne@69
|
438 }
|
jpayne@69
|
439 else
|
jpayne@69
|
440 {
|
jpayne@69
|
441 //-- New B sequence header, process all the old synteny's
|
jpayne@69
|
442 processSyntenys (Syntenys, Af, As,
|
jpayne@69
|
443 QryFile, ClusterFile, DeltaFile);
|
jpayne@69
|
444
|
jpayne@69
|
445 }
|
jpayne@69
|
446
|
jpayne@69
|
447 strcpy (IdA, Af[Seqi].Id);
|
jpayne@69
|
448 strcpy (IdB, CurrIdB);
|
jpayne@69
|
449 FrameA = CurrFrameA;
|
jpayne@69
|
450 FrameB = CurrFrameB;
|
jpayne@69
|
451
|
jpayne@69
|
452 //-- If not seen yet, create a new synteny region
|
jpayne@69
|
453 if ( ! Found )
|
jpayne@69
|
454 {
|
jpayne@69
|
455 Asyn.AfP = Af + Seqi;
|
jpayne@69
|
456 Asyn.Bf.len = -1;
|
jpayne@69
|
457 Asyn.Bf.Id = (char *) Safe_malloc
|
jpayne@69
|
458 (sizeof(char) * (strlen(IdB) + 1));
|
jpayne@69
|
459 strcpy (Asyn.Bf.Id, IdB);
|
jpayne@69
|
460
|
jpayne@69
|
461 Syntenys.push_back (Asyn);
|
jpayne@69
|
462 CurrSp = Syntenys.rbegin( );
|
jpayne@69
|
463 }
|
jpayne@69
|
464
|
jpayne@69
|
465 //-- Add a new cluster to the current synteny
|
jpayne@69
|
466 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
|
jpayne@69
|
467 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
|
jpayne@69
|
468 CurrSp->clusters.pop_back( ); // hack to remove empties
|
jpayne@69
|
469 Aclu.frameA = FrameA;
|
jpayne@69
|
470 Aclu.frameB = FrameB;
|
jpayne@69
|
471 Aclu.wasFused = false;
|
jpayne@69
|
472 CurrSp->clusters.push_back (Aclu);
|
jpayne@69
|
473 }
|
jpayne@69
|
474 else if ( PrevLine == HEADER_LINE )
|
jpayne@69
|
475 {
|
jpayne@69
|
476 //-- Add a new cluster to the current synteny
|
jpayne@69
|
477 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
|
jpayne@69
|
478 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
|
jpayne@69
|
479 CurrSp->clusters.pop_back( );
|
jpayne@69
|
480 Aclu.frameA = FrameA;
|
jpayne@69
|
481 Aclu.frameB = FrameB;
|
jpayne@69
|
482 Aclu.wasFused = false;
|
jpayne@69
|
483 CurrSp->clusters.push_back (Aclu);
|
jpayne@69
|
484 }
|
jpayne@69
|
485
|
jpayne@69
|
486 //-- Add a new match to the current cluster
|
jpayne@69
|
487 // NOTE: A and B coordinates still reference the appropriate
|
jpayne@69
|
488 // amino acid sequence frame, not the DNA (same with len)
|
jpayne@69
|
489 if ( len > 1 )
|
jpayne@69
|
490 {
|
jpayne@69
|
491 Amat.sA = sA;
|
jpayne@69
|
492 Amat.sB = sB;
|
jpayne@69
|
493 Amat.len = len;
|
jpayne@69
|
494 CurrSp->clusters.rbegin( )->matches.push_back (Amat);
|
jpayne@69
|
495 }
|
jpayne@69
|
496
|
jpayne@69
|
497 PrevLine = MATCH_LINE;
|
jpayne@69
|
498 }
|
jpayne@69
|
499 }
|
jpayne@69
|
500
|
jpayne@69
|
501 //-- Process the left-over syntenys
|
jpayne@69
|
502 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
|
jpayne@69
|
503 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
|
jpayne@69
|
504 CurrSp->clusters.pop_back( );
|
jpayne@69
|
505 processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile);
|
jpayne@69
|
506 fclose (QryFile);
|
jpayne@69
|
507
|
jpayne@69
|
508 //-- Free the reference sequences
|
jpayne@69
|
509 for ( i = 0; i < As; i ++ )
|
jpayne@69
|
510 {
|
jpayne@69
|
511 free (Af[i].Id);
|
jpayne@69
|
512 free (Af[i].seq);
|
jpayne@69
|
513 }
|
jpayne@69
|
514 free (Af);
|
jpayne@69
|
515
|
jpayne@69
|
516 return EXIT_SUCCESS;
|
jpayne@69
|
517 }
|
jpayne@69
|
518
|
jpayne@69
|
519
|
jpayne@69
|
520
|
jpayne@69
|
521
|
jpayne@69
|
522 void addNewAlignment
|
jpayne@69
|
523 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
|
jpayne@69
|
524 vector<Match>::iterator Mp)
|
jpayne@69
|
525
|
jpayne@69
|
526 // Create and add a new alignment object based on the current match
|
jpayne@69
|
527 // and cluster information pointed to by Cp and Mp.
|
jpayne@69
|
528
|
jpayne@69
|
529 {
|
jpayne@69
|
530 Alignment Align;
|
jpayne@69
|
531
|
jpayne@69
|
532 //-- Initialize the data
|
jpayne@69
|
533 Align.sA = Mp->sA;
|
jpayne@69
|
534 Align.sB = Mp->sB;
|
jpayne@69
|
535 Align.eA = Mp->sA + Mp->len - 1;
|
jpayne@69
|
536 Align.eB = Mp->sB + Mp->len - 1;
|
jpayne@69
|
537 Align.frameA = Cp->frameA;
|
jpayne@69
|
538 Align.frameB = Cp->frameB;
|
jpayne@69
|
539 Align.delta.clear( );
|
jpayne@69
|
540 Align.deltaApos = 0;
|
jpayne@69
|
541
|
jpayne@69
|
542 //-- Add the alignment object
|
jpayne@69
|
543 Alignments.push_back (Align);
|
jpayne@69
|
544
|
jpayne@69
|
545 return;
|
jpayne@69
|
546 }
|
jpayne@69
|
547
|
jpayne@69
|
548
|
jpayne@69
|
549
|
jpayne@69
|
550
|
jpayne@69
|
551 bool extendBackward
|
jpayne@69
|
552 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
|
jpayne@69
|
553 vector<Alignment>::iterator TargetAp, const char * A, const char * B)
|
jpayne@69
|
554
|
jpayne@69
|
555 // Extend an alignment backwards off of the current alignment object.
|
jpayne@69
|
556 // The current alignment object must be freshly created and consist
|
jpayne@69
|
557 // only of an exact match (i.e. the delta vector MUST be empty).
|
jpayne@69
|
558 // If the TargetAp alignment object is reached by the extension, it will
|
jpayne@69
|
559 // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is
|
jpayne@69
|
560 // NULL the function will extend as far as possible. It is a strange
|
jpayne@69
|
561 // and dangerous function because it can delete CurrAp, so edit with
|
jpayne@69
|
562 // caution. Returns true if TargetAp was reached and merged, else false
|
jpayne@69
|
563 // Designed only as a subroutine for extendClusters, should be used
|
jpayne@69
|
564 // nowhere else.
|
jpayne@69
|
565
|
jpayne@69
|
566 {
|
jpayne@69
|
567 bool target_reached = false;
|
jpayne@69
|
568 bool overflow_flag = false;
|
jpayne@69
|
569 bool double_flag = false;
|
jpayne@69
|
570
|
jpayne@69
|
571 vector<long int>::iterator Dp;
|
jpayne@69
|
572
|
jpayne@69
|
573 unsigned int m_o;
|
jpayne@69
|
574 long int targetA, targetB;
|
jpayne@69
|
575
|
jpayne@69
|
576 m_o = BACKWARD_SEARCH;
|
jpayne@69
|
577
|
jpayne@69
|
578 //-- Set the target coordinates
|
jpayne@69
|
579 if ( TargetAp != Alignments.end( ) )
|
jpayne@69
|
580 {
|
jpayne@69
|
581 targetA = TargetAp->eA;
|
jpayne@69
|
582 targetB = TargetAp->eB;
|
jpayne@69
|
583 }
|
jpayne@69
|
584 else
|
jpayne@69
|
585 {
|
jpayne@69
|
586 targetA = 1;
|
jpayne@69
|
587 targetB = 1;
|
jpayne@69
|
588 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
589 }
|
jpayne@69
|
590
|
jpayne@69
|
591 //-- If alignment is too long, bring with bounds and set overflow_flag true
|
jpayne@69
|
592 if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH )
|
jpayne@69
|
593 {
|
jpayne@69
|
594 targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1;
|
jpayne@69
|
595 overflow_flag = true;
|
jpayne@69
|
596 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
597 }
|
jpayne@69
|
598 if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH )
|
jpayne@69
|
599 {
|
jpayne@69
|
600 targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1;
|
jpayne@69
|
601 if ( overflow_flag )
|
jpayne@69
|
602 double_flag = true;
|
jpayne@69
|
603 else
|
jpayne@69
|
604 overflow_flag = true;
|
jpayne@69
|
605 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
606 }
|
jpayne@69
|
607
|
jpayne@69
|
608
|
jpayne@69
|
609 if ( TO_SEQEND && !double_flag )
|
jpayne@69
|
610 m_o |= SEQEND_BIT;
|
jpayne@69
|
611
|
jpayne@69
|
612
|
jpayne@69
|
613 //-- Attempt to reach the target
|
jpayne@69
|
614 target_reached = alignSearch (A, CurrAp->sA, targetA,
|
jpayne@69
|
615 B, CurrAp->sB, targetB, m_o);
|
jpayne@69
|
616
|
jpayne@69
|
617 if ( overflow_flag || TargetAp == Alignments.end( ) )
|
jpayne@69
|
618 target_reached = false;
|
jpayne@69
|
619
|
jpayne@69
|
620 if ( target_reached )
|
jpayne@69
|
621 {
|
jpayne@69
|
622 //-- assert that CurrAp is new and at the end of the Alignment vector
|
jpayne@69
|
623 assert ( CurrAp->delta.empty( ) );
|
jpayne@69
|
624 assert ( CurrAp == Alignments.end( ) - 1 );
|
jpayne@69
|
625
|
jpayne@69
|
626 //-- Merge the two alignment objects
|
jpayne@69
|
627 extendForward (TargetAp, A, CurrAp->sA,
|
jpayne@69
|
628 B, CurrAp->sB, FORCED_FORWARD_ALIGN);
|
jpayne@69
|
629 TargetAp->eA = CurrAp->eA;
|
jpayne@69
|
630 TargetAp->eB = CurrAp->eB;
|
jpayne@69
|
631 Alignments.pop_back( );
|
jpayne@69
|
632 }
|
jpayne@69
|
633 else
|
jpayne@69
|
634 {
|
jpayne@69
|
635 alignTarget (A, targetA, CurrAp->sA,
|
jpayne@69
|
636 B, targetB, CurrAp->sB,
|
jpayne@69
|
637 CurrAp->delta, FORCED_FORWARD_ALIGN);
|
jpayne@69
|
638 CurrAp->sA = targetA;
|
jpayne@69
|
639 CurrAp->sB = targetB;
|
jpayne@69
|
640
|
jpayne@69
|
641 //-- Update the deltaApos value for the alignment object
|
jpayne@69
|
642 for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ )
|
jpayne@69
|
643 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
|
jpayne@69
|
644 }
|
jpayne@69
|
645
|
jpayne@69
|
646 return target_reached;
|
jpayne@69
|
647 }
|
jpayne@69
|
648
|
jpayne@69
|
649
|
jpayne@69
|
650
|
jpayne@69
|
651
|
jpayne@69
|
652 bool extendForward
|
jpayne@69
|
653 (vector<Alignment>::iterator CurrAp, const char * A, long int targetA,
|
jpayne@69
|
654 const char * B, long int targetB, unsigned int m_o)
|
jpayne@69
|
655
|
jpayne@69
|
656 // Extend an alignment forwards off the current alignment object until
|
jpayne@69
|
657 // target or end of sequence is reached, and merge the delta values of the
|
jpayne@69
|
658 // alignment object with the new delta values generated by the extension.
|
jpayne@69
|
659 // Return true if the target was reached, else false
|
jpayne@69
|
660
|
jpayne@69
|
661 {
|
jpayne@69
|
662 long int ValA;
|
jpayne@69
|
663 long int ValB;
|
jpayne@69
|
664 unsigned int Di;
|
jpayne@69
|
665 bool target_reached;
|
jpayne@69
|
666 bool overflow_flag = false;
|
jpayne@69
|
667 bool double_flag = false;
|
jpayne@69
|
668 vector<long int>::iterator Dp;
|
jpayne@69
|
669
|
jpayne@69
|
670 //-- Set Di to the end of the delta vector
|
jpayne@69
|
671 Di = CurrAp->delta.size( );
|
jpayne@69
|
672
|
jpayne@69
|
673 //-- Calculate the distance between the start and end positions
|
jpayne@69
|
674 ValA = targetA - CurrAp->eA + 1;
|
jpayne@69
|
675 ValB = targetB - CurrAp->eB + 1;
|
jpayne@69
|
676
|
jpayne@69
|
677 //-- If the distance is too long, shrink it and set the overflow_flag
|
jpayne@69
|
678 if ( ValA > MAX_ALIGNMENT_LENGTH )
|
jpayne@69
|
679 {
|
jpayne@69
|
680 targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1;
|
jpayne@69
|
681 overflow_flag = true;
|
jpayne@69
|
682 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
683 }
|
jpayne@69
|
684 if ( ValB > MAX_ALIGNMENT_LENGTH )
|
jpayne@69
|
685 {
|
jpayne@69
|
686 targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1;
|
jpayne@69
|
687 if ( overflow_flag )
|
jpayne@69
|
688 double_flag = true;
|
jpayne@69
|
689 else
|
jpayne@69
|
690 overflow_flag = true;
|
jpayne@69
|
691 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
692 }
|
jpayne@69
|
693
|
jpayne@69
|
694 if ( double_flag )
|
jpayne@69
|
695 m_o &= ~SEQEND_BIT;
|
jpayne@69
|
696
|
jpayne@69
|
697 target_reached = alignTarget (A, CurrAp->eA, targetA,
|
jpayne@69
|
698 B, CurrAp->eB, targetB,
|
jpayne@69
|
699 CurrAp->delta, m_o);
|
jpayne@69
|
700
|
jpayne@69
|
701 //-- Notify user if alignment was chopped short
|
jpayne@69
|
702 if ( target_reached && overflow_flag )
|
jpayne@69
|
703 target_reached = false;
|
jpayne@69
|
704
|
jpayne@69
|
705 //-- Pick up delta where left off (Di) and merge with new deltas
|
jpayne@69
|
706 if ( Di < CurrAp->delta.size( ) )
|
jpayne@69
|
707 {
|
jpayne@69
|
708 //-- Merge the deltas
|
jpayne@69
|
709 ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1;
|
jpayne@69
|
710 CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA);
|
jpayne@69
|
711 if ( CurrAp->delta[Di] == 0 || ValA < 0 )
|
jpayne@69
|
712 {
|
jpayne@69
|
713 fprintf(stderr,
|
jpayne@69
|
714 "ERROR: failed to merge alignments at position %ld\n"
|
jpayne@69
|
715 " Please file a bug report\n",
|
jpayne@69
|
716 CurrAp->eA);
|
jpayne@69
|
717 exit (EXIT_FAILURE);
|
jpayne@69
|
718 }
|
jpayne@69
|
719
|
jpayne@69
|
720 //-- Update the deltaApos
|
jpayne@69
|
721 for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ )
|
jpayne@69
|
722 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
|
jpayne@69
|
723 }
|
jpayne@69
|
724
|
jpayne@69
|
725 //-- Set the alignment coordinates
|
jpayne@69
|
726 CurrAp->eA = targetA;
|
jpayne@69
|
727 CurrAp->eB = targetB;
|
jpayne@69
|
728
|
jpayne@69
|
729 return target_reached;
|
jpayne@69
|
730 }
|
jpayne@69
|
731
|
jpayne@69
|
732
|
jpayne@69
|
733
|
jpayne@69
|
734
|
jpayne@69
|
735 void extendClusters
|
jpayne@69
|
736 (vector<Cluster> & Clusters,
|
jpayne@69
|
737 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile)
|
jpayne@69
|
738
|
jpayne@69
|
739 // Connect all the matches in every cluster between sequences A and B.
|
jpayne@69
|
740 // Also, extend alignments off of the front and back of each cluster to
|
jpayne@69
|
741 // expand total alignment coverage. When these extensions encounter an
|
jpayne@69
|
742 // adjacent cluster (in a consistent frame), fuse the two regions to
|
jpayne@69
|
743 // create one single encompassing region. This routine will create
|
jpayne@69
|
744 // alignment objects from these extensions and output the resulting
|
jpayne@69
|
745 // delta information to the delta output file. Also, all coordintes
|
jpayne@69
|
746 // for the clusters and the alignment regions are translated to reference
|
jpayne@69
|
747 // the original DNA sequecne.
|
jpayne@69
|
748
|
jpayne@69
|
749 {
|
jpayne@69
|
750 //-- Sort the clusters (ascending) by their start coordinate in sequence A
|
jpayne@69
|
751 sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( ));
|
jpayne@69
|
752
|
jpayne@69
|
753 //-- If no delta file is requested
|
jpayne@69
|
754 if ( ! DO_DELTA )
|
jpayne@69
|
755 return;
|
jpayne@69
|
756
|
jpayne@69
|
757
|
jpayne@69
|
758 bool target_reached = false; // reached the adjacent match or cluster
|
jpayne@69
|
759 // the amino acid sequences for A and B
|
jpayne@69
|
760 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
|
jpayne@69
|
761 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
|
jpayne@69
|
762
|
jpayne@69
|
763 long int Alen [7], Blen [7]; // the length of the amino acid sequences
|
jpayne@69
|
764 int i;
|
jpayne@69
|
765
|
jpayne@69
|
766 unsigned int m_o;
|
jpayne@69
|
767 long int targetA, targetB; // alignment extension targets in A and B
|
jpayne@69
|
768
|
jpayne@69
|
769 vector<Match>::iterator Mp; // match pointer
|
jpayne@69
|
770
|
jpayne@69
|
771 vector<Cluster>::iterator PrevCp; // where the extensions last left off
|
jpayne@69
|
772 vector<Cluster>::iterator CurrCp; // the current cluster being extended
|
jpayne@69
|
773 vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster
|
jpayne@69
|
774
|
jpayne@69
|
775 vector<Alignment> Alignments; // the vector of alignment objects
|
jpayne@69
|
776 vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align
|
jpayne@69
|
777 vector<Alignment>::iterator TargetAp; // target align
|
jpayne@69
|
778
|
jpayne@69
|
779 //-- Calculate the length of each amino acid frame translation
|
jpayne@69
|
780 for ( i = 0; i < 6; i ++ )
|
jpayne@69
|
781 {
|
jpayne@69
|
782 Alen [i + 1] = (Af->len - (i % 3)) / 3;
|
jpayne@69
|
783 Blen [i + 1] = (Bf->len - (i % 3)) / 3;
|
jpayne@69
|
784 }
|
jpayne@69
|
785
|
jpayne@69
|
786 //-- Extend each cluster
|
jpayne@69
|
787 PrevCp = Clusters.begin( );
|
jpayne@69
|
788 CurrCp = Clusters.begin( );
|
jpayne@69
|
789 while ( CurrCp < Clusters.end( ) )
|
jpayne@69
|
790 {
|
jpayne@69
|
791 if ( DO_EXTEND )
|
jpayne@69
|
792 {
|
jpayne@69
|
793 //-- Ignore if shadowed or already extended
|
jpayne@69
|
794 if ( ! target_reached )
|
jpayne@69
|
795 if ( CurrCp->wasFused ||
|
jpayne@69
|
796 isShadowedCluster (CurrCp, Alignments, CurrAp) )
|
jpayne@69
|
797 {
|
jpayne@69
|
798 CurrCp->wasFused = true;
|
jpayne@69
|
799 CurrCp = ++ PrevCp;
|
jpayne@69
|
800 continue;
|
jpayne@69
|
801 }
|
jpayne@69
|
802 }
|
jpayne@69
|
803
|
jpayne@69
|
804 //-- Initialize the right amino acid sequence for A and B if need be
|
jpayne@69
|
805 if ( A [CurrCp->frameA] == NULL )
|
jpayne@69
|
806 {
|
jpayne@69
|
807 A [CurrCp->frameA] = (char *) Safe_malloc
|
jpayne@69
|
808 ( sizeof(char) * ( (Af->len / 3) + 2) );
|
jpayne@69
|
809 A [CurrCp->frameA] [0] = '\0';
|
jpayne@69
|
810 Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA],
|
jpayne@69
|
811 CurrCp->frameA );
|
jpayne@69
|
812 }
|
jpayne@69
|
813 if ( B [CurrCp->frameB] == NULL )
|
jpayne@69
|
814 {
|
jpayne@69
|
815 B [CurrCp->frameB] = (char *) Safe_malloc
|
jpayne@69
|
816 ( sizeof(char) * ( (Bf->len / 3) + 2) );
|
jpayne@69
|
817 B [CurrCp->frameB] [0] = '\0';
|
jpayne@69
|
818 Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB],
|
jpayne@69
|
819 CurrCp->frameB );
|
jpayne@69
|
820 }
|
jpayne@69
|
821
|
jpayne@69
|
822 //-- Extend each match in the cluster
|
jpayne@69
|
823 for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ )
|
jpayne@69
|
824 {
|
jpayne@69
|
825 //-- Try to extend the current match backwards
|
jpayne@69
|
826 if ( target_reached )
|
jpayne@69
|
827 {
|
jpayne@69
|
828 //-- Merge with the previous match
|
jpayne@69
|
829 if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB )
|
jpayne@69
|
830 {
|
jpayne@69
|
831 //-- Strip matches until the targeted match is found
|
jpayne@69
|
832 assert (Mp < CurrCp->matches.end( ) - 1);
|
jpayne@69
|
833 continue;
|
jpayne@69
|
834 }
|
jpayne@69
|
835
|
jpayne@69
|
836 CurrAp->eA += Mp->len - 1;
|
jpayne@69
|
837 CurrAp->eB += Mp->len - 1;
|
jpayne@69
|
838 }
|
jpayne@69
|
839 else
|
jpayne@69
|
840 {
|
jpayne@69
|
841 //-- Create a new alignment object
|
jpayne@69
|
842 addNewAlignment (Alignments, CurrCp, Mp);
|
jpayne@69
|
843 CurrAp = Alignments.end( ) - 1;
|
jpayne@69
|
844
|
jpayne@69
|
845 if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) )
|
jpayne@69
|
846 {
|
jpayne@69
|
847 //-- Target the closest/best alignment object
|
jpayne@69
|
848 TargetAp = getReverseTargetAlignment (Alignments, CurrAp);
|
jpayne@69
|
849
|
jpayne@69
|
850 //-- Extend the new alignment object backwards
|
jpayne@69
|
851 if ( extendBackward (Alignments, CurrAp, TargetAp,
|
jpayne@69
|
852 A [CurrCp->frameA], B [CurrCp->frameB]) )
|
jpayne@69
|
853 CurrAp = TargetAp;
|
jpayne@69
|
854 }
|
jpayne@69
|
855 }
|
jpayne@69
|
856
|
jpayne@69
|
857 m_o = FORWARD_ALIGN;
|
jpayne@69
|
858
|
jpayne@69
|
859 //-- Try to extend the current match forwards
|
jpayne@69
|
860 if ( Mp < CurrCp->matches.end( ) - 1 )
|
jpayne@69
|
861 {
|
jpayne@69
|
862 //-- Target the next match in the cluster
|
jpayne@69
|
863 targetA = (Mp + 1)->sA;
|
jpayne@69
|
864 targetB = (Mp + 1)->sB;
|
jpayne@69
|
865
|
jpayne@69
|
866 //-- Extend the current alignment object forward
|
jpayne@69
|
867 target_reached = extendForward (CurrAp,
|
jpayne@69
|
868 A [CurrCp->frameA], targetA,
|
jpayne@69
|
869 B [CurrCp->frameB], targetB, m_o);
|
jpayne@69
|
870 }
|
jpayne@69
|
871 else if ( DO_EXTEND )
|
jpayne@69
|
872 {
|
jpayne@69
|
873 targetA = Alen [CurrCp->frameA];
|
jpayne@69
|
874 targetB = Blen [CurrCp->frameB];
|
jpayne@69
|
875
|
jpayne@69
|
876 //-- Target the closest/best match in a future cluster
|
jpayne@69
|
877 TargetCp = getForwardTargetCluster (Clusters, CurrCp,
|
jpayne@69
|
878 targetA, targetB);
|
jpayne@69
|
879
|
jpayne@69
|
880 if ( TargetCp == Clusters.end( ) )
|
jpayne@69
|
881 {
|
jpayne@69
|
882 m_o |= OPTIMAL_BIT;
|
jpayne@69
|
883 if ( TO_SEQEND )
|
jpayne@69
|
884 m_o |= SEQEND_BIT;
|
jpayne@69
|
885 }
|
jpayne@69
|
886
|
jpayne@69
|
887 //-- Extend the current alignment object forward
|
jpayne@69
|
888 target_reached = extendForward (CurrAp,
|
jpayne@69
|
889 A [CurrCp->frameA], targetA,
|
jpayne@69
|
890 B [CurrCp->frameB], targetB, m_o);
|
jpayne@69
|
891 }
|
jpayne@69
|
892 }
|
jpayne@69
|
893
|
jpayne@69
|
894 if ( TargetCp == Clusters.end( ) )
|
jpayne@69
|
895 target_reached = false;
|
jpayne@69
|
896
|
jpayne@69
|
897 CurrCp->wasFused = true;
|
jpayne@69
|
898
|
jpayne@69
|
899 if ( target_reached == false )
|
jpayne@69
|
900 CurrCp = ++ PrevCp;
|
jpayne@69
|
901 else
|
jpayne@69
|
902 CurrCp = TargetCp;
|
jpayne@69
|
903 }
|
jpayne@69
|
904
|
jpayne@69
|
905 #ifdef _DEBUG_ASSERT
|
jpayne@69
|
906 validateData (Alignments, Clusters, Af, Bf);
|
jpayne@69
|
907 #endif
|
jpayne@69
|
908
|
jpayne@69
|
909 //-- Output the alignment data to the delta file
|
jpayne@69
|
910 flushAlignments (Alignments, Af, Bf, DeltaFile);
|
jpayne@69
|
911
|
jpayne@69
|
912 for ( i = 0; i < 7; i ++ )
|
jpayne@69
|
913 {
|
jpayne@69
|
914 if ( A [i] != NULL )
|
jpayne@69
|
915 free ( A [i] );
|
jpayne@69
|
916 if ( B [i] != NULL )
|
jpayne@69
|
917 free ( B [i] );
|
jpayne@69
|
918 }
|
jpayne@69
|
919
|
jpayne@69
|
920 return;
|
jpayne@69
|
921 }
|
jpayne@69
|
922
|
jpayne@69
|
923
|
jpayne@69
|
924
|
jpayne@69
|
925
|
jpayne@69
|
926 void flushAlignments
|
jpayne@69
|
927 (vector<Alignment> & Alignments,
|
jpayne@69
|
928 const FastaRecord * Af, const FastaRecord * Bf,
|
jpayne@69
|
929 FILE * DeltaFile)
|
jpayne@69
|
930
|
jpayne@69
|
931 // Simply output the delta information stored in Alignments to the
|
jpayne@69
|
932 // given delta file. Free the memory used by Alignments once the
|
jpayne@69
|
933 // data is successfully output to the file.
|
jpayne@69
|
934
|
jpayne@69
|
935 {
|
jpayne@69
|
936 vector<Alignment>::iterator Ap; // alignment pointer
|
jpayne@69
|
937 vector<long int>::iterator Dp; // delta pointer
|
jpayne@69
|
938
|
jpayne@69
|
939 fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len);
|
jpayne@69
|
940
|
jpayne@69
|
941 //-- Generate the error counts
|
jpayne@69
|
942 parseDelta (Alignments, Af, Bf);
|
jpayne@69
|
943
|
jpayne@69
|
944 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
|
jpayne@69
|
945 {
|
jpayne@69
|
946 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
|
jpayne@69
|
947 transC (Ap->sA, Ap->frameA, Af->len),
|
jpayne@69
|
948 transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2),
|
jpayne@69
|
949 transC (Ap->sB, Ap->frameB, Bf->len),
|
jpayne@69
|
950 transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2),
|
jpayne@69
|
951 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
|
jpayne@69
|
952
|
jpayne@69
|
953 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
|
jpayne@69
|
954 fprintf (DeltaFile, "%ld\n", *Dp);
|
jpayne@69
|
955 fprintf (DeltaFile, "0\n");
|
jpayne@69
|
956
|
jpayne@69
|
957 Ap->delta.clear( );
|
jpayne@69
|
958 }
|
jpayne@69
|
959
|
jpayne@69
|
960 Alignments.clear( );
|
jpayne@69
|
961
|
jpayne@69
|
962 return;
|
jpayne@69
|
963 }
|
jpayne@69
|
964
|
jpayne@69
|
965
|
jpayne@69
|
966
|
jpayne@69
|
967
|
jpayne@69
|
968 void flushSyntenys
|
jpayne@69
|
969 (vector<Synteny> & Syntenys, FILE * ClusterFile)
|
jpayne@69
|
970
|
jpayne@69
|
971 // Simply output the synteny/cluster information generated by the mgaps
|
jpayne@69
|
972 // program. However, now the coordinates reference their appropriate
|
jpayne@69
|
973 // reference sequence, and the reference sequecne header is added to
|
jpayne@69
|
974 // the appropriate lines. Free the memory used by Syntenys once the
|
jpayne@69
|
975 // data is successfully output to the file.
|
jpayne@69
|
976
|
jpayne@69
|
977 {
|
jpayne@69
|
978 vector<Synteny>::iterator Sp; // synteny pointer
|
jpayne@69
|
979 vector<Cluster>::iterator Cp; // cluster pointer
|
jpayne@69
|
980 vector<Match>::iterator Mp; // match pointer
|
jpayne@69
|
981
|
jpayne@69
|
982 if ( ClusterFile )
|
jpayne@69
|
983 {
|
jpayne@69
|
984 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
|
jpayne@69
|
985 {
|
jpayne@69
|
986 fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id,
|
jpayne@69
|
987 Sp->AfP->len, Sp->Bf.len);
|
jpayne@69
|
988 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
|
jpayne@69
|
989 {
|
jpayne@69
|
990 fprintf (ClusterFile, "%2d %2d\n",
|
jpayne@69
|
991 Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA,
|
jpayne@69
|
992 Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB);
|
jpayne@69
|
993 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
|
jpayne@69
|
994 {
|
jpayne@69
|
995 fprintf (ClusterFile, "%8ld %8ld %6ld",
|
jpayne@69
|
996 transC (Mp->sA, Cp->frameA, Sp->AfP->len),
|
jpayne@69
|
997 transC (Mp->sB, Cp->frameB, Sp->Bf.len),
|
jpayne@69
|
998 Mp->len * 3);
|
jpayne@69
|
999
|
jpayne@69
|
1000 if ( Mp != Cp->matches.begin( ) )
|
jpayne@69
|
1001 fprintf (ClusterFile, "%6ld %6ld\n",
|
jpayne@69
|
1002 (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3,
|
jpayne@69
|
1003 (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3);
|
jpayne@69
|
1004 else
|
jpayne@69
|
1005 fprintf (ClusterFile, "%6s %6s\n", "-", "-");
|
jpayne@69
|
1006 }
|
jpayne@69
|
1007 Cp->matches.clear( );
|
jpayne@69
|
1008 }
|
jpayne@69
|
1009 Sp->clusters.clear( );
|
jpayne@69
|
1010 }
|
jpayne@69
|
1011 }
|
jpayne@69
|
1012 else
|
jpayne@69
|
1013 {
|
jpayne@69
|
1014 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
|
jpayne@69
|
1015 {
|
jpayne@69
|
1016 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
|
jpayne@69
|
1017 {
|
jpayne@69
|
1018 Cp->matches.clear( );
|
jpayne@69
|
1019 }
|
jpayne@69
|
1020 Sp->clusters.clear( );
|
jpayne@69
|
1021 }
|
jpayne@69
|
1022 }
|
jpayne@69
|
1023
|
jpayne@69
|
1024 Syntenys.clear( );
|
jpayne@69
|
1025
|
jpayne@69
|
1026 return;
|
jpayne@69
|
1027 }
|
jpayne@69
|
1028
|
jpayne@69
|
1029
|
jpayne@69
|
1030
|
jpayne@69
|
1031
|
jpayne@69
|
1032
|
jpayne@69
|
1033 vector<Cluster>::iterator getForwardTargetCluster
|
jpayne@69
|
1034 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
|
jpayne@69
|
1035 long int & targetA, long int & targetB)
|
jpayne@69
|
1036
|
jpayne@69
|
1037 // Return the cluster that is most likely to successfully join (in a
|
jpayne@69
|
1038 // forward direction) with the current cluster. The returned cluster
|
jpayne@69
|
1039 // must contain 1 or more matches that are strictly greater than the end
|
jpayne@69
|
1040 // of the current cluster. The targeted cluster must also be on a
|
jpayne@69
|
1041 // diagonal close enough to the current cluster, so that a connection
|
jpayne@69
|
1042 // could possibly be made by the alignment extender. Also, this targeted
|
jpayne@69
|
1043 // cluster must be consistent in frame with the current cluster. Assumes
|
jpayne@69
|
1044 // clusters have been sorted via AscendingClusterSort. Returns targeted
|
jpayne@69
|
1045 // cluster and stores the target coordinates in targetA and targetB. If no
|
jpayne@69
|
1046 // suitable cluster was found, the function will return NULL and target
|
jpayne@69
|
1047 // A and targetB will remain unchanged.
|
jpayne@69
|
1048
|
jpayne@69
|
1049 {
|
jpayne@69
|
1050 vector<Match>::iterator Mip; // match iteratrive pointer
|
jpayne@69
|
1051 vector<Cluster>::iterator Cp; // cluster pointer
|
jpayne@69
|
1052 vector<Cluster>::iterator Cip; // cluster iterative pointer
|
jpayne@69
|
1053 long int eA, eB; // possible target
|
jpayne@69
|
1054 long int greater, lesser; // gap sizes between two clusters
|
jpayne@69
|
1055 long int sA = CurrCp->matches.rbegin( )->sA +
|
jpayne@69
|
1056 CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster
|
jpayne@69
|
1057 long int sB = CurrCp->matches.rbegin( )->sB +
|
jpayne@69
|
1058 CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster
|
jpayne@69
|
1059
|
jpayne@69
|
1060 //-- End of sequences is the default target, set distance accordingly
|
jpayne@69
|
1061 long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB);
|
jpayne@69
|
1062
|
jpayne@69
|
1063 //-- For all clusters greater than the current cluster (on sequence A)
|
jpayne@69
|
1064 Cp = Clusters.end( );
|
jpayne@69
|
1065 for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ )
|
jpayne@69
|
1066 {
|
jpayne@69
|
1067 //-- If the cluster is in the same frame
|
jpayne@69
|
1068 if ( CurrCp->frameA == Cip->frameA && CurrCp->frameB == Cip->frameB )
|
jpayne@69
|
1069 {
|
jpayne@69
|
1070 eA = Cip->matches.begin( )->sA;
|
jpayne@69
|
1071 eB = Cip->matches.begin( )->sB;
|
jpayne@69
|
1072
|
jpayne@69
|
1073 //-- If the cluster overlaps the current cluster, strip some matches
|
jpayne@69
|
1074 if ( ( eA < sA || eB < sB ) &&
|
jpayne@69
|
1075 Cip->matches.rbegin( )->sA >= sA &&
|
jpayne@69
|
1076 Cip->matches.rbegin( )->sB >= sB )
|
jpayne@69
|
1077 {
|
jpayne@69
|
1078 for ( Mip = Cip->matches.begin( );
|
jpayne@69
|
1079 Mip < Cip->matches.end( ) && ( eA < sA || eB < sB );
|
jpayne@69
|
1080 Mip ++ )
|
jpayne@69
|
1081 {
|
jpayne@69
|
1082 eA = Mip->sA;
|
jpayne@69
|
1083 eB = Mip->sB;
|
jpayne@69
|
1084 }
|
jpayne@69
|
1085 }
|
jpayne@69
|
1086
|
jpayne@69
|
1087 //-- If the cluster is strictly greater than current cluster
|
jpayne@69
|
1088 if ( eA >= sA && eB >= sB )
|
jpayne@69
|
1089 {
|
jpayne@69
|
1090 if ( eA - sA > eB - sB )
|
jpayne@69
|
1091 {
|
jpayne@69
|
1092 greater = eA - sA;
|
jpayne@69
|
1093 lesser = eB - sB;
|
jpayne@69
|
1094 }
|
jpayne@69
|
1095 else
|
jpayne@69
|
1096 {
|
jpayne@69
|
1097 lesser = eA - sA;
|
jpayne@69
|
1098 greater = eB - sB;
|
jpayne@69
|
1099 }
|
jpayne@69
|
1100
|
jpayne@69
|
1101 //-- If the cluster is close enough
|
jpayne@69
|
1102 if ( greater <= getBreakLen( ) ||
|
jpayne@69
|
1103 (lesser) * GOOD_SCORE [getMatrixType( )] +
|
jpayne@69
|
1104 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
|
jpayne@69
|
1105 {
|
jpayne@69
|
1106 Cp = Cip;
|
jpayne@69
|
1107 targetA = eA;
|
jpayne@69
|
1108 targetB = eB;
|
jpayne@69
|
1109 break;
|
jpayne@69
|
1110 }
|
jpayne@69
|
1111 else if ( (greater << 1) - lesser < dist )
|
jpayne@69
|
1112 {
|
jpayne@69
|
1113 Cp = Cip;
|
jpayne@69
|
1114 targetA = eA;
|
jpayne@69
|
1115 targetB = eB;
|
jpayne@69
|
1116 dist = (greater << 1) - lesser;
|
jpayne@69
|
1117 }
|
jpayne@69
|
1118 }
|
jpayne@69
|
1119 }
|
jpayne@69
|
1120 }
|
jpayne@69
|
1121
|
jpayne@69
|
1122
|
jpayne@69
|
1123 return Cp;
|
jpayne@69
|
1124 }
|
jpayne@69
|
1125
|
jpayne@69
|
1126
|
jpayne@69
|
1127
|
jpayne@69
|
1128
|
jpayne@69
|
1129
|
jpayne@69
|
1130 vector<Alignment>::iterator getReverseTargetAlignment
|
jpayne@69
|
1131 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp)
|
jpayne@69
|
1132
|
jpayne@69
|
1133 // Return the alignment that is most likely to successfully join (in a
|
jpayne@69
|
1134 // reverse direction) with the current alignment. The returned alignment
|
jpayne@69
|
1135 // must be strictly less than the current cluster and be on a diagonal
|
jpayne@69
|
1136 // close enough to the current alignment, so that a connection
|
jpayne@69
|
1137 // could possibly be made by the alignment extender. Also, this targeted
|
jpayne@69
|
1138 // alignment must be consistent in frame with the current alignment.
|
jpayne@69
|
1139 // Assumes clusters have been sorted via AscendingClusterSort and
|
jpayne@69
|
1140 // processed in order, so therefore all alignments are in order by their
|
jpayne@69
|
1141 // start A coordinate.
|
jpayne@69
|
1142
|
jpayne@69
|
1143 {
|
jpayne@69
|
1144 vector<Alignment>::iterator Ap; // alignment pointer
|
jpayne@69
|
1145 vector<Alignment>::iterator Aip; // alignment iterative pointer
|
jpayne@69
|
1146 long int eA, eB; // possible targets
|
jpayne@69
|
1147 long int greater, lesser; // gap sizes between the two alignments
|
jpayne@69
|
1148 long int sA = CurrAp->sA; // the startA of the current alignment
|
jpayne@69
|
1149 long int sB = CurrAp->sB; // the startB of the current alignment
|
jpayne@69
|
1150
|
jpayne@69
|
1151 //-- Beginning of sequences is the default target, set distance accordingly
|
jpayne@69
|
1152 long int dist = (sA < sB ? sA : sB);
|
jpayne@69
|
1153
|
jpayne@69
|
1154 //-- For all alignments less than the current alignment (on sequence A)
|
jpayne@69
|
1155 Ap = Alignments.end( );
|
jpayne@69
|
1156 for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- )
|
jpayne@69
|
1157 {
|
jpayne@69
|
1158 //-- If the alignment is on the same direction
|
jpayne@69
|
1159 if ( CurrAp->frameA == Aip->frameA && CurrAp->frameB == Aip->frameB )
|
jpayne@69
|
1160 {
|
jpayne@69
|
1161 eA = Aip->eA;
|
jpayne@69
|
1162 eB = Aip->eB;
|
jpayne@69
|
1163
|
jpayne@69
|
1164 //-- If the alignment is strictly greater than current cluster
|
jpayne@69
|
1165 if ( eA <= sA && eB <= sB )
|
jpayne@69
|
1166 {
|
jpayne@69
|
1167 if ( sA - eA > sB - eB )
|
jpayne@69
|
1168 {
|
jpayne@69
|
1169 greater = sA - eA;
|
jpayne@69
|
1170 lesser = sB - eB;
|
jpayne@69
|
1171 }
|
jpayne@69
|
1172 else
|
jpayne@69
|
1173 {
|
jpayne@69
|
1174 lesser = sA - eA;
|
jpayne@69
|
1175 greater = sB - eB;
|
jpayne@69
|
1176 }
|
jpayne@69
|
1177
|
jpayne@69
|
1178 //-- If the cluster is close enough
|
jpayne@69
|
1179 if ( greater <= getBreakLen( ) ||
|
jpayne@69
|
1180 (lesser) * GOOD_SCORE [getMatrixType( )] +
|
jpayne@69
|
1181 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
|
jpayne@69
|
1182 {
|
jpayne@69
|
1183 Ap = Aip;
|
jpayne@69
|
1184 break;
|
jpayne@69
|
1185 }
|
jpayne@69
|
1186 else if ( (greater << 1) - lesser < dist )
|
jpayne@69
|
1187 {
|
jpayne@69
|
1188 Ap = Aip;
|
jpayne@69
|
1189 dist = (greater << 1) - lesser;
|
jpayne@69
|
1190 }
|
jpayne@69
|
1191 }
|
jpayne@69
|
1192 }
|
jpayne@69
|
1193 }
|
jpayne@69
|
1194
|
jpayne@69
|
1195
|
jpayne@69
|
1196 return Ap;
|
jpayne@69
|
1197 }
|
jpayne@69
|
1198
|
jpayne@69
|
1199
|
jpayne@69
|
1200
|
jpayne@69
|
1201
|
jpayne@69
|
1202
|
jpayne@69
|
1203 bool isShadowedCluster
|
jpayne@69
|
1204 (vector<Cluster>::iterator CurrCp,
|
jpayne@69
|
1205 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap)
|
jpayne@69
|
1206
|
jpayne@69
|
1207 // Check if the current cluster is shadowed by a previously produced
|
jpayne@69
|
1208 // alignment region. Return true if it is, else false.
|
jpayne@69
|
1209
|
jpayne@69
|
1210 {
|
jpayne@69
|
1211 vector<Alignment>::iterator Aip; // alignment pointer
|
jpayne@69
|
1212
|
jpayne@69
|
1213 long int sA = CurrCp->matches.begin( )->sA;
|
jpayne@69
|
1214 long int eA = CurrCp->matches.rbegin( )->sA +
|
jpayne@69
|
1215 CurrCp->matches.rbegin( )->len - 1;
|
jpayne@69
|
1216 long int sB = CurrCp->matches.begin( )->sB;
|
jpayne@69
|
1217 long int eB = CurrCp->matches.rbegin( )->sB +
|
jpayne@69
|
1218 CurrCp->matches.rbegin( )->len - 1;
|
jpayne@69
|
1219
|
jpayne@69
|
1220 if ( ! Alignments.empty( ) ) // if there are alignments to use
|
jpayne@69
|
1221 {
|
jpayne@69
|
1222 //-- Look backwards in hope of finding a shadowing alignment
|
jpayne@69
|
1223 for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- )
|
jpayne@69
|
1224 {
|
jpayne@69
|
1225 //-- If in the same frames and shadowing the current cluster, break
|
jpayne@69
|
1226 if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB )
|
jpayne@69
|
1227 if ( Aip->eA >= eA && Aip->eB >= eB )
|
jpayne@69
|
1228 if ( Aip->sA <= sA && Aip->sB <= sB )
|
jpayne@69
|
1229 break;
|
jpayne@69
|
1230 }
|
jpayne@69
|
1231
|
jpayne@69
|
1232 //-- Return true if the loop was broken, i.e. shadow found
|
jpayne@69
|
1233 if ( Aip >= Alignments.begin( ) )
|
jpayne@69
|
1234 return true;
|
jpayne@69
|
1235 }
|
jpayne@69
|
1236
|
jpayne@69
|
1237 //-- Return false if Alignments was empty or loop was not broken
|
jpayne@69
|
1238 return false;
|
jpayne@69
|
1239 }
|
jpayne@69
|
1240
|
jpayne@69
|
1241
|
jpayne@69
|
1242
|
jpayne@69
|
1243
|
jpayne@69
|
1244
|
jpayne@69
|
1245
|
jpayne@69
|
1246 void parseAbort
|
jpayne@69
|
1247 (const char * s)
|
jpayne@69
|
1248
|
jpayne@69
|
1249 // Abort the program if there was an error in parsing file 's'
|
jpayne@69
|
1250
|
jpayne@69
|
1251 {
|
jpayne@69
|
1252 fprintf (stderr,
|
jpayne@69
|
1253 "ERROR: Could not parse input from '%s'. \n"
|
jpayne@69
|
1254 "Please check the filename and format, or file a bug report\n", s);
|
jpayne@69
|
1255 exit (EXIT_FAILURE);
|
jpayne@69
|
1256 }
|
jpayne@69
|
1257
|
jpayne@69
|
1258
|
jpayne@69
|
1259
|
jpayne@69
|
1260
|
jpayne@69
|
1261 void parseDelta
|
jpayne@69
|
1262 (vector<Alignment> & Alignments,
|
jpayne@69
|
1263 const FastaRecord * Af, const FastaRecord *Bf)
|
jpayne@69
|
1264
|
jpayne@69
|
1265 // Use the delta information to generate the error counts for each
|
jpayne@69
|
1266 // alignment, and fill this information into the data type
|
jpayne@69
|
1267
|
jpayne@69
|
1268 {
|
jpayne@69
|
1269 // pointers to Af.seq and Bf.seq
|
jpayne@69
|
1270 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
|
jpayne@69
|
1271 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
|
jpayne@69
|
1272
|
jpayne@69
|
1273 char ch1, ch2;
|
jpayne@69
|
1274 long int Delta;
|
jpayne@69
|
1275 int Sign;
|
jpayne@69
|
1276 long int i, Apos, Bpos;
|
jpayne@69
|
1277 long int Remain, Total;
|
jpayne@69
|
1278 long int Errors, SimErrors;
|
jpayne@69
|
1279 long int NonAlphas;
|
jpayne@69
|
1280 vector<Alignment>::iterator Ap;
|
jpayne@69
|
1281 vector<long int>::iterator Dp;
|
jpayne@69
|
1282
|
jpayne@69
|
1283 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
|
jpayne@69
|
1284 {
|
jpayne@69
|
1285 if ( A [Ap->frameA] == NULL )
|
jpayne@69
|
1286 {
|
jpayne@69
|
1287 A [Ap->frameA] = (char *) Safe_malloc
|
jpayne@69
|
1288 ( sizeof(char) * ( (Af->len / 3) + 2 ) );
|
jpayne@69
|
1289 A [Ap->frameA] [0] = '\0';
|
jpayne@69
|
1290 Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA );
|
jpayne@69
|
1291 }
|
jpayne@69
|
1292 if ( B [Ap->frameB] == NULL )
|
jpayne@69
|
1293 {
|
jpayne@69
|
1294 B [Ap->frameB] = (char *) Safe_malloc
|
jpayne@69
|
1295 ( sizeof(char) * ( (Bf->len / 3) + 2) );
|
jpayne@69
|
1296 B [Ap->frameB] [0] = '\0';
|
jpayne@69
|
1297 Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB );
|
jpayne@69
|
1298 }
|
jpayne@69
|
1299
|
jpayne@69
|
1300 Apos = Ap->sA;
|
jpayne@69
|
1301 Bpos = Ap->sB;
|
jpayne@69
|
1302
|
jpayne@69
|
1303 Errors = 0;
|
jpayne@69
|
1304 SimErrors = 0;
|
jpayne@69
|
1305 NonAlphas = 0;
|
jpayne@69
|
1306 Remain = Ap->eA - Ap->sA + 1;
|
jpayne@69
|
1307 Total = Remain;
|
jpayne@69
|
1308
|
jpayne@69
|
1309 //-- For all delta's in this alignment
|
jpayne@69
|
1310 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
|
jpayne@69
|
1311 {
|
jpayne@69
|
1312 Delta = *Dp;
|
jpayne@69
|
1313 Sign = Delta > 0 ? 1 : -1;
|
jpayne@69
|
1314 Delta = labs ( Delta );
|
jpayne@69
|
1315
|
jpayne@69
|
1316 //-- For all the bases before the next indel
|
jpayne@69
|
1317 for ( i = 1; i < Delta; i ++ )
|
jpayne@69
|
1318 {
|
jpayne@69
|
1319 ch1 = A [Ap->frameA] [Apos ++];
|
jpayne@69
|
1320 ch2 = B [Ap->frameB] [Bpos ++];
|
jpayne@69
|
1321
|
jpayne@69
|
1322 if ( !isalpha (ch1) )
|
jpayne@69
|
1323 {
|
jpayne@69
|
1324 ch1 = STOP_CHAR;
|
jpayne@69
|
1325 NonAlphas ++;
|
jpayne@69
|
1326 }
|
jpayne@69
|
1327 if ( !isalpha (ch2) )
|
jpayne@69
|
1328 {
|
jpayne@69
|
1329 ch2 = STOP_CHAR;
|
jpayne@69
|
1330 NonAlphas ++;
|
jpayne@69
|
1331 }
|
jpayne@69
|
1332
|
jpayne@69
|
1333 ch1 = toupper(ch1);
|
jpayne@69
|
1334 ch2 = toupper(ch2);
|
jpayne@69
|
1335 if ( 1 > MATCH_SCORE
|
jpayne@69
|
1336 [getMatrixType( )]
|
jpayne@69
|
1337 [ch1 - 'A']
|
jpayne@69
|
1338 [ch2 - 'A'] )
|
jpayne@69
|
1339 SimErrors ++;
|
jpayne@69
|
1340 if ( ch1 != ch2 )
|
jpayne@69
|
1341 Errors ++;
|
jpayne@69
|
1342 }
|
jpayne@69
|
1343
|
jpayne@69
|
1344 //-- Process the current indel
|
jpayne@69
|
1345 Remain -= i - 1;
|
jpayne@69
|
1346 Errors ++;
|
jpayne@69
|
1347 SimErrors ++;
|
jpayne@69
|
1348
|
jpayne@69
|
1349 if ( Sign == 1 )
|
jpayne@69
|
1350 {
|
jpayne@69
|
1351 if ( !isalpha (A [Ap->frameA] [Apos ++]) )
|
jpayne@69
|
1352 NonAlphas ++;
|
jpayne@69
|
1353 Remain --;
|
jpayne@69
|
1354 }
|
jpayne@69
|
1355 else
|
jpayne@69
|
1356 {
|
jpayne@69
|
1357 if ( !isalpha (B [Ap->frameB] [Bpos ++]) )
|
jpayne@69
|
1358 NonAlphas ++;
|
jpayne@69
|
1359 Total ++;
|
jpayne@69
|
1360 }
|
jpayne@69
|
1361 }
|
jpayne@69
|
1362
|
jpayne@69
|
1363 //-- For all the bases after the final indel
|
jpayne@69
|
1364 for ( i = 0; i < Remain; i ++ )
|
jpayne@69
|
1365 {
|
jpayne@69
|
1366 //-- Score character match and update error counters
|
jpayne@69
|
1367 ch1 = A [Ap->frameA] [Apos ++];
|
jpayne@69
|
1368 ch2 = B [Ap->frameB] [Bpos ++];
|
jpayne@69
|
1369
|
jpayne@69
|
1370 if ( !isalpha (ch1) )
|
jpayne@69
|
1371 {
|
jpayne@69
|
1372 ch1 = STOP_CHAR;
|
jpayne@69
|
1373 NonAlphas ++;
|
jpayne@69
|
1374 }
|
jpayne@69
|
1375 if ( !isalpha (ch2) )
|
jpayne@69
|
1376 {
|
jpayne@69
|
1377 ch2 = STOP_CHAR;
|
jpayne@69
|
1378 NonAlphas ++;
|
jpayne@69
|
1379 }
|
jpayne@69
|
1380
|
jpayne@69
|
1381 ch1 = toupper(ch1);
|
jpayne@69
|
1382 ch2 = toupper(ch2);
|
jpayne@69
|
1383 if ( 1 > MATCH_SCORE
|
jpayne@69
|
1384 [getMatrixType( )]
|
jpayne@69
|
1385 [ch1 - 'A']
|
jpayne@69
|
1386 [ch2 - 'A'] )
|
jpayne@69
|
1387 SimErrors ++;
|
jpayne@69
|
1388 if ( ch1 != ch2 )
|
jpayne@69
|
1389 Errors ++;
|
jpayne@69
|
1390 }
|
jpayne@69
|
1391
|
jpayne@69
|
1392 Ap->Errors = Errors;
|
jpayne@69
|
1393 Ap->SimErrors = SimErrors;
|
jpayne@69
|
1394 Ap->NonAlphas = NonAlphas;
|
jpayne@69
|
1395 }
|
jpayne@69
|
1396
|
jpayne@69
|
1397 for ( i = 1; i <= 6; i ++ )
|
jpayne@69
|
1398 {
|
jpayne@69
|
1399 if ( A [i] != NULL )
|
jpayne@69
|
1400 free ( A [i] );
|
jpayne@69
|
1401 A [i] = NULL;
|
jpayne@69
|
1402
|
jpayne@69
|
1403 if ( B [i] != NULL )
|
jpayne@69
|
1404 free ( B [i] );
|
jpayne@69
|
1405 B [i] = NULL;
|
jpayne@69
|
1406 }
|
jpayne@69
|
1407
|
jpayne@69
|
1408 return;
|
jpayne@69
|
1409 }
|
jpayne@69
|
1410
|
jpayne@69
|
1411
|
jpayne@69
|
1412
|
jpayne@69
|
1413
|
jpayne@69
|
1414 void processSyntenys
|
jpayne@69
|
1415 (vector<Synteny> & Syntenys, FastaRecord * Af, long int As,
|
jpayne@69
|
1416 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile)
|
jpayne@69
|
1417
|
jpayne@69
|
1418 // For each syntenic region with clusters, read in the B sequence and
|
jpayne@69
|
1419 // extend the clusters to expand total alignment coverage. Clusters should
|
jpayne@69
|
1420 // still reference the amino acid concatenations, the translation back to
|
jpayne@69
|
1421 // DNA will be taken care of by the extendClusters function.
|
jpayne@69
|
1422 // processSyntenys should ONLY be called once all the clusters for
|
jpayne@69
|
1423 // the contained syntenic regions have been stored in the data structure.
|
jpayne@69
|
1424 // Frees the memory used by the the syntenic regions once the output of
|
jpayne@69
|
1425 // extendClusters and flushSyntenys has been produced.
|
jpayne@69
|
1426
|
jpayne@69
|
1427 {
|
jpayne@69
|
1428 FastaRecord Bf; // the current B sequence
|
jpayne@69
|
1429 Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1));
|
jpayne@69
|
1430
|
jpayne@69
|
1431 long int InitSize = INIT_SIZE; // the initial memory capacity of B
|
jpayne@69
|
1432 vector<Synteny>::iterator CurrSp; // current synteny pointer
|
jpayne@69
|
1433
|
jpayne@69
|
1434 //-- Initialize the B sequence storage
|
jpayne@69
|
1435 Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
|
jpayne@69
|
1436
|
jpayne@69
|
1437 //-- For all the contained syntenys
|
jpayne@69
|
1438 for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ )
|
jpayne@69
|
1439 {
|
jpayne@69
|
1440 //-- If no clusters, ignore
|
jpayne@69
|
1441 if ( CurrSp->clusters.empty( ) )
|
jpayne@69
|
1442 continue;
|
jpayne@69
|
1443
|
jpayne@69
|
1444 //-- If a B sequence not seen yet, read it in
|
jpayne@69
|
1445 //-- IMPORTANT: The B sequences in the synteny object are assumed to be
|
jpayne@69
|
1446 // ordered as output by mgaps, if they are not in order the program
|
jpayne@69
|
1447 // will fail. (All like tags must be adjacent and in the same order
|
jpayne@69
|
1448 // as the query file)
|
jpayne@69
|
1449 if ( CurrSp == Syntenys.begin( ) ||
|
jpayne@69
|
1450 strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 )
|
jpayne@69
|
1451 {
|
jpayne@69
|
1452 //-- Read in the B sequence
|
jpayne@69
|
1453 while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) )
|
jpayne@69
|
1454 if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 )
|
jpayne@69
|
1455 break;
|
jpayne@69
|
1456 if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 )
|
jpayne@69
|
1457 {
|
jpayne@69
|
1458 fprintf(stderr,"%s\n", CurrSp->Bf.Id);
|
jpayne@69
|
1459 parseAbort ( "Query File" );
|
jpayne@69
|
1460 }
|
jpayne@69
|
1461 Bf.len = strlen (Bf.seq + 1);
|
jpayne@69
|
1462 }
|
jpayne@69
|
1463
|
jpayne@69
|
1464 //-- Extend clusters and create the alignment information
|
jpayne@69
|
1465 CurrSp->Bf.len = Bf.len;
|
jpayne@69
|
1466 extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile);
|
jpayne@69
|
1467 }
|
jpayne@69
|
1468
|
jpayne@69
|
1469 //-- Create the cluster information and clear the Synteny structure
|
jpayne@69
|
1470 flushSyntenys (Syntenys, ClusterFile);
|
jpayne@69
|
1471
|
jpayne@69
|
1472 free (Bf.Id);
|
jpayne@69
|
1473 free (Bf.seq);
|
jpayne@69
|
1474
|
jpayne@69
|
1475 return;
|
jpayne@69
|
1476 }
|
jpayne@69
|
1477
|
jpayne@69
|
1478
|
jpayne@69
|
1479
|
jpayne@69
|
1480 inline long int transC
|
jpayne@69
|
1481 (long int Coord, int Frame, long int Len)
|
jpayne@69
|
1482
|
jpayne@69
|
1483 // Translate an amino acid coordinate to a nucleotide coordinate
|
jpayne@69
|
1484 // relative to the forward strand. The Len parameter should be the
|
jpayne@69
|
1485 // length of the original DNA strand.
|
jpayne@69
|
1486
|
jpayne@69
|
1487 {
|
jpayne@69
|
1488 assert ( Frame > 0 && Frame < 7 );
|
jpayne@69
|
1489 if ( Frame > 3 )
|
jpayne@69
|
1490 return revC ( (Coord * 3) - (3 - (Frame - 3)), Len );
|
jpayne@69
|
1491 else
|
jpayne@69
|
1492 return (Coord * 3) - (3 - (Frame));
|
jpayne@69
|
1493 }
|
jpayne@69
|
1494
|
jpayne@69
|
1495
|
jpayne@69
|
1496
|
jpayne@69
|
1497
|
jpayne@69
|
1498 inline long int refLen
|
jpayne@69
|
1499 (long int ntLen)
|
jpayne@69
|
1500
|
jpayne@69
|
1501 // Return the length of the concatenated amino acid sequence frames,
|
jpayne@69
|
1502 // including the appended 'X' at the end of each sequence frame for the
|
jpayne@69
|
1503 // given DNA input length. (The returned length will be the length of
|
jpayne@69
|
1504 // the concatenated frames for this sequence as output by prepro.cc)
|
jpayne@69
|
1505
|
jpayne@69
|
1506 {
|
jpayne@69
|
1507 return (ntLen + 1) << 1;
|
jpayne@69
|
1508 }
|
jpayne@69
|
1509
|
jpayne@69
|
1510
|
jpayne@69
|
1511
|
jpayne@69
|
1512
|
jpayne@69
|
1513 inline long int revC
|
jpayne@69
|
1514 (long int Coord, long int Len)
|
jpayne@69
|
1515
|
jpayne@69
|
1516 // Reverse complement the given coordinate for the given length.
|
jpayne@69
|
1517
|
jpayne@69
|
1518 {
|
jpayne@69
|
1519 assert (Len - Coord + 1 > 0);
|
jpayne@69
|
1520 return (Len - Coord + 1);
|
jpayne@69
|
1521 }
|
jpayne@69
|
1522
|
jpayne@69
|
1523
|
jpayne@69
|
1524
|
jpayne@69
|
1525
|
jpayne@69
|
1526 void printHelp
|
jpayne@69
|
1527 (const char * s)
|
jpayne@69
|
1528
|
jpayne@69
|
1529 // Display the program's help information to stderr.
|
jpayne@69
|
1530
|
jpayne@69
|
1531 {
|
jpayne@69
|
1532 fprintf (stderr,
|
jpayne@69
|
1533 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
|
jpayne@69
|
1534 fprintf (stderr,
|
jpayne@69
|
1535 "-b int set the alignment break (give-up) length to int (amino acids)\n"
|
jpayne@69
|
1536 "-d output only match clusters rather than extended alignments\n"
|
jpayne@69
|
1537 "-e do not extend alignments outward from clusters\n"
|
jpayne@69
|
1538 "-h display help information\n"
|
jpayne@69
|
1539 "-t force alignment to ends of sequence if within -b distance\n"
|
jpayne@69
|
1540 "-x type set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n"
|
jpayne@69
|
1541 " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n");
|
jpayne@69
|
1542 fprintf (stderr,
|
jpayne@69
|
1543 " Input is the output of the \"mgaps\" program from stdin, and\n"
|
jpayne@69
|
1544 "the two original PROmer sequence files passed on the command\n"
|
jpayne@69
|
1545 "line. <pfx> is the prefix to be added to the front of the\n"
|
jpayne@69
|
1546 "output file <pfx>.delta\n"
|
jpayne@69
|
1547 " <pfx>.delta is the alignment object that catalogs the distance\n"
|
jpayne@69
|
1548 "between insertions and deletions. For further information\n"
|
jpayne@69
|
1549 "regarding this file, please refer to the documentation under\n"
|
jpayne@69
|
1550 "the .delta output description.\n\n");
|
jpayne@69
|
1551 return;
|
jpayne@69
|
1552 }
|
jpayne@69
|
1553
|
jpayne@69
|
1554
|
jpayne@69
|
1555
|
jpayne@69
|
1556
|
jpayne@69
|
1557 void printUsage
|
jpayne@69
|
1558 (const char * s)
|
jpayne@69
|
1559
|
jpayne@69
|
1560 // Display the program's usage information to stderr.
|
jpayne@69
|
1561
|
jpayne@69
|
1562 {
|
jpayne@69
|
1563 fprintf (stderr,
|
jpayne@69
|
1564 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
|
jpayne@69
|
1565 fprintf (stderr, "Try '%s -h' for more information.\n", s);
|
jpayne@69
|
1566 return;
|
jpayne@69
|
1567 }
|
jpayne@69
|
1568
|
jpayne@69
|
1569
|
jpayne@69
|
1570
|
jpayne@69
|
1571
|
jpayne@69
|
1572 void validateData
|
jpayne@69
|
1573 (vector<Alignment> Alignments, vector<Cluster> Clusters,
|
jpayne@69
|
1574 const FastaRecord * Af, const FastaRecord * Bf)
|
jpayne@69
|
1575
|
jpayne@69
|
1576 // Self test function to check that the cluster and alignment information
|
jpayne@69
|
1577 // is valid
|
jpayne@69
|
1578
|
jpayne@69
|
1579 {
|
jpayne@69
|
1580 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
|
jpayne@69
|
1581 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
|
jpayne@69
|
1582
|
jpayne@69
|
1583 long int Alen [7], Blen [7]; // the length of the amino acid sequences
|
jpayne@69
|
1584 vector<Cluster>::iterator Cp;
|
jpayne@69
|
1585 vector<Match>::iterator Mp;
|
jpayne@69
|
1586 vector<Alignment>::iterator Ap;
|
jpayne@69
|
1587 long int x, y, i;
|
jpayne@69
|
1588 char Xc, Yc;
|
jpayne@69
|
1589
|
jpayne@69
|
1590 for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ )
|
jpayne@69
|
1591 {
|
jpayne@69
|
1592 assert ( Cp->wasFused );
|
jpayne@69
|
1593
|
jpayne@69
|
1594 //-- Initialize the right amino acid sequence for A and B if need be
|
jpayne@69
|
1595 if ( A [Cp->frameA] == NULL )
|
jpayne@69
|
1596 {
|
jpayne@69
|
1597 A [Cp->frameA] = (char *) Safe_malloc
|
jpayne@69
|
1598 ( sizeof(char) * ( (Af->len / 3) + 2) );
|
jpayne@69
|
1599 A [Cp->frameA] [0] = '\0';
|
jpayne@69
|
1600 Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA],
|
jpayne@69
|
1601 Cp->frameA );
|
jpayne@69
|
1602 }
|
jpayne@69
|
1603 if ( B [Cp->frameB] == NULL )
|
jpayne@69
|
1604 {
|
jpayne@69
|
1605 B [Cp->frameB] = (char *) Safe_malloc
|
jpayne@69
|
1606 ( sizeof(char) * ( (Bf->len / 3) + 2) );
|
jpayne@69
|
1607 B [Cp->frameB] [0] = '\0';
|
jpayne@69
|
1608 Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB],
|
jpayne@69
|
1609 Cp->frameB );
|
jpayne@69
|
1610 }
|
jpayne@69
|
1611
|
jpayne@69
|
1612 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
|
jpayne@69
|
1613 {
|
jpayne@69
|
1614 //-- assert for each match in cluster, it is indeed a match
|
jpayne@69
|
1615 x = Mp->sA;
|
jpayne@69
|
1616 y = Mp->sB;
|
jpayne@69
|
1617 for ( i = 0; i < Mp->len; i ++ )
|
jpayne@69
|
1618 assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] );
|
jpayne@69
|
1619
|
jpayne@69
|
1620 //-- assert for each match in cluster, it is contained in an alignment
|
jpayne@69
|
1621 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
|
jpayne@69
|
1622 {
|
jpayne@69
|
1623 if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB &&
|
jpayne@69
|
1624 Ap->eA >= Mp->sA + Mp->len - 1 &&
|
jpayne@69
|
1625 Ap->eB >= Mp->sB + Mp->len - 1 )
|
jpayne@69
|
1626 break;
|
jpayne@69
|
1627 }
|
jpayne@69
|
1628 assert ( Ap < Alignments.end( ) );
|
jpayne@69
|
1629 }
|
jpayne@69
|
1630 }
|
jpayne@69
|
1631
|
jpayne@69
|
1632 //-- assert alignments are optimal (quick check if first and last chars equal)
|
jpayne@69
|
1633 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
|
jpayne@69
|
1634 {
|
jpayne@69
|
1635 assert ( Ap->sA <= Ap->eA );
|
jpayne@69
|
1636 assert ( Ap->sB <= Ap->eB );
|
jpayne@69
|
1637
|
jpayne@69
|
1638 assert ( Ap->sA >= 1 && Ap->sA <= Af->len );
|
jpayne@69
|
1639 assert ( Ap->eA >= 1 && Ap->eA <= Af->len );
|
jpayne@69
|
1640 assert ( Ap->sB >= 1 && Ap->sB <= Bf->len );
|
jpayne@69
|
1641 assert ( Ap->eB >= 1 && Ap->eB <= Bf->len );
|
jpayne@69
|
1642
|
jpayne@69
|
1643 Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ?
|
jpayne@69
|
1644 A[Ap->frameA][Ap->sA] : STOP_CHAR);
|
jpayne@69
|
1645 Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ?
|
jpayne@69
|
1646 B[Ap->frameB][Ap->sB] : STOP_CHAR);
|
jpayne@69
|
1647 assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
|
jpayne@69
|
1648
|
jpayne@69
|
1649 Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ?
|
jpayne@69
|
1650 A[Ap->frameA][Ap->eA] : STOP_CHAR);
|
jpayne@69
|
1651 Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ?
|
jpayne@69
|
1652 B[Ap->frameB][Ap->eB] : STOP_CHAR);
|
jpayne@69
|
1653 assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
|
jpayne@69
|
1654 }
|
jpayne@69
|
1655
|
jpayne@69
|
1656 for ( i = 0; i < 7; i ++ )
|
jpayne@69
|
1657 {
|
jpayne@69
|
1658 if ( A [i] != NULL )
|
jpayne@69
|
1659 free ( A [i] );
|
jpayne@69
|
1660 if ( B [i] != NULL )
|
jpayne@69
|
1661 free ( B [i] );
|
jpayne@69
|
1662 }
|
jpayne@69
|
1663
|
jpayne@69
|
1664 return;
|
jpayne@69
|
1665 }
|