annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postpro.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 //------------------------------------------------------------------------------
jpayne@69 2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
jpayne@69 3 // File: postpro.cc
jpayne@69 4 // Date: 08 / 20 / 2002
jpayne@69 5 //
jpayne@69 6 // Purpose: To translate the coordinates referencing the concatenated
jpayne@69 7 // reference sequences back to the individual sequences and deal
jpayne@69 8 // with any conflict that may have arisen (i.e. a MUM that spans
jpayne@69 9 // the boundry between two sequences). Then to extend each cluster
jpayne@69 10 // via Smith-Waterman techniques to expand the alignment coverage.
jpayne@69 11 // Alignments which encounter each other will be fused to form one
jpayne@69 12 // encompasing alignment when appropriate.
jpayne@69 13 //
jpayne@69 14 // Input: Input is the output of the .mgaps program from stdin. On the
jpayne@69 15 // command line, the file names of the two original sequence files
jpayne@69 16 // should come first, followed by the prefix <pfx> that should be
jpayne@69 17 // placed in front of the two output filenames <pfx>.cluster and
jpayne@69 18 // <pfx>.delta
jpayne@69 19 //
jpayne@69 20 // NOTE: Cluster file is now suppressed by default (see -d option).
jpayne@69 21 //
jpayne@69 22 // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta.
jpayne@69 23 // <pfx>.cluster lists MUM clusters as identified by "mgaps".
jpayne@69 24 // However, the clusters now reference their corresponding
jpayne@69 25 // sequence and are all listed under headers that specify this
jpayne@69 26 // sequence. In addition, the coordinates now reference the original
jpayne@69 27 // DNA input and not the amino acid translations. The <pfx>.delta file
jpayne@69 28 // is the alignment object that contains all the information necessary
jpayne@69 29 // to reproduce the alignments generated by the MUM extension process.
jpayne@69 30 // The coordinates in this file reference the original DNA input, however
jpayne@69 31 // the delta values represent amino acids, so 1 delta = 3 nucleotides.
jpayne@69 32 // Please refer to the output file documentation for an in-depth
jpayne@69 33 // description of these file formats.
jpayne@69 34 //
jpayne@69 35 // Usage: postpro <reference> <query> <pfx> < <input>
jpayne@69 36 // Where <reference> and <query> are the original input sequences of
jpayne@69 37 // PROmer and <pfx> is the prefix that should be added to the
jpayne@69 38 // beginning of the <pfx>.cluster and <pfx>.delta output filenames.
jpayne@69 39 //
jpayne@69 40 //------------------------------------------------------------------------------
jpayne@69 41
jpayne@69 42 //-- NOTE: this option will significantly hamper program performance,
jpayne@69 43 // mostly the alignment extension performance (sw_align.h)
jpayne@69 44 //#define _DEBUG_ASSERT // self testing assert functions
jpayne@69 45
jpayne@69 46 #include "tigrinc.hh"
jpayne@69 47 #include "sw_align.hh"
jpayne@69 48 #include "translate.hh"
jpayne@69 49 #include <vector>
jpayne@69 50 #include <algorithm>
jpayne@69 51 using namespace std;
jpayne@69 52
jpayne@69 53
jpayne@69 54
jpayne@69 55
jpayne@69 56 //------------------------------------------------------ Globals -------------//
jpayne@69 57 bool DO_DELTA = true;
jpayne@69 58 bool DO_EXTEND = true;
jpayne@69 59 bool TO_SEQEND = false;
jpayne@69 60
jpayne@69 61
jpayne@69 62 //------------------------------------------------------ Type Definitions ----//
jpayne@69 63 enum LineType
jpayne@69 64 //-- The type of input line from <stdin>
jpayne@69 65 {
jpayne@69 66 NO_LINE, HEADER_LINE, MATCH_LINE
jpayne@69 67 };
jpayne@69 68
jpayne@69 69
jpayne@69 70 struct FastaRecord
jpayne@69 71 //-- The essential data of a sequence
jpayne@69 72 {
jpayne@69 73 char * Id; // the fasta ID header tag
jpayne@69 74 long int len; // the length of the sequence
jpayne@69 75 char * seq; // the sequence data
jpayne@69 76 };
jpayne@69 77
jpayne@69 78
jpayne@69 79 struct Match
jpayne@69 80 //-- An exact match between two sequences A and B
jpayne@69 81 {
jpayne@69 82 long int sA, sB, len; // start coordinate in A, in B and the length
jpayne@69 83 };
jpayne@69 84
jpayne@69 85
jpayne@69 86 struct Cluster
jpayne@69 87 //-- An ordered list of matches between two sequences A and B
jpayne@69 88 {
jpayne@69 89 bool wasFused; // have the cluster matches been extended yet?
jpayne@69 90 int frameA; // the reference sequence frame (1-6)
jpayne@69 91 int frameB; // the query sequence frame
jpayne@69 92 vector<Match> matches; // the ordered set of matches in the cluster
jpayne@69 93 };
jpayne@69 94
jpayne@69 95
jpayne@69 96 struct Synteny
jpayne@69 97 //-- An ordered list of clusters between two sequences A and B
jpayne@69 98 {
jpayne@69 99 FastaRecord * AfP; // a pointer to the reference sequence record
jpayne@69 100 FastaRecord Bf; // the query sequence record (w/o the sequence)
jpayne@69 101 vector<Cluster> clusters; // the ordered set of clusters between A and B
jpayne@69 102 };
jpayne@69 103
jpayne@69 104
jpayne@69 105 struct Alignment
jpayne@69 106 //-- An alignment object between two sequences A and B
jpayne@69 107 {
jpayne@69 108 int frameA; // the reference sequence frame (1-6)
jpayne@69 109 int frameB; // the query sequence frame
jpayne@69 110 long int sA, sB, eA, eB; // the start in A, B and the end in A, B
jpayne@69 111 vector<long int> delta; // the delta values, with NO zero at the end
jpayne@69 112 long int deltaApos; // sum of abs(deltas) - #of negative deltas
jpayne@69 113 // trust me, it is a very helpful value
jpayne@69 114 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
jpayne@69 115 };
jpayne@69 116
jpayne@69 117
jpayne@69 118 struct AscendingClusterSort
jpayne@69 119 //-- For sorting clusters in ascending order of their sA coordinate
jpayne@69 120 {
jpayne@69 121 bool operator() (const Cluster & pA, const Cluster & pB)
jpayne@69 122 {
jpayne@69 123 return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA );
jpayne@69 124 }
jpayne@69 125 };
jpayne@69 126
jpayne@69 127
jpayne@69 128
jpayne@69 129
jpayne@69 130
jpayne@69 131 //------------------------------------------------- Function Declarations ----//
jpayne@69 132 void addNewAlignment
jpayne@69 133 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
jpayne@69 134 vector<Match>::iterator Mp);
jpayne@69 135
jpayne@69 136 bool extendBackward
jpayne@69 137 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
jpayne@69 138 vector<Alignment>::iterator TargetAp, const char * A, const char * B);
jpayne@69 139
jpayne@69 140 bool extendForward
jpayne@69 141 (vector<Alignment>::iterator Ap, const char * A, long int targetA,
jpayne@69 142 const char * B, long int targetB, unsigned int m_o);
jpayne@69 143
jpayne@69 144 void extendClusters
jpayne@69 145 (vector<Cluster> & Clusters,
jpayne@69 146 const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile);
jpayne@69 147
jpayne@69 148 void flushAlignments
jpayne@69 149 (vector<Alignment> & Alignments,
jpayne@69 150 const FastaRecord * Af, const FastaRecord * Bf,
jpayne@69 151 FILE * DeltaFile);
jpayne@69 152
jpayne@69 153 void flushSyntenys
jpayne@69 154 (vector<Synteny> & Syntenys, FILE * ClusterFile);
jpayne@69 155
jpayne@69 156 vector<Cluster>::iterator getForwardTargetCluster
jpayne@69 157 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
jpayne@69 158 long int & targetA, long int & targetB);
jpayne@69 159
jpayne@69 160 vector<Alignment>::iterator getReverseTargetAlignment
jpayne@69 161 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp);
jpayne@69 162
jpayne@69 163 bool isShadowedCluster
jpayne@69 164 (vector<Cluster>::iterator CurrCp,
jpayne@69 165 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap);
jpayne@69 166
jpayne@69 167 void parseAbort
jpayne@69 168 (const char *);
jpayne@69 169
jpayne@69 170 void parseDelta
jpayne@69 171 (vector<Alignment> & Alignments,
jpayne@69 172 const FastaRecord * Af, const FastaRecord *Bf);
jpayne@69 173
jpayne@69 174 void processSyntenys
jpayne@69 175 (vector<Synteny> & Syntenys,
jpayne@69 176 FastaRecord * Af, long int As,
jpayne@69 177 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile);
jpayne@69 178
jpayne@69 179 inline long int transC
jpayne@69 180 (long int Coord, int Frame, long int Len);
jpayne@69 181
jpayne@69 182 inline long int refLen
jpayne@69 183 (long int ntLen);
jpayne@69 184
jpayne@69 185 inline long int revC
jpayne@69 186 (long int Coord, long int Len);
jpayne@69 187
jpayne@69 188 void printHelp
jpayne@69 189 (const char *);
jpayne@69 190
jpayne@69 191 void printUsage
jpayne@69 192 (const char *);
jpayne@69 193
jpayne@69 194 void validateData
jpayne@69 195 (vector<Alignment> Alignments, vector<Cluster> Clusters,
jpayne@69 196 const FastaRecord * Af, const FastaRecord * Bf);
jpayne@69 197
jpayne@69 198
jpayne@69 199
jpayne@69 200 //-------------------------------------------------- Function Definitions ----//
jpayne@69 201 int main
jpayne@69 202 (int argc, char ** argv)
jpayne@69 203 {
jpayne@69 204 FastaRecord * Af; // array of all the reference sequences
jpayne@69 205
jpayne@69 206 vector<Synteny> Syntenys; // vector of all sets of clusters
jpayne@69 207 vector<Synteny>::reverse_iterator CurrSp; // current set of clusters
jpayne@69 208 vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer
jpayne@69 209
jpayne@69 210 Synteny Asyn; // a single set of clusters
jpayne@69 211 Cluster Aclu; // a single cluster of matches
jpayne@69 212 Match Amat; // a single match
jpayne@69 213
jpayne@69 214 LineType PrevLine; // the current input line
jpayne@69 215
jpayne@69 216 bool Found; // temporary search flag
jpayne@69 217
jpayne@69 218 char Line [MAX_LINE]; // a single line of input
jpayne@69 219 char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers
jpayne@69 220 char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names
jpayne@69 221 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
jpayne@69 222
jpayne@69 223 int FrameA, FrameB; // reading frames (1-6)
jpayne@69 224 int CurrFrameA, CurrFrameB;
jpayne@69 225
jpayne@69 226 long int i; // temporary counter
jpayne@69 227 long int Seqi; // current reference sequence index
jpayne@69 228 long int InitSize; // initial sequence memory space
jpayne@69 229 long int As = 0; // the size of the reference array
jpayne@69 230 long int Ac = 100; // the capacity of the reference array
jpayne@69 231 long int sA, sB, len; // current match start in A, B and length
jpayne@69 232
jpayne@69 233 FILE * RefFile, * QryFile; // reference and query input files
jpayne@69 234 FILE * ClusterFile, * DeltaFile; // cluster and delta output files
jpayne@69 235
jpayne@69 236 //-- Set the alignment data type and break length (sw_align.h)
jpayne@69 237 setMatrixType ( BLOSUM62 );
jpayne@69 238 setBreakLen ( 60 );
jpayne@69 239
jpayne@69 240 //-- Parse the command line arguments
jpayne@69 241 {
jpayne@69 242 optarg = NULL;
jpayne@69 243 int ch, errflg = 0;
jpayne@69 244 while ( !errflg && ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) )
jpayne@69 245 switch (ch)
jpayne@69 246 {
jpayne@69 247 case 'b' :
jpayne@69 248 setBreakLen (atoi (optarg));
jpayne@69 249 break;
jpayne@69 250
jpayne@69 251 case 'd' :
jpayne@69 252 DO_DELTA = false;
jpayne@69 253 break;
jpayne@69 254
jpayne@69 255 case 'e' :
jpayne@69 256 DO_EXTEND = false;
jpayne@69 257 break;
jpayne@69 258
jpayne@69 259 case 'h' :
jpayne@69 260 printHelp (argv[0]);
jpayne@69 261 exit (EXIT_SUCCESS);
jpayne@69 262 break;
jpayne@69 263
jpayne@69 264 case 't' :
jpayne@69 265 TO_SEQEND = true;
jpayne@69 266 break;
jpayne@69 267
jpayne@69 268 case 'x' :
jpayne@69 269 setMatrixType ( atoi (optarg) );
jpayne@69 270 if ( getMatrixType( ) == NUCLEOTIDE )
jpayne@69 271 {
jpayne@69 272 fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n",
jpayne@69 273 getMatrixType( ));
jpayne@69 274 setMatrixType ( BLOSUM62 );
jpayne@69 275 }
jpayne@69 276 break;
jpayne@69 277
jpayne@69 278 default :
jpayne@69 279 errflg ++;
jpayne@69 280 }
jpayne@69 281
jpayne@69 282 if ( errflg > 0 || argc - optind != 3 )
jpayne@69 283 {
jpayne@69 284 printUsage (argv[0]);
jpayne@69 285 exit (EXIT_FAILURE);
jpayne@69 286 }
jpayne@69 287 }
jpayne@69 288
jpayne@69 289 //-- Read and create the I/O file names
jpayne@69 290 strcpy (RefFileName, argv[optind ++]);
jpayne@69 291 strcpy (QryFileName, argv[optind ++]);
jpayne@69 292 strcpy (ClusterFileName, argv[optind ++]);
jpayne@69 293 strcpy (DeltaFileName, ClusterFileName);
jpayne@69 294 strcat (ClusterFileName, ".cluster");
jpayne@69 295 strcat (DeltaFileName, ".delta");
jpayne@69 296
jpayne@69 297 //-- Open all the files
jpayne@69 298 RefFile = File_Open (RefFileName, "r");
jpayne@69 299 QryFile = File_Open (QryFileName, "r");
jpayne@69 300 if ( DO_DELTA )
jpayne@69 301 {
jpayne@69 302 ClusterFile = NULL;
jpayne@69 303 DeltaFile = File_Open (DeltaFileName, "w");
jpayne@69 304 }
jpayne@69 305 else
jpayne@69 306 {
jpayne@69 307 ClusterFile = File_Open (ClusterFileName, "w");
jpayne@69 308 DeltaFile = NULL;
jpayne@69 309 }
jpayne@69 310
jpayne@69 311 //-- Print the headers of the output files
jpayne@69 312 if ( DO_DELTA )
jpayne@69 313 fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
jpayne@69 314 else
jpayne@69 315 fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName);
jpayne@69 316
jpayne@69 317 //-- Generate the array of the reference sequences
jpayne@69 318 InitSize = INIT_SIZE;
jpayne@69 319 Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac );
jpayne@69 320 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
jpayne@69 321 while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) )
jpayne@69 322 {
jpayne@69 323 Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1));
jpayne@69 324 strcpy (Af[As].Id, IdA);
jpayne@69 325
jpayne@69 326 Af[As].len = strlen (Af[As].seq + 1);
jpayne@69 327
jpayne@69 328 if ( ++ As >= Ac )
jpayne@69 329 {
jpayne@69 330 Ac *= 2;
jpayne@69 331 Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac );
jpayne@69 332 }
jpayne@69 333
jpayne@69 334 InitSize = INIT_SIZE;
jpayne@69 335 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
jpayne@69 336 }
jpayne@69 337 fclose (RefFile);
jpayne@69 338 if ( As <= 0 )
jpayne@69 339 parseAbort ( RefFileName );
jpayne@69 340
jpayne@69 341
jpayne@69 342
jpayne@69 343 //-- Process the input from <stdin> line by line
jpayne@69 344 PrevLine = NO_LINE;
jpayne@69 345 IdA[0] = '\0';
jpayne@69 346 IdB[0] = '\0';
jpayne@69 347 FrameA = 0;
jpayne@69 348 FrameB = 0;
jpayne@69 349 CurrFrameA = -1;
jpayne@69 350 CurrFrameB = -1;
jpayne@69 351 CurrIdB[0] = '\0';
jpayne@69 352 while ( fgets(Line, MAX_LINE, stdin) != NULL )
jpayne@69 353 {
jpayne@69 354
jpayne@69 355 //-- If the current line is a fasta HEADER_LINE
jpayne@69 356 if ( Line[0] == '>' )
jpayne@69 357 {
jpayne@69 358 if ( sscanf (Line + 1, "%s", CurrIdB) != 1 )
jpayne@69 359 parseAbort ("stdin");
jpayne@69 360
jpayne@69 361 sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB);
jpayne@69 362 CurrIdB [strlen(CurrIdB) - 2] = '\0';
jpayne@69 363
jpayne@69 364 PrevLine = HEADER_LINE;
jpayne@69 365 }
jpayne@69 366
jpayne@69 367
jpayne@69 368 //-- If the current line is a cluster HEADER_LINE
jpayne@69 369 else if ( Line[0] == '#' )
jpayne@69 370 {
jpayne@69 371 PrevLine = HEADER_LINE;
jpayne@69 372 }
jpayne@69 373
jpayne@69 374
jpayne@69 375 //-- If the current line is a MATCH_LINE
jpayne@69 376 else
jpayne@69 377 {
jpayne@69 378 if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 )
jpayne@69 379 parseAbort ("stdin");
jpayne@69 380
jpayne@69 381 //-- Re-map the reference coordinate back to its original sequence
jpayne@69 382 // NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length
jpayne@69 383 // including the appended 'X' on each frame translation
jpayne@69 384 for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ )
jpayne@69 385 sA -= refLen (Af[Seqi].len);
jpayne@69 386
jpayne@69 387 assert ( Seqi < As );
jpayne@69 388
jpayne@69 389 //-- Get the correct frame, translate startA coordinate to frame
jpayne@69 390 for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ )
jpayne@69 391 sA -= (Af[Seqi].len - (i % 3)) / 3 + 1;
jpayne@69 392 CurrFrameA = i + 1;
jpayne@69 393 assert ( CurrFrameA > 0 && CurrFrameA < 7 );
jpayne@69 394
jpayne@69 395 //-- If the match spans across a frame boundry
jpayne@69 396 if ( CurrFrameA < 1 || CurrFrameA > 6 ||
jpayne@69 397 sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 ||
jpayne@69 398 sA <= 0 )
jpayne@69 399 {
jpayne@69 400 fprintf (stderr,
jpayne@69 401 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
jpayne@69 402 " Reference sequence '>%s', frame %d\n"
jpayne@69 403 "Please file a bug report\n"
jpayne@69 404 "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA);
jpayne@69 405 continue;
jpayne@69 406 }
jpayne@69 407
jpayne@69 408 //-- If the match spans across a sequence boundry
jpayne@69 409 if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 )
jpayne@69 410 {
jpayne@69 411 fprintf (stderr,
jpayne@69 412 "\nWARNING: A MUM was found extending beyond the boundry of:\n"
jpayne@69 413 " Reference sequence '>%s'\n"
jpayne@69 414 "Please file a bug report\n"
jpayne@69 415 "Attempting to continue.\n", Af[Seqi].Id);
jpayne@69 416 continue;
jpayne@69 417 }
jpayne@69 418
jpayne@69 419 //-- Check and update the current synteny region
jpayne@69 420 if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 ||
jpayne@69 421 CurrFrameA != FrameA || CurrFrameB != FrameB )
jpayne@69 422 {
jpayne@69 423 Found = false;
jpayne@69 424 if ( strcmp (IdB, CurrIdB) == 0 )
jpayne@69 425 {
jpayne@69 426 //-- Has this header been seen before?
jpayne@69 427 for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ )
jpayne@69 428 {
jpayne@69 429 if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 )
jpayne@69 430 {
jpayne@69 431 assert ( Sp->AfP->len == Af[Seqi].len );
jpayne@69 432 assert ( strcmp (Sp->Bf.Id, IdB) == 0 );
jpayne@69 433 CurrSp = Sp;
jpayne@69 434 Found = true;
jpayne@69 435 break;
jpayne@69 436 }
jpayne@69 437 }
jpayne@69 438 }
jpayne@69 439 else
jpayne@69 440 {
jpayne@69 441 //-- New B sequence header, process all the old synteny's
jpayne@69 442 processSyntenys (Syntenys, Af, As,
jpayne@69 443 QryFile, ClusterFile, DeltaFile);
jpayne@69 444
jpayne@69 445 }
jpayne@69 446
jpayne@69 447 strcpy (IdA, Af[Seqi].Id);
jpayne@69 448 strcpy (IdB, CurrIdB);
jpayne@69 449 FrameA = CurrFrameA;
jpayne@69 450 FrameB = CurrFrameB;
jpayne@69 451
jpayne@69 452 //-- If not seen yet, create a new synteny region
jpayne@69 453 if ( ! Found )
jpayne@69 454 {
jpayne@69 455 Asyn.AfP = Af + Seqi;
jpayne@69 456 Asyn.Bf.len = -1;
jpayne@69 457 Asyn.Bf.Id = (char *) Safe_malloc
jpayne@69 458 (sizeof(char) * (strlen(IdB) + 1));
jpayne@69 459 strcpy (Asyn.Bf.Id, IdB);
jpayne@69 460
jpayne@69 461 Syntenys.push_back (Asyn);
jpayne@69 462 CurrSp = Syntenys.rbegin( );
jpayne@69 463 }
jpayne@69 464
jpayne@69 465 //-- Add a new cluster to the current synteny
jpayne@69 466 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
jpayne@69 467 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
jpayne@69 468 CurrSp->clusters.pop_back( ); // hack to remove empties
jpayne@69 469 Aclu.frameA = FrameA;
jpayne@69 470 Aclu.frameB = FrameB;
jpayne@69 471 Aclu.wasFused = false;
jpayne@69 472 CurrSp->clusters.push_back (Aclu);
jpayne@69 473 }
jpayne@69 474 else if ( PrevLine == HEADER_LINE )
jpayne@69 475 {
jpayne@69 476 //-- Add a new cluster to the current synteny
jpayne@69 477 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
jpayne@69 478 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
jpayne@69 479 CurrSp->clusters.pop_back( );
jpayne@69 480 Aclu.frameA = FrameA;
jpayne@69 481 Aclu.frameB = FrameB;
jpayne@69 482 Aclu.wasFused = false;
jpayne@69 483 CurrSp->clusters.push_back (Aclu);
jpayne@69 484 }
jpayne@69 485
jpayne@69 486 //-- Add a new match to the current cluster
jpayne@69 487 // NOTE: A and B coordinates still reference the appropriate
jpayne@69 488 // amino acid sequence frame, not the DNA (same with len)
jpayne@69 489 if ( len > 1 )
jpayne@69 490 {
jpayne@69 491 Amat.sA = sA;
jpayne@69 492 Amat.sB = sB;
jpayne@69 493 Amat.len = len;
jpayne@69 494 CurrSp->clusters.rbegin( )->matches.push_back (Amat);
jpayne@69 495 }
jpayne@69 496
jpayne@69 497 PrevLine = MATCH_LINE;
jpayne@69 498 }
jpayne@69 499 }
jpayne@69 500
jpayne@69 501 //-- Process the left-over syntenys
jpayne@69 502 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
jpayne@69 503 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
jpayne@69 504 CurrSp->clusters.pop_back( );
jpayne@69 505 processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile);
jpayne@69 506 fclose (QryFile);
jpayne@69 507
jpayne@69 508 //-- Free the reference sequences
jpayne@69 509 for ( i = 0; i < As; i ++ )
jpayne@69 510 {
jpayne@69 511 free (Af[i].Id);
jpayne@69 512 free (Af[i].seq);
jpayne@69 513 }
jpayne@69 514 free (Af);
jpayne@69 515
jpayne@69 516 return EXIT_SUCCESS;
jpayne@69 517 }
jpayne@69 518
jpayne@69 519
jpayne@69 520
jpayne@69 521
jpayne@69 522 void addNewAlignment
jpayne@69 523 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
jpayne@69 524 vector<Match>::iterator Mp)
jpayne@69 525
jpayne@69 526 // Create and add a new alignment object based on the current match
jpayne@69 527 // and cluster information pointed to by Cp and Mp.
jpayne@69 528
jpayne@69 529 {
jpayne@69 530 Alignment Align;
jpayne@69 531
jpayne@69 532 //-- Initialize the data
jpayne@69 533 Align.sA = Mp->sA;
jpayne@69 534 Align.sB = Mp->sB;
jpayne@69 535 Align.eA = Mp->sA + Mp->len - 1;
jpayne@69 536 Align.eB = Mp->sB + Mp->len - 1;
jpayne@69 537 Align.frameA = Cp->frameA;
jpayne@69 538 Align.frameB = Cp->frameB;
jpayne@69 539 Align.delta.clear( );
jpayne@69 540 Align.deltaApos = 0;
jpayne@69 541
jpayne@69 542 //-- Add the alignment object
jpayne@69 543 Alignments.push_back (Align);
jpayne@69 544
jpayne@69 545 return;
jpayne@69 546 }
jpayne@69 547
jpayne@69 548
jpayne@69 549
jpayne@69 550
jpayne@69 551 bool extendBackward
jpayne@69 552 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
jpayne@69 553 vector<Alignment>::iterator TargetAp, const char * A, const char * B)
jpayne@69 554
jpayne@69 555 // Extend an alignment backwards off of the current alignment object.
jpayne@69 556 // The current alignment object must be freshly created and consist
jpayne@69 557 // only of an exact match (i.e. the delta vector MUST be empty).
jpayne@69 558 // If the TargetAp alignment object is reached by the extension, it will
jpayne@69 559 // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is
jpayne@69 560 // NULL the function will extend as far as possible. It is a strange
jpayne@69 561 // and dangerous function because it can delete CurrAp, so edit with
jpayne@69 562 // caution. Returns true if TargetAp was reached and merged, else false
jpayne@69 563 // Designed only as a subroutine for extendClusters, should be used
jpayne@69 564 // nowhere else.
jpayne@69 565
jpayne@69 566 {
jpayne@69 567 bool target_reached = false;
jpayne@69 568 bool overflow_flag = false;
jpayne@69 569 bool double_flag = false;
jpayne@69 570
jpayne@69 571 vector<long int>::iterator Dp;
jpayne@69 572
jpayne@69 573 unsigned int m_o;
jpayne@69 574 long int targetA, targetB;
jpayne@69 575
jpayne@69 576 m_o = BACKWARD_SEARCH;
jpayne@69 577
jpayne@69 578 //-- Set the target coordinates
jpayne@69 579 if ( TargetAp != Alignments.end( ) )
jpayne@69 580 {
jpayne@69 581 targetA = TargetAp->eA;
jpayne@69 582 targetB = TargetAp->eB;
jpayne@69 583 }
jpayne@69 584 else
jpayne@69 585 {
jpayne@69 586 targetA = 1;
jpayne@69 587 targetB = 1;
jpayne@69 588 m_o |= OPTIMAL_BIT;
jpayne@69 589 }
jpayne@69 590
jpayne@69 591 //-- If alignment is too long, bring with bounds and set overflow_flag true
jpayne@69 592 if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH )
jpayne@69 593 {
jpayne@69 594 targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1;
jpayne@69 595 overflow_flag = true;
jpayne@69 596 m_o |= OPTIMAL_BIT;
jpayne@69 597 }
jpayne@69 598 if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH )
jpayne@69 599 {
jpayne@69 600 targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1;
jpayne@69 601 if ( overflow_flag )
jpayne@69 602 double_flag = true;
jpayne@69 603 else
jpayne@69 604 overflow_flag = true;
jpayne@69 605 m_o |= OPTIMAL_BIT;
jpayne@69 606 }
jpayne@69 607
jpayne@69 608
jpayne@69 609 if ( TO_SEQEND && !double_flag )
jpayne@69 610 m_o |= SEQEND_BIT;
jpayne@69 611
jpayne@69 612
jpayne@69 613 //-- Attempt to reach the target
jpayne@69 614 target_reached = alignSearch (A, CurrAp->sA, targetA,
jpayne@69 615 B, CurrAp->sB, targetB, m_o);
jpayne@69 616
jpayne@69 617 if ( overflow_flag || TargetAp == Alignments.end( ) )
jpayne@69 618 target_reached = false;
jpayne@69 619
jpayne@69 620 if ( target_reached )
jpayne@69 621 {
jpayne@69 622 //-- assert that CurrAp is new and at the end of the Alignment vector
jpayne@69 623 assert ( CurrAp->delta.empty( ) );
jpayne@69 624 assert ( CurrAp == Alignments.end( ) - 1 );
jpayne@69 625
jpayne@69 626 //-- Merge the two alignment objects
jpayne@69 627 extendForward (TargetAp, A, CurrAp->sA,
jpayne@69 628 B, CurrAp->sB, FORCED_FORWARD_ALIGN);
jpayne@69 629 TargetAp->eA = CurrAp->eA;
jpayne@69 630 TargetAp->eB = CurrAp->eB;
jpayne@69 631 Alignments.pop_back( );
jpayne@69 632 }
jpayne@69 633 else
jpayne@69 634 {
jpayne@69 635 alignTarget (A, targetA, CurrAp->sA,
jpayne@69 636 B, targetB, CurrAp->sB,
jpayne@69 637 CurrAp->delta, FORCED_FORWARD_ALIGN);
jpayne@69 638 CurrAp->sA = targetA;
jpayne@69 639 CurrAp->sB = targetB;
jpayne@69 640
jpayne@69 641 //-- Update the deltaApos value for the alignment object
jpayne@69 642 for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ )
jpayne@69 643 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
jpayne@69 644 }
jpayne@69 645
jpayne@69 646 return target_reached;
jpayne@69 647 }
jpayne@69 648
jpayne@69 649
jpayne@69 650
jpayne@69 651
jpayne@69 652 bool extendForward
jpayne@69 653 (vector<Alignment>::iterator CurrAp, const char * A, long int targetA,
jpayne@69 654 const char * B, long int targetB, unsigned int m_o)
jpayne@69 655
jpayne@69 656 // Extend an alignment forwards off the current alignment object until
jpayne@69 657 // target or end of sequence is reached, and merge the delta values of the
jpayne@69 658 // alignment object with the new delta values generated by the extension.
jpayne@69 659 // Return true if the target was reached, else false
jpayne@69 660
jpayne@69 661 {
jpayne@69 662 long int ValA;
jpayne@69 663 long int ValB;
jpayne@69 664 unsigned int Di;
jpayne@69 665 bool target_reached;
jpayne@69 666 bool overflow_flag = false;
jpayne@69 667 bool double_flag = false;
jpayne@69 668 vector<long int>::iterator Dp;
jpayne@69 669
jpayne@69 670 //-- Set Di to the end of the delta vector
jpayne@69 671 Di = CurrAp->delta.size( );
jpayne@69 672
jpayne@69 673 //-- Calculate the distance between the start and end positions
jpayne@69 674 ValA = targetA - CurrAp->eA + 1;
jpayne@69 675 ValB = targetB - CurrAp->eB + 1;
jpayne@69 676
jpayne@69 677 //-- If the distance is too long, shrink it and set the overflow_flag
jpayne@69 678 if ( ValA > MAX_ALIGNMENT_LENGTH )
jpayne@69 679 {
jpayne@69 680 targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1;
jpayne@69 681 overflow_flag = true;
jpayne@69 682 m_o |= OPTIMAL_BIT;
jpayne@69 683 }
jpayne@69 684 if ( ValB > MAX_ALIGNMENT_LENGTH )
jpayne@69 685 {
jpayne@69 686 targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1;
jpayne@69 687 if ( overflow_flag )
jpayne@69 688 double_flag = true;
jpayne@69 689 else
jpayne@69 690 overflow_flag = true;
jpayne@69 691 m_o |= OPTIMAL_BIT;
jpayne@69 692 }
jpayne@69 693
jpayne@69 694 if ( double_flag )
jpayne@69 695 m_o &= ~SEQEND_BIT;
jpayne@69 696
jpayne@69 697 target_reached = alignTarget (A, CurrAp->eA, targetA,
jpayne@69 698 B, CurrAp->eB, targetB,
jpayne@69 699 CurrAp->delta, m_o);
jpayne@69 700
jpayne@69 701 //-- Notify user if alignment was chopped short
jpayne@69 702 if ( target_reached && overflow_flag )
jpayne@69 703 target_reached = false;
jpayne@69 704
jpayne@69 705 //-- Pick up delta where left off (Di) and merge with new deltas
jpayne@69 706 if ( Di < CurrAp->delta.size( ) )
jpayne@69 707 {
jpayne@69 708 //-- Merge the deltas
jpayne@69 709 ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1;
jpayne@69 710 CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA);
jpayne@69 711 if ( CurrAp->delta[Di] == 0 || ValA < 0 )
jpayne@69 712 {
jpayne@69 713 fprintf(stderr,
jpayne@69 714 "ERROR: failed to merge alignments at position %ld\n"
jpayne@69 715 " Please file a bug report\n",
jpayne@69 716 CurrAp->eA);
jpayne@69 717 exit (EXIT_FAILURE);
jpayne@69 718 }
jpayne@69 719
jpayne@69 720 //-- Update the deltaApos
jpayne@69 721 for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ )
jpayne@69 722 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
jpayne@69 723 }
jpayne@69 724
jpayne@69 725 //-- Set the alignment coordinates
jpayne@69 726 CurrAp->eA = targetA;
jpayne@69 727 CurrAp->eB = targetB;
jpayne@69 728
jpayne@69 729 return target_reached;
jpayne@69 730 }
jpayne@69 731
jpayne@69 732
jpayne@69 733
jpayne@69 734
jpayne@69 735 void extendClusters
jpayne@69 736 (vector<Cluster> & Clusters,
jpayne@69 737 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile)
jpayne@69 738
jpayne@69 739 // Connect all the matches in every cluster between sequences A and B.
jpayne@69 740 // Also, extend alignments off of the front and back of each cluster to
jpayne@69 741 // expand total alignment coverage. When these extensions encounter an
jpayne@69 742 // adjacent cluster (in a consistent frame), fuse the two regions to
jpayne@69 743 // create one single encompassing region. This routine will create
jpayne@69 744 // alignment objects from these extensions and output the resulting
jpayne@69 745 // delta information to the delta output file. Also, all coordintes
jpayne@69 746 // for the clusters and the alignment regions are translated to reference
jpayne@69 747 // the original DNA sequecne.
jpayne@69 748
jpayne@69 749 {
jpayne@69 750 //-- Sort the clusters (ascending) by their start coordinate in sequence A
jpayne@69 751 sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( ));
jpayne@69 752
jpayne@69 753 //-- If no delta file is requested
jpayne@69 754 if ( ! DO_DELTA )
jpayne@69 755 return;
jpayne@69 756
jpayne@69 757
jpayne@69 758 bool target_reached = false; // reached the adjacent match or cluster
jpayne@69 759 // the amino acid sequences for A and B
jpayne@69 760 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
jpayne@69 761 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
jpayne@69 762
jpayne@69 763 long int Alen [7], Blen [7]; // the length of the amino acid sequences
jpayne@69 764 int i;
jpayne@69 765
jpayne@69 766 unsigned int m_o;
jpayne@69 767 long int targetA, targetB; // alignment extension targets in A and B
jpayne@69 768
jpayne@69 769 vector<Match>::iterator Mp; // match pointer
jpayne@69 770
jpayne@69 771 vector<Cluster>::iterator PrevCp; // where the extensions last left off
jpayne@69 772 vector<Cluster>::iterator CurrCp; // the current cluster being extended
jpayne@69 773 vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster
jpayne@69 774
jpayne@69 775 vector<Alignment> Alignments; // the vector of alignment objects
jpayne@69 776 vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align
jpayne@69 777 vector<Alignment>::iterator TargetAp; // target align
jpayne@69 778
jpayne@69 779 //-- Calculate the length of each amino acid frame translation
jpayne@69 780 for ( i = 0; i < 6; i ++ )
jpayne@69 781 {
jpayne@69 782 Alen [i + 1] = (Af->len - (i % 3)) / 3;
jpayne@69 783 Blen [i + 1] = (Bf->len - (i % 3)) / 3;
jpayne@69 784 }
jpayne@69 785
jpayne@69 786 //-- Extend each cluster
jpayne@69 787 PrevCp = Clusters.begin( );
jpayne@69 788 CurrCp = Clusters.begin( );
jpayne@69 789 while ( CurrCp < Clusters.end( ) )
jpayne@69 790 {
jpayne@69 791 if ( DO_EXTEND )
jpayne@69 792 {
jpayne@69 793 //-- Ignore if shadowed or already extended
jpayne@69 794 if ( ! target_reached )
jpayne@69 795 if ( CurrCp->wasFused ||
jpayne@69 796 isShadowedCluster (CurrCp, Alignments, CurrAp) )
jpayne@69 797 {
jpayne@69 798 CurrCp->wasFused = true;
jpayne@69 799 CurrCp = ++ PrevCp;
jpayne@69 800 continue;
jpayne@69 801 }
jpayne@69 802 }
jpayne@69 803
jpayne@69 804 //-- Initialize the right amino acid sequence for A and B if need be
jpayne@69 805 if ( A [CurrCp->frameA] == NULL )
jpayne@69 806 {
jpayne@69 807 A [CurrCp->frameA] = (char *) Safe_malloc
jpayne@69 808 ( sizeof(char) * ( (Af->len / 3) + 2) );
jpayne@69 809 A [CurrCp->frameA] [0] = '\0';
jpayne@69 810 Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA],
jpayne@69 811 CurrCp->frameA );
jpayne@69 812 }
jpayne@69 813 if ( B [CurrCp->frameB] == NULL )
jpayne@69 814 {
jpayne@69 815 B [CurrCp->frameB] = (char *) Safe_malloc
jpayne@69 816 ( sizeof(char) * ( (Bf->len / 3) + 2) );
jpayne@69 817 B [CurrCp->frameB] [0] = '\0';
jpayne@69 818 Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB],
jpayne@69 819 CurrCp->frameB );
jpayne@69 820 }
jpayne@69 821
jpayne@69 822 //-- Extend each match in the cluster
jpayne@69 823 for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ )
jpayne@69 824 {
jpayne@69 825 //-- Try to extend the current match backwards
jpayne@69 826 if ( target_reached )
jpayne@69 827 {
jpayne@69 828 //-- Merge with the previous match
jpayne@69 829 if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB )
jpayne@69 830 {
jpayne@69 831 //-- Strip matches until the targeted match is found
jpayne@69 832 assert (Mp < CurrCp->matches.end( ) - 1);
jpayne@69 833 continue;
jpayne@69 834 }
jpayne@69 835
jpayne@69 836 CurrAp->eA += Mp->len - 1;
jpayne@69 837 CurrAp->eB += Mp->len - 1;
jpayne@69 838 }
jpayne@69 839 else
jpayne@69 840 {
jpayne@69 841 //-- Create a new alignment object
jpayne@69 842 addNewAlignment (Alignments, CurrCp, Mp);
jpayne@69 843 CurrAp = Alignments.end( ) - 1;
jpayne@69 844
jpayne@69 845 if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) )
jpayne@69 846 {
jpayne@69 847 //-- Target the closest/best alignment object
jpayne@69 848 TargetAp = getReverseTargetAlignment (Alignments, CurrAp);
jpayne@69 849
jpayne@69 850 //-- Extend the new alignment object backwards
jpayne@69 851 if ( extendBackward (Alignments, CurrAp, TargetAp,
jpayne@69 852 A [CurrCp->frameA], B [CurrCp->frameB]) )
jpayne@69 853 CurrAp = TargetAp;
jpayne@69 854 }
jpayne@69 855 }
jpayne@69 856
jpayne@69 857 m_o = FORWARD_ALIGN;
jpayne@69 858
jpayne@69 859 //-- Try to extend the current match forwards
jpayne@69 860 if ( Mp < CurrCp->matches.end( ) - 1 )
jpayne@69 861 {
jpayne@69 862 //-- Target the next match in the cluster
jpayne@69 863 targetA = (Mp + 1)->sA;
jpayne@69 864 targetB = (Mp + 1)->sB;
jpayne@69 865
jpayne@69 866 //-- Extend the current alignment object forward
jpayne@69 867 target_reached = extendForward (CurrAp,
jpayne@69 868 A [CurrCp->frameA], targetA,
jpayne@69 869 B [CurrCp->frameB], targetB, m_o);
jpayne@69 870 }
jpayne@69 871 else if ( DO_EXTEND )
jpayne@69 872 {
jpayne@69 873 targetA = Alen [CurrCp->frameA];
jpayne@69 874 targetB = Blen [CurrCp->frameB];
jpayne@69 875
jpayne@69 876 //-- Target the closest/best match in a future cluster
jpayne@69 877 TargetCp = getForwardTargetCluster (Clusters, CurrCp,
jpayne@69 878 targetA, targetB);
jpayne@69 879
jpayne@69 880 if ( TargetCp == Clusters.end( ) )
jpayne@69 881 {
jpayne@69 882 m_o |= OPTIMAL_BIT;
jpayne@69 883 if ( TO_SEQEND )
jpayne@69 884 m_o |= SEQEND_BIT;
jpayne@69 885 }
jpayne@69 886
jpayne@69 887 //-- Extend the current alignment object forward
jpayne@69 888 target_reached = extendForward (CurrAp,
jpayne@69 889 A [CurrCp->frameA], targetA,
jpayne@69 890 B [CurrCp->frameB], targetB, m_o);
jpayne@69 891 }
jpayne@69 892 }
jpayne@69 893
jpayne@69 894 if ( TargetCp == Clusters.end( ) )
jpayne@69 895 target_reached = false;
jpayne@69 896
jpayne@69 897 CurrCp->wasFused = true;
jpayne@69 898
jpayne@69 899 if ( target_reached == false )
jpayne@69 900 CurrCp = ++ PrevCp;
jpayne@69 901 else
jpayne@69 902 CurrCp = TargetCp;
jpayne@69 903 }
jpayne@69 904
jpayne@69 905 #ifdef _DEBUG_ASSERT
jpayne@69 906 validateData (Alignments, Clusters, Af, Bf);
jpayne@69 907 #endif
jpayne@69 908
jpayne@69 909 //-- Output the alignment data to the delta file
jpayne@69 910 flushAlignments (Alignments, Af, Bf, DeltaFile);
jpayne@69 911
jpayne@69 912 for ( i = 0; i < 7; i ++ )
jpayne@69 913 {
jpayne@69 914 if ( A [i] != NULL )
jpayne@69 915 free ( A [i] );
jpayne@69 916 if ( B [i] != NULL )
jpayne@69 917 free ( B [i] );
jpayne@69 918 }
jpayne@69 919
jpayne@69 920 return;
jpayne@69 921 }
jpayne@69 922
jpayne@69 923
jpayne@69 924
jpayne@69 925
jpayne@69 926 void flushAlignments
jpayne@69 927 (vector<Alignment> & Alignments,
jpayne@69 928 const FastaRecord * Af, const FastaRecord * Bf,
jpayne@69 929 FILE * DeltaFile)
jpayne@69 930
jpayne@69 931 // Simply output the delta information stored in Alignments to the
jpayne@69 932 // given delta file. Free the memory used by Alignments once the
jpayne@69 933 // data is successfully output to the file.
jpayne@69 934
jpayne@69 935 {
jpayne@69 936 vector<Alignment>::iterator Ap; // alignment pointer
jpayne@69 937 vector<long int>::iterator Dp; // delta pointer
jpayne@69 938
jpayne@69 939 fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len);
jpayne@69 940
jpayne@69 941 //-- Generate the error counts
jpayne@69 942 parseDelta (Alignments, Af, Bf);
jpayne@69 943
jpayne@69 944 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
jpayne@69 945 {
jpayne@69 946 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
jpayne@69 947 transC (Ap->sA, Ap->frameA, Af->len),
jpayne@69 948 transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2),
jpayne@69 949 transC (Ap->sB, Ap->frameB, Bf->len),
jpayne@69 950 transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2),
jpayne@69 951 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
jpayne@69 952
jpayne@69 953 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
jpayne@69 954 fprintf (DeltaFile, "%ld\n", *Dp);
jpayne@69 955 fprintf (DeltaFile, "0\n");
jpayne@69 956
jpayne@69 957 Ap->delta.clear( );
jpayne@69 958 }
jpayne@69 959
jpayne@69 960 Alignments.clear( );
jpayne@69 961
jpayne@69 962 return;
jpayne@69 963 }
jpayne@69 964
jpayne@69 965
jpayne@69 966
jpayne@69 967
jpayne@69 968 void flushSyntenys
jpayne@69 969 (vector<Synteny> & Syntenys, FILE * ClusterFile)
jpayne@69 970
jpayne@69 971 // Simply output the synteny/cluster information generated by the mgaps
jpayne@69 972 // program. However, now the coordinates reference their appropriate
jpayne@69 973 // reference sequence, and the reference sequecne header is added to
jpayne@69 974 // the appropriate lines. Free the memory used by Syntenys once the
jpayne@69 975 // data is successfully output to the file.
jpayne@69 976
jpayne@69 977 {
jpayne@69 978 vector<Synteny>::iterator Sp; // synteny pointer
jpayne@69 979 vector<Cluster>::iterator Cp; // cluster pointer
jpayne@69 980 vector<Match>::iterator Mp; // match pointer
jpayne@69 981
jpayne@69 982 if ( ClusterFile )
jpayne@69 983 {
jpayne@69 984 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
jpayne@69 985 {
jpayne@69 986 fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id,
jpayne@69 987 Sp->AfP->len, Sp->Bf.len);
jpayne@69 988 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
jpayne@69 989 {
jpayne@69 990 fprintf (ClusterFile, "%2d %2d\n",
jpayne@69 991 Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA,
jpayne@69 992 Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB);
jpayne@69 993 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
jpayne@69 994 {
jpayne@69 995 fprintf (ClusterFile, "%8ld %8ld %6ld",
jpayne@69 996 transC (Mp->sA, Cp->frameA, Sp->AfP->len),
jpayne@69 997 transC (Mp->sB, Cp->frameB, Sp->Bf.len),
jpayne@69 998 Mp->len * 3);
jpayne@69 999
jpayne@69 1000 if ( Mp != Cp->matches.begin( ) )
jpayne@69 1001 fprintf (ClusterFile, "%6ld %6ld\n",
jpayne@69 1002 (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3,
jpayne@69 1003 (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3);
jpayne@69 1004 else
jpayne@69 1005 fprintf (ClusterFile, "%6s %6s\n", "-", "-");
jpayne@69 1006 }
jpayne@69 1007 Cp->matches.clear( );
jpayne@69 1008 }
jpayne@69 1009 Sp->clusters.clear( );
jpayne@69 1010 }
jpayne@69 1011 }
jpayne@69 1012 else
jpayne@69 1013 {
jpayne@69 1014 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
jpayne@69 1015 {
jpayne@69 1016 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
jpayne@69 1017 {
jpayne@69 1018 Cp->matches.clear( );
jpayne@69 1019 }
jpayne@69 1020 Sp->clusters.clear( );
jpayne@69 1021 }
jpayne@69 1022 }
jpayne@69 1023
jpayne@69 1024 Syntenys.clear( );
jpayne@69 1025
jpayne@69 1026 return;
jpayne@69 1027 }
jpayne@69 1028
jpayne@69 1029
jpayne@69 1030
jpayne@69 1031
jpayne@69 1032
jpayne@69 1033 vector<Cluster>::iterator getForwardTargetCluster
jpayne@69 1034 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
jpayne@69 1035 long int & targetA, long int & targetB)
jpayne@69 1036
jpayne@69 1037 // Return the cluster that is most likely to successfully join (in a
jpayne@69 1038 // forward direction) with the current cluster. The returned cluster
jpayne@69 1039 // must contain 1 or more matches that are strictly greater than the end
jpayne@69 1040 // of the current cluster. The targeted cluster must also be on a
jpayne@69 1041 // diagonal close enough to the current cluster, so that a connection
jpayne@69 1042 // could possibly be made by the alignment extender. Also, this targeted
jpayne@69 1043 // cluster must be consistent in frame with the current cluster. Assumes
jpayne@69 1044 // clusters have been sorted via AscendingClusterSort. Returns targeted
jpayne@69 1045 // cluster and stores the target coordinates in targetA and targetB. If no
jpayne@69 1046 // suitable cluster was found, the function will return NULL and target
jpayne@69 1047 // A and targetB will remain unchanged.
jpayne@69 1048
jpayne@69 1049 {
jpayne@69 1050 vector<Match>::iterator Mip; // match iteratrive pointer
jpayne@69 1051 vector<Cluster>::iterator Cp; // cluster pointer
jpayne@69 1052 vector<Cluster>::iterator Cip; // cluster iterative pointer
jpayne@69 1053 long int eA, eB; // possible target
jpayne@69 1054 long int greater, lesser; // gap sizes between two clusters
jpayne@69 1055 long int sA = CurrCp->matches.rbegin( )->sA +
jpayne@69 1056 CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster
jpayne@69 1057 long int sB = CurrCp->matches.rbegin( )->sB +
jpayne@69 1058 CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster
jpayne@69 1059
jpayne@69 1060 //-- End of sequences is the default target, set distance accordingly
jpayne@69 1061 long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB);
jpayne@69 1062
jpayne@69 1063 //-- For all clusters greater than the current cluster (on sequence A)
jpayne@69 1064 Cp = Clusters.end( );
jpayne@69 1065 for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ )
jpayne@69 1066 {
jpayne@69 1067 //-- If the cluster is in the same frame
jpayne@69 1068 if ( CurrCp->frameA == Cip->frameA && CurrCp->frameB == Cip->frameB )
jpayne@69 1069 {
jpayne@69 1070 eA = Cip->matches.begin( )->sA;
jpayne@69 1071 eB = Cip->matches.begin( )->sB;
jpayne@69 1072
jpayne@69 1073 //-- If the cluster overlaps the current cluster, strip some matches
jpayne@69 1074 if ( ( eA < sA || eB < sB ) &&
jpayne@69 1075 Cip->matches.rbegin( )->sA >= sA &&
jpayne@69 1076 Cip->matches.rbegin( )->sB >= sB )
jpayne@69 1077 {
jpayne@69 1078 for ( Mip = Cip->matches.begin( );
jpayne@69 1079 Mip < Cip->matches.end( ) && ( eA < sA || eB < sB );
jpayne@69 1080 Mip ++ )
jpayne@69 1081 {
jpayne@69 1082 eA = Mip->sA;
jpayne@69 1083 eB = Mip->sB;
jpayne@69 1084 }
jpayne@69 1085 }
jpayne@69 1086
jpayne@69 1087 //-- If the cluster is strictly greater than current cluster
jpayne@69 1088 if ( eA >= sA && eB >= sB )
jpayne@69 1089 {
jpayne@69 1090 if ( eA - sA > eB - sB )
jpayne@69 1091 {
jpayne@69 1092 greater = eA - sA;
jpayne@69 1093 lesser = eB - sB;
jpayne@69 1094 }
jpayne@69 1095 else
jpayne@69 1096 {
jpayne@69 1097 lesser = eA - sA;
jpayne@69 1098 greater = eB - sB;
jpayne@69 1099 }
jpayne@69 1100
jpayne@69 1101 //-- If the cluster is close enough
jpayne@69 1102 if ( greater <= getBreakLen( ) ||
jpayne@69 1103 (lesser) * GOOD_SCORE [getMatrixType( )] +
jpayne@69 1104 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
jpayne@69 1105 {
jpayne@69 1106 Cp = Cip;
jpayne@69 1107 targetA = eA;
jpayne@69 1108 targetB = eB;
jpayne@69 1109 break;
jpayne@69 1110 }
jpayne@69 1111 else if ( (greater << 1) - lesser < dist )
jpayne@69 1112 {
jpayne@69 1113 Cp = Cip;
jpayne@69 1114 targetA = eA;
jpayne@69 1115 targetB = eB;
jpayne@69 1116 dist = (greater << 1) - lesser;
jpayne@69 1117 }
jpayne@69 1118 }
jpayne@69 1119 }
jpayne@69 1120 }
jpayne@69 1121
jpayne@69 1122
jpayne@69 1123 return Cp;
jpayne@69 1124 }
jpayne@69 1125
jpayne@69 1126
jpayne@69 1127
jpayne@69 1128
jpayne@69 1129
jpayne@69 1130 vector<Alignment>::iterator getReverseTargetAlignment
jpayne@69 1131 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp)
jpayne@69 1132
jpayne@69 1133 // Return the alignment that is most likely to successfully join (in a
jpayne@69 1134 // reverse direction) with the current alignment. The returned alignment
jpayne@69 1135 // must be strictly less than the current cluster and be on a diagonal
jpayne@69 1136 // close enough to the current alignment, so that a connection
jpayne@69 1137 // could possibly be made by the alignment extender. Also, this targeted
jpayne@69 1138 // alignment must be consistent in frame with the current alignment.
jpayne@69 1139 // Assumes clusters have been sorted via AscendingClusterSort and
jpayne@69 1140 // processed in order, so therefore all alignments are in order by their
jpayne@69 1141 // start A coordinate.
jpayne@69 1142
jpayne@69 1143 {
jpayne@69 1144 vector<Alignment>::iterator Ap; // alignment pointer
jpayne@69 1145 vector<Alignment>::iterator Aip; // alignment iterative pointer
jpayne@69 1146 long int eA, eB; // possible targets
jpayne@69 1147 long int greater, lesser; // gap sizes between the two alignments
jpayne@69 1148 long int sA = CurrAp->sA; // the startA of the current alignment
jpayne@69 1149 long int sB = CurrAp->sB; // the startB of the current alignment
jpayne@69 1150
jpayne@69 1151 //-- Beginning of sequences is the default target, set distance accordingly
jpayne@69 1152 long int dist = (sA < sB ? sA : sB);
jpayne@69 1153
jpayne@69 1154 //-- For all alignments less than the current alignment (on sequence A)
jpayne@69 1155 Ap = Alignments.end( );
jpayne@69 1156 for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- )
jpayne@69 1157 {
jpayne@69 1158 //-- If the alignment is on the same direction
jpayne@69 1159 if ( CurrAp->frameA == Aip->frameA && CurrAp->frameB == Aip->frameB )
jpayne@69 1160 {
jpayne@69 1161 eA = Aip->eA;
jpayne@69 1162 eB = Aip->eB;
jpayne@69 1163
jpayne@69 1164 //-- If the alignment is strictly greater than current cluster
jpayne@69 1165 if ( eA <= sA && eB <= sB )
jpayne@69 1166 {
jpayne@69 1167 if ( sA - eA > sB - eB )
jpayne@69 1168 {
jpayne@69 1169 greater = sA - eA;
jpayne@69 1170 lesser = sB - eB;
jpayne@69 1171 }
jpayne@69 1172 else
jpayne@69 1173 {
jpayne@69 1174 lesser = sA - eA;
jpayne@69 1175 greater = sB - eB;
jpayne@69 1176 }
jpayne@69 1177
jpayne@69 1178 //-- If the cluster is close enough
jpayne@69 1179 if ( greater <= getBreakLen( ) ||
jpayne@69 1180 (lesser) * GOOD_SCORE [getMatrixType( )] +
jpayne@69 1181 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 )
jpayne@69 1182 {
jpayne@69 1183 Ap = Aip;
jpayne@69 1184 break;
jpayne@69 1185 }
jpayne@69 1186 else if ( (greater << 1) - lesser < dist )
jpayne@69 1187 {
jpayne@69 1188 Ap = Aip;
jpayne@69 1189 dist = (greater << 1) - lesser;
jpayne@69 1190 }
jpayne@69 1191 }
jpayne@69 1192 }
jpayne@69 1193 }
jpayne@69 1194
jpayne@69 1195
jpayne@69 1196 return Ap;
jpayne@69 1197 }
jpayne@69 1198
jpayne@69 1199
jpayne@69 1200
jpayne@69 1201
jpayne@69 1202
jpayne@69 1203 bool isShadowedCluster
jpayne@69 1204 (vector<Cluster>::iterator CurrCp,
jpayne@69 1205 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap)
jpayne@69 1206
jpayne@69 1207 // Check if the current cluster is shadowed by a previously produced
jpayne@69 1208 // alignment region. Return true if it is, else false.
jpayne@69 1209
jpayne@69 1210 {
jpayne@69 1211 vector<Alignment>::iterator Aip; // alignment pointer
jpayne@69 1212
jpayne@69 1213 long int sA = CurrCp->matches.begin( )->sA;
jpayne@69 1214 long int eA = CurrCp->matches.rbegin( )->sA +
jpayne@69 1215 CurrCp->matches.rbegin( )->len - 1;
jpayne@69 1216 long int sB = CurrCp->matches.begin( )->sB;
jpayne@69 1217 long int eB = CurrCp->matches.rbegin( )->sB +
jpayne@69 1218 CurrCp->matches.rbegin( )->len - 1;
jpayne@69 1219
jpayne@69 1220 if ( ! Alignments.empty( ) ) // if there are alignments to use
jpayne@69 1221 {
jpayne@69 1222 //-- Look backwards in hope of finding a shadowing alignment
jpayne@69 1223 for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- )
jpayne@69 1224 {
jpayne@69 1225 //-- If in the same frames and shadowing the current cluster, break
jpayne@69 1226 if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB )
jpayne@69 1227 if ( Aip->eA >= eA && Aip->eB >= eB )
jpayne@69 1228 if ( Aip->sA <= sA && Aip->sB <= sB )
jpayne@69 1229 break;
jpayne@69 1230 }
jpayne@69 1231
jpayne@69 1232 //-- Return true if the loop was broken, i.e. shadow found
jpayne@69 1233 if ( Aip >= Alignments.begin( ) )
jpayne@69 1234 return true;
jpayne@69 1235 }
jpayne@69 1236
jpayne@69 1237 //-- Return false if Alignments was empty or loop was not broken
jpayne@69 1238 return false;
jpayne@69 1239 }
jpayne@69 1240
jpayne@69 1241
jpayne@69 1242
jpayne@69 1243
jpayne@69 1244
jpayne@69 1245
jpayne@69 1246 void parseAbort
jpayne@69 1247 (const char * s)
jpayne@69 1248
jpayne@69 1249 // Abort the program if there was an error in parsing file 's'
jpayne@69 1250
jpayne@69 1251 {
jpayne@69 1252 fprintf (stderr,
jpayne@69 1253 "ERROR: Could not parse input from '%s'. \n"
jpayne@69 1254 "Please check the filename and format, or file a bug report\n", s);
jpayne@69 1255 exit (EXIT_FAILURE);
jpayne@69 1256 }
jpayne@69 1257
jpayne@69 1258
jpayne@69 1259
jpayne@69 1260
jpayne@69 1261 void parseDelta
jpayne@69 1262 (vector<Alignment> & Alignments,
jpayne@69 1263 const FastaRecord * Af, const FastaRecord *Bf)
jpayne@69 1264
jpayne@69 1265 // Use the delta information to generate the error counts for each
jpayne@69 1266 // alignment, and fill this information into the data type
jpayne@69 1267
jpayne@69 1268 {
jpayne@69 1269 // pointers to Af.seq and Bf.seq
jpayne@69 1270 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
jpayne@69 1271 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
jpayne@69 1272
jpayne@69 1273 char ch1, ch2;
jpayne@69 1274 long int Delta;
jpayne@69 1275 int Sign;
jpayne@69 1276 long int i, Apos, Bpos;
jpayne@69 1277 long int Remain, Total;
jpayne@69 1278 long int Errors, SimErrors;
jpayne@69 1279 long int NonAlphas;
jpayne@69 1280 vector<Alignment>::iterator Ap;
jpayne@69 1281 vector<long int>::iterator Dp;
jpayne@69 1282
jpayne@69 1283 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
jpayne@69 1284 {
jpayne@69 1285 if ( A [Ap->frameA] == NULL )
jpayne@69 1286 {
jpayne@69 1287 A [Ap->frameA] = (char *) Safe_malloc
jpayne@69 1288 ( sizeof(char) * ( (Af->len / 3) + 2 ) );
jpayne@69 1289 A [Ap->frameA] [0] = '\0';
jpayne@69 1290 Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA );
jpayne@69 1291 }
jpayne@69 1292 if ( B [Ap->frameB] == NULL )
jpayne@69 1293 {
jpayne@69 1294 B [Ap->frameB] = (char *) Safe_malloc
jpayne@69 1295 ( sizeof(char) * ( (Bf->len / 3) + 2) );
jpayne@69 1296 B [Ap->frameB] [0] = '\0';
jpayne@69 1297 Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB );
jpayne@69 1298 }
jpayne@69 1299
jpayne@69 1300 Apos = Ap->sA;
jpayne@69 1301 Bpos = Ap->sB;
jpayne@69 1302
jpayne@69 1303 Errors = 0;
jpayne@69 1304 SimErrors = 0;
jpayne@69 1305 NonAlphas = 0;
jpayne@69 1306 Remain = Ap->eA - Ap->sA + 1;
jpayne@69 1307 Total = Remain;
jpayne@69 1308
jpayne@69 1309 //-- For all delta's in this alignment
jpayne@69 1310 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
jpayne@69 1311 {
jpayne@69 1312 Delta = *Dp;
jpayne@69 1313 Sign = Delta > 0 ? 1 : -1;
jpayne@69 1314 Delta = labs ( Delta );
jpayne@69 1315
jpayne@69 1316 //-- For all the bases before the next indel
jpayne@69 1317 for ( i = 1; i < Delta; i ++ )
jpayne@69 1318 {
jpayne@69 1319 ch1 = A [Ap->frameA] [Apos ++];
jpayne@69 1320 ch2 = B [Ap->frameB] [Bpos ++];
jpayne@69 1321
jpayne@69 1322 if ( !isalpha (ch1) )
jpayne@69 1323 {
jpayne@69 1324 ch1 = STOP_CHAR;
jpayne@69 1325 NonAlphas ++;
jpayne@69 1326 }
jpayne@69 1327 if ( !isalpha (ch2) )
jpayne@69 1328 {
jpayne@69 1329 ch2 = STOP_CHAR;
jpayne@69 1330 NonAlphas ++;
jpayne@69 1331 }
jpayne@69 1332
jpayne@69 1333 ch1 = toupper(ch1);
jpayne@69 1334 ch2 = toupper(ch2);
jpayne@69 1335 if ( 1 > MATCH_SCORE
jpayne@69 1336 [getMatrixType( )]
jpayne@69 1337 [ch1 - 'A']
jpayne@69 1338 [ch2 - 'A'] )
jpayne@69 1339 SimErrors ++;
jpayne@69 1340 if ( ch1 != ch2 )
jpayne@69 1341 Errors ++;
jpayne@69 1342 }
jpayne@69 1343
jpayne@69 1344 //-- Process the current indel
jpayne@69 1345 Remain -= i - 1;
jpayne@69 1346 Errors ++;
jpayne@69 1347 SimErrors ++;
jpayne@69 1348
jpayne@69 1349 if ( Sign == 1 )
jpayne@69 1350 {
jpayne@69 1351 if ( !isalpha (A [Ap->frameA] [Apos ++]) )
jpayne@69 1352 NonAlphas ++;
jpayne@69 1353 Remain --;
jpayne@69 1354 }
jpayne@69 1355 else
jpayne@69 1356 {
jpayne@69 1357 if ( !isalpha (B [Ap->frameB] [Bpos ++]) )
jpayne@69 1358 NonAlphas ++;
jpayne@69 1359 Total ++;
jpayne@69 1360 }
jpayne@69 1361 }
jpayne@69 1362
jpayne@69 1363 //-- For all the bases after the final indel
jpayne@69 1364 for ( i = 0; i < Remain; i ++ )
jpayne@69 1365 {
jpayne@69 1366 //-- Score character match and update error counters
jpayne@69 1367 ch1 = A [Ap->frameA] [Apos ++];
jpayne@69 1368 ch2 = B [Ap->frameB] [Bpos ++];
jpayne@69 1369
jpayne@69 1370 if ( !isalpha (ch1) )
jpayne@69 1371 {
jpayne@69 1372 ch1 = STOP_CHAR;
jpayne@69 1373 NonAlphas ++;
jpayne@69 1374 }
jpayne@69 1375 if ( !isalpha (ch2) )
jpayne@69 1376 {
jpayne@69 1377 ch2 = STOP_CHAR;
jpayne@69 1378 NonAlphas ++;
jpayne@69 1379 }
jpayne@69 1380
jpayne@69 1381 ch1 = toupper(ch1);
jpayne@69 1382 ch2 = toupper(ch2);
jpayne@69 1383 if ( 1 > MATCH_SCORE
jpayne@69 1384 [getMatrixType( )]
jpayne@69 1385 [ch1 - 'A']
jpayne@69 1386 [ch2 - 'A'] )
jpayne@69 1387 SimErrors ++;
jpayne@69 1388 if ( ch1 != ch2 )
jpayne@69 1389 Errors ++;
jpayne@69 1390 }
jpayne@69 1391
jpayne@69 1392 Ap->Errors = Errors;
jpayne@69 1393 Ap->SimErrors = SimErrors;
jpayne@69 1394 Ap->NonAlphas = NonAlphas;
jpayne@69 1395 }
jpayne@69 1396
jpayne@69 1397 for ( i = 1; i <= 6; i ++ )
jpayne@69 1398 {
jpayne@69 1399 if ( A [i] != NULL )
jpayne@69 1400 free ( A [i] );
jpayne@69 1401 A [i] = NULL;
jpayne@69 1402
jpayne@69 1403 if ( B [i] != NULL )
jpayne@69 1404 free ( B [i] );
jpayne@69 1405 B [i] = NULL;
jpayne@69 1406 }
jpayne@69 1407
jpayne@69 1408 return;
jpayne@69 1409 }
jpayne@69 1410
jpayne@69 1411
jpayne@69 1412
jpayne@69 1413
jpayne@69 1414 void processSyntenys
jpayne@69 1415 (vector<Synteny> & Syntenys, FastaRecord * Af, long int As,
jpayne@69 1416 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile)
jpayne@69 1417
jpayne@69 1418 // For each syntenic region with clusters, read in the B sequence and
jpayne@69 1419 // extend the clusters to expand total alignment coverage. Clusters should
jpayne@69 1420 // still reference the amino acid concatenations, the translation back to
jpayne@69 1421 // DNA will be taken care of by the extendClusters function.
jpayne@69 1422 // processSyntenys should ONLY be called once all the clusters for
jpayne@69 1423 // the contained syntenic regions have been stored in the data structure.
jpayne@69 1424 // Frees the memory used by the the syntenic regions once the output of
jpayne@69 1425 // extendClusters and flushSyntenys has been produced.
jpayne@69 1426
jpayne@69 1427 {
jpayne@69 1428 FastaRecord Bf; // the current B sequence
jpayne@69 1429 Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1));
jpayne@69 1430
jpayne@69 1431 long int InitSize = INIT_SIZE; // the initial memory capacity of B
jpayne@69 1432 vector<Synteny>::iterator CurrSp; // current synteny pointer
jpayne@69 1433
jpayne@69 1434 //-- Initialize the B sequence storage
jpayne@69 1435 Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
jpayne@69 1436
jpayne@69 1437 //-- For all the contained syntenys
jpayne@69 1438 for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ )
jpayne@69 1439 {
jpayne@69 1440 //-- If no clusters, ignore
jpayne@69 1441 if ( CurrSp->clusters.empty( ) )
jpayne@69 1442 continue;
jpayne@69 1443
jpayne@69 1444 //-- If a B sequence not seen yet, read it in
jpayne@69 1445 //-- IMPORTANT: The B sequences in the synteny object are assumed to be
jpayne@69 1446 // ordered as output by mgaps, if they are not in order the program
jpayne@69 1447 // will fail. (All like tags must be adjacent and in the same order
jpayne@69 1448 // as the query file)
jpayne@69 1449 if ( CurrSp == Syntenys.begin( ) ||
jpayne@69 1450 strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 )
jpayne@69 1451 {
jpayne@69 1452 //-- Read in the B sequence
jpayne@69 1453 while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) )
jpayne@69 1454 if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 )
jpayne@69 1455 break;
jpayne@69 1456 if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 )
jpayne@69 1457 {
jpayne@69 1458 fprintf(stderr,"%s\n", CurrSp->Bf.Id);
jpayne@69 1459 parseAbort ( "Query File" );
jpayne@69 1460 }
jpayne@69 1461 Bf.len = strlen (Bf.seq + 1);
jpayne@69 1462 }
jpayne@69 1463
jpayne@69 1464 //-- Extend clusters and create the alignment information
jpayne@69 1465 CurrSp->Bf.len = Bf.len;
jpayne@69 1466 extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile);
jpayne@69 1467 }
jpayne@69 1468
jpayne@69 1469 //-- Create the cluster information and clear the Synteny structure
jpayne@69 1470 flushSyntenys (Syntenys, ClusterFile);
jpayne@69 1471
jpayne@69 1472 free (Bf.Id);
jpayne@69 1473 free (Bf.seq);
jpayne@69 1474
jpayne@69 1475 return;
jpayne@69 1476 }
jpayne@69 1477
jpayne@69 1478
jpayne@69 1479
jpayne@69 1480 inline long int transC
jpayne@69 1481 (long int Coord, int Frame, long int Len)
jpayne@69 1482
jpayne@69 1483 // Translate an amino acid coordinate to a nucleotide coordinate
jpayne@69 1484 // relative to the forward strand. The Len parameter should be the
jpayne@69 1485 // length of the original DNA strand.
jpayne@69 1486
jpayne@69 1487 {
jpayne@69 1488 assert ( Frame > 0 && Frame < 7 );
jpayne@69 1489 if ( Frame > 3 )
jpayne@69 1490 return revC ( (Coord * 3) - (3 - (Frame - 3)), Len );
jpayne@69 1491 else
jpayne@69 1492 return (Coord * 3) - (3 - (Frame));
jpayne@69 1493 }
jpayne@69 1494
jpayne@69 1495
jpayne@69 1496
jpayne@69 1497
jpayne@69 1498 inline long int refLen
jpayne@69 1499 (long int ntLen)
jpayne@69 1500
jpayne@69 1501 // Return the length of the concatenated amino acid sequence frames,
jpayne@69 1502 // including the appended 'X' at the end of each sequence frame for the
jpayne@69 1503 // given DNA input length. (The returned length will be the length of
jpayne@69 1504 // the concatenated frames for this sequence as output by prepro.cc)
jpayne@69 1505
jpayne@69 1506 {
jpayne@69 1507 return (ntLen + 1) << 1;
jpayne@69 1508 }
jpayne@69 1509
jpayne@69 1510
jpayne@69 1511
jpayne@69 1512
jpayne@69 1513 inline long int revC
jpayne@69 1514 (long int Coord, long int Len)
jpayne@69 1515
jpayne@69 1516 // Reverse complement the given coordinate for the given length.
jpayne@69 1517
jpayne@69 1518 {
jpayne@69 1519 assert (Len - Coord + 1 > 0);
jpayne@69 1520 return (Len - Coord + 1);
jpayne@69 1521 }
jpayne@69 1522
jpayne@69 1523
jpayne@69 1524
jpayne@69 1525
jpayne@69 1526 void printHelp
jpayne@69 1527 (const char * s)
jpayne@69 1528
jpayne@69 1529 // Display the program's help information to stderr.
jpayne@69 1530
jpayne@69 1531 {
jpayne@69 1532 fprintf (stderr,
jpayne@69 1533 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
jpayne@69 1534 fprintf (stderr,
jpayne@69 1535 "-b int set the alignment break (give-up) length to int (amino acids)\n"
jpayne@69 1536 "-d output only match clusters rather than extended alignments\n"
jpayne@69 1537 "-e do not extend alignments outward from clusters\n"
jpayne@69 1538 "-h display help information\n"
jpayne@69 1539 "-t force alignment to ends of sequence if within -b distance\n"
jpayne@69 1540 "-x type set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n"
jpayne@69 1541 " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n");
jpayne@69 1542 fprintf (stderr,
jpayne@69 1543 " Input is the output of the \"mgaps\" program from stdin, and\n"
jpayne@69 1544 "the two original PROmer sequence files passed on the command\n"
jpayne@69 1545 "line. <pfx> is the prefix to be added to the front of the\n"
jpayne@69 1546 "output file <pfx>.delta\n"
jpayne@69 1547 " <pfx>.delta is the alignment object that catalogs the distance\n"
jpayne@69 1548 "between insertions and deletions. For further information\n"
jpayne@69 1549 "regarding this file, please refer to the documentation under\n"
jpayne@69 1550 "the .delta output description.\n\n");
jpayne@69 1551 return;
jpayne@69 1552 }
jpayne@69 1553
jpayne@69 1554
jpayne@69 1555
jpayne@69 1556
jpayne@69 1557 void printUsage
jpayne@69 1558 (const char * s)
jpayne@69 1559
jpayne@69 1560 // Display the program's usage information to stderr.
jpayne@69 1561
jpayne@69 1562 {
jpayne@69 1563 fprintf (stderr,
jpayne@69 1564 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
jpayne@69 1565 fprintf (stderr, "Try '%s -h' for more information.\n", s);
jpayne@69 1566 return;
jpayne@69 1567 }
jpayne@69 1568
jpayne@69 1569
jpayne@69 1570
jpayne@69 1571
jpayne@69 1572 void validateData
jpayne@69 1573 (vector<Alignment> Alignments, vector<Cluster> Clusters,
jpayne@69 1574 const FastaRecord * Af, const FastaRecord * Bf)
jpayne@69 1575
jpayne@69 1576 // Self test function to check that the cluster and alignment information
jpayne@69 1577 // is valid
jpayne@69 1578
jpayne@69 1579 {
jpayne@69 1580 char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
jpayne@69 1581 char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
jpayne@69 1582
jpayne@69 1583 long int Alen [7], Blen [7]; // the length of the amino acid sequences
jpayne@69 1584 vector<Cluster>::iterator Cp;
jpayne@69 1585 vector<Match>::iterator Mp;
jpayne@69 1586 vector<Alignment>::iterator Ap;
jpayne@69 1587 long int x, y, i;
jpayne@69 1588 char Xc, Yc;
jpayne@69 1589
jpayne@69 1590 for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ )
jpayne@69 1591 {
jpayne@69 1592 assert ( Cp->wasFused );
jpayne@69 1593
jpayne@69 1594 //-- Initialize the right amino acid sequence for A and B if need be
jpayne@69 1595 if ( A [Cp->frameA] == NULL )
jpayne@69 1596 {
jpayne@69 1597 A [Cp->frameA] = (char *) Safe_malloc
jpayne@69 1598 ( sizeof(char) * ( (Af->len / 3) + 2) );
jpayne@69 1599 A [Cp->frameA] [0] = '\0';
jpayne@69 1600 Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA],
jpayne@69 1601 Cp->frameA );
jpayne@69 1602 }
jpayne@69 1603 if ( B [Cp->frameB] == NULL )
jpayne@69 1604 {
jpayne@69 1605 B [Cp->frameB] = (char *) Safe_malloc
jpayne@69 1606 ( sizeof(char) * ( (Bf->len / 3) + 2) );
jpayne@69 1607 B [Cp->frameB] [0] = '\0';
jpayne@69 1608 Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB],
jpayne@69 1609 Cp->frameB );
jpayne@69 1610 }
jpayne@69 1611
jpayne@69 1612 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
jpayne@69 1613 {
jpayne@69 1614 //-- assert for each match in cluster, it is indeed a match
jpayne@69 1615 x = Mp->sA;
jpayne@69 1616 y = Mp->sB;
jpayne@69 1617 for ( i = 0; i < Mp->len; i ++ )
jpayne@69 1618 assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] );
jpayne@69 1619
jpayne@69 1620 //-- assert for each match in cluster, it is contained in an alignment
jpayne@69 1621 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
jpayne@69 1622 {
jpayne@69 1623 if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB &&
jpayne@69 1624 Ap->eA >= Mp->sA + Mp->len - 1 &&
jpayne@69 1625 Ap->eB >= Mp->sB + Mp->len - 1 )
jpayne@69 1626 break;
jpayne@69 1627 }
jpayne@69 1628 assert ( Ap < Alignments.end( ) );
jpayne@69 1629 }
jpayne@69 1630 }
jpayne@69 1631
jpayne@69 1632 //-- assert alignments are optimal (quick check if first and last chars equal)
jpayne@69 1633 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
jpayne@69 1634 {
jpayne@69 1635 assert ( Ap->sA <= Ap->eA );
jpayne@69 1636 assert ( Ap->sB <= Ap->eB );
jpayne@69 1637
jpayne@69 1638 assert ( Ap->sA >= 1 && Ap->sA <= Af->len );
jpayne@69 1639 assert ( Ap->eA >= 1 && Ap->eA <= Af->len );
jpayne@69 1640 assert ( Ap->sB >= 1 && Ap->sB <= Bf->len );
jpayne@69 1641 assert ( Ap->eB >= 1 && Ap->eB <= Bf->len );
jpayne@69 1642
jpayne@69 1643 Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ?
jpayne@69 1644 A[Ap->frameA][Ap->sA] : STOP_CHAR);
jpayne@69 1645 Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ?
jpayne@69 1646 B[Ap->frameB][Ap->sB] : STOP_CHAR);
jpayne@69 1647 assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
jpayne@69 1648
jpayne@69 1649 Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ?
jpayne@69 1650 A[Ap->frameA][Ap->eA] : STOP_CHAR);
jpayne@69 1651 Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ?
jpayne@69 1652 B[Ap->frameB][Ap->eB] : STOP_CHAR);
jpayne@69 1653 assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] );
jpayne@69 1654 }
jpayne@69 1655
jpayne@69 1656 for ( i = 0; i < 7; i ++ )
jpayne@69 1657 {
jpayne@69 1658 if ( A [i] != NULL )
jpayne@69 1659 free ( A [i] );
jpayne@69 1660 if ( B [i] != NULL )
jpayne@69 1661 free ( B [i] );
jpayne@69 1662 }
jpayne@69 1663
jpayne@69 1664 return;
jpayne@69 1665 }